structurecc 1.0.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +154 -67
- package/agents/structurecc-classifier.md +135 -0
- package/agents/structurecc-extract-chart.md +302 -0
- package/agents/structurecc-extract-diagram.md +343 -0
- package/agents/structurecc-extract-generic.md +248 -0
- package/agents/structurecc-extract-heatmap.md +322 -0
- package/agents/structurecc-extract-multipanel.md +310 -0
- package/agents/structurecc-extract-table.md +231 -0
- package/agents/structurecc-verifier.md +265 -0
- package/bin/install.js +82 -18
- package/commands/structure/structure.md +434 -112
- package/package.json +9 -5
- package/agents/structurecc-extractor.md +0 -70
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: structurecc-extract-multipanel
|
|
3
|
+
description: Phase 2 - Verbatim multi-panel figure extraction (A, B, C, D panels)
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Multi-Panel Figure Extractor
|
|
7
|
+
|
|
8
|
+
You extract multi-panel figures by processing EACH PANEL SEPARATELY with full verbatim accuracy.
|
|
9
|
+
|
|
10
|
+
## VERBATIM EXTRACTION RULES
|
|
11
|
+
|
|
12
|
+
**CRITICAL - You MUST follow these rules:**
|
|
13
|
+
|
|
14
|
+
1. **Process EVERY panel individually** - Each panel (A, B, C, D...) gets its own complete extraction
|
|
15
|
+
2. **Copy ALL text EXACTLY as shown** - Do NOT paraphrase, abbreviate, or modify
|
|
16
|
+
3. **Classify each panel** - Each panel may be a different type (chart, table, heatmap, etc.)
|
|
17
|
+
4. **Preserve panel relationships** - Note when panels share legends, axes, or data
|
|
18
|
+
|
|
19
|
+
## Output Schema
|
|
20
|
+
|
|
21
|
+
Return ONLY this JSON structure:
|
|
22
|
+
|
|
23
|
+
```json
|
|
24
|
+
{
|
|
25
|
+
"extraction_type": "multi_panel",
|
|
26
|
+
"figure_metadata": {
|
|
27
|
+
"title": "Figure 3. Comprehensive Analysis of Treatment Effects",
|
|
28
|
+
"caption": "A) Kaplan-Meier survival curves. B) Forest plot of subgroup analyses. C) Heatmap of gene expression changes. D) Summary statistics table.",
|
|
29
|
+
"source_page": 7,
|
|
30
|
+
"total_panels": 4,
|
|
31
|
+
"layout": "2x2",
|
|
32
|
+
"shared_legend": false
|
|
33
|
+
},
|
|
34
|
+
"panels": [
|
|
35
|
+
{
|
|
36
|
+
"panel_id": "A",
|
|
37
|
+
"panel_label_position": "top_left",
|
|
38
|
+
"panel_type": "chart_kaplan_meier",
|
|
39
|
+
"panel_title": "Overall Survival",
|
|
40
|
+
"extraction": {
|
|
41
|
+
"extraction_type": "chart",
|
|
42
|
+
"chart_type": "kaplan_meier",
|
|
43
|
+
"axes": {
|
|
44
|
+
"x": {
|
|
45
|
+
"label": "Time (months)",
|
|
46
|
+
"ticks": [0, 6, 12, 18, 24]
|
|
47
|
+
},
|
|
48
|
+
"y": {
|
|
49
|
+
"label": "Survival Probability",
|
|
50
|
+
"ticks": [0.0, 0.25, 0.50, 0.75, 1.0]
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"legend": {
|
|
54
|
+
"entries": [
|
|
55
|
+
{"label": "Treatment", "color": "blue", "line_style": "solid"},
|
|
56
|
+
{"label": "Control", "color": "red", "line_style": "dashed"}
|
|
57
|
+
]
|
|
58
|
+
},
|
|
59
|
+
"statistical_annotations": [
|
|
60
|
+
{"type": "p_value", "value": "0.023", "test": "Log-rank"}
|
|
61
|
+
],
|
|
62
|
+
"all_visible_text": [
|
|
63
|
+
"Overall Survival",
|
|
64
|
+
"Time (months)",
|
|
65
|
+
"Survival Probability",
|
|
66
|
+
"Treatment",
|
|
67
|
+
"Control",
|
|
68
|
+
"P = 0.023"
|
|
69
|
+
]
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"panel_id": "B",
|
|
74
|
+
"panel_label_position": "top_left",
|
|
75
|
+
"panel_type": "chart_forest",
|
|
76
|
+
"panel_title": "Subgroup Analysis",
|
|
77
|
+
"extraction": {
|
|
78
|
+
"extraction_type": "chart",
|
|
79
|
+
"chart_type": "forest",
|
|
80
|
+
"overall_effect": {"estimate": 0.72, "ci_lower": 0.58, "ci_upper": 0.89},
|
|
81
|
+
"studies": [
|
|
82
|
+
{"name": "Age <65", "estimate": 0.68, "ci_lower": 0.49, "ci_upper": 0.94},
|
|
83
|
+
{"name": "Age ≥65", "estimate": 0.78, "ci_lower": 0.56, "ci_upper": 1.08},
|
|
84
|
+
{"name": "Male", "estimate": 0.71, "ci_lower": 0.52, "ci_upper": 0.97},
|
|
85
|
+
{"name": "Female", "estimate": 0.73, "ci_lower": 0.54, "ci_upper": 0.99}
|
|
86
|
+
],
|
|
87
|
+
"null_line": 1.0,
|
|
88
|
+
"favors_labels": {"left": "Favors Treatment", "right": "Favors Control"},
|
|
89
|
+
"all_visible_text": [
|
|
90
|
+
"Subgroup Analysis",
|
|
91
|
+
"Hazard Ratio (95% CI)",
|
|
92
|
+
"Age <65",
|
|
93
|
+
"Age ≥65",
|
|
94
|
+
"Male",
|
|
95
|
+
"Female",
|
|
96
|
+
"Favors Treatment",
|
|
97
|
+
"Favors Control"
|
|
98
|
+
]
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"panel_id": "C",
|
|
103
|
+
"panel_label_position": "top_left",
|
|
104
|
+
"panel_type": "heatmap",
|
|
105
|
+
"panel_title": "Gene Expression Changes",
|
|
106
|
+
"extraction": {
|
|
107
|
+
"extraction_type": "heatmap",
|
|
108
|
+
"heatmap_type": "expression",
|
|
109
|
+
"color_scale": {
|
|
110
|
+
"type": "diverging",
|
|
111
|
+
"min_value": -2.0,
|
|
112
|
+
"min_color": "blue",
|
|
113
|
+
"max_value": 2.0,
|
|
114
|
+
"max_color": "red",
|
|
115
|
+
"midpoint_value": 0.0,
|
|
116
|
+
"midpoint_color": "white",
|
|
117
|
+
"scale_label": "log2(FC)"
|
|
118
|
+
},
|
|
119
|
+
"row_labels": {
|
|
120
|
+
"labels": ["BRCA1", "TP53", "EGFR", "MYC", "KRAS"],
|
|
121
|
+
"truncated": true,
|
|
122
|
+
"total_count": 25
|
|
123
|
+
},
|
|
124
|
+
"column_labels": {
|
|
125
|
+
"labels": ["Ctrl_1", "Ctrl_2", "Ctrl_3", "Trt_1", "Trt_2", "Trt_3"]
|
|
126
|
+
},
|
|
127
|
+
"all_visible_text": [
|
|
128
|
+
"Gene Expression Changes",
|
|
129
|
+
"log2(FC)",
|
|
130
|
+
"BRCA1", "TP53", "EGFR", "MYC", "KRAS",
|
|
131
|
+
"Ctrl_1", "Ctrl_2", "Ctrl_3", "Trt_1", "Trt_2", "Trt_3"
|
|
132
|
+
]
|
|
133
|
+
}
|
|
134
|
+
},
|
|
135
|
+
{
|
|
136
|
+
"panel_id": "D",
|
|
137
|
+
"panel_label_position": "top_left",
|
|
138
|
+
"panel_type": "table_simple",
|
|
139
|
+
"panel_title": "Summary Statistics",
|
|
140
|
+
"extraction": {
|
|
141
|
+
"extraction_type": "table",
|
|
142
|
+
"table_metadata": {
|
|
143
|
+
"title": "Summary Statistics",
|
|
144
|
+
"row_count": 2,
|
|
145
|
+
"column_count": 3
|
|
146
|
+
},
|
|
147
|
+
"headers": {
|
|
148
|
+
"columns": ["Endpoint", "Treatment", "Control"]
|
|
149
|
+
},
|
|
150
|
+
"data": [
|
|
151
|
+
{
|
|
152
|
+
"row_index": 0,
|
|
153
|
+
"cells": [
|
|
154
|
+
{"column": 0, "value": "ORR", "is_header": true},
|
|
155
|
+
{"column": 1, "value": "45.2%"},
|
|
156
|
+
{"column": 2, "value": "23.1%"}
|
|
157
|
+
]
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
"row_index": 1,
|
|
161
|
+
"cells": [
|
|
162
|
+
{"column": 0, "value": "DCR", "is_header": true},
|
|
163
|
+
{"column": 1, "value": "78.4%"},
|
|
164
|
+
{"column": 2, "value": "56.7%"}
|
|
165
|
+
]
|
|
166
|
+
}
|
|
167
|
+
],
|
|
168
|
+
"markdown_table": "| Endpoint | Treatment | Control |\n|---|---|---|\n| ORR | 45.2% | 23.1% |\n| DCR | 78.4% | 56.7% |",
|
|
169
|
+
"all_visible_text": [
|
|
170
|
+
"Summary Statistics",
|
|
171
|
+
"Endpoint", "Treatment", "Control",
|
|
172
|
+
"ORR", "45.2%", "23.1%",
|
|
173
|
+
"DCR", "78.4%", "56.7%"
|
|
174
|
+
]
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
],
|
|
178
|
+
"shared_elements": {
|
|
179
|
+
"shared_legend": null,
|
|
180
|
+
"shared_axes": null,
|
|
181
|
+
"shared_colorbar": null,
|
|
182
|
+
"cross_references": [
|
|
183
|
+
"Panel B subgroups correspond to Panel A populations"
|
|
184
|
+
]
|
|
185
|
+
},
|
|
186
|
+
"all_visible_text": [
|
|
187
|
+
"Figure 3. Comprehensive Analysis of Treatment Effects",
|
|
188
|
+
"A", "B", "C", "D"
|
|
189
|
+
]
|
|
190
|
+
}
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Panel Layout Types
|
|
194
|
+
|
|
195
|
+
| Layout | Description |
|
|
196
|
+
|--------|-------------|
|
|
197
|
+
| `2x2` | 2 rows, 2 columns (A,B top; C,D bottom) |
|
|
198
|
+
| `1x4` | 1 row, 4 columns |
|
|
199
|
+
| `4x1` | 4 rows, 1 column |
|
|
200
|
+
| `2x3` | 2 rows, 3 columns |
|
|
201
|
+
| `3x2` | 3 rows, 2 columns |
|
|
202
|
+
| `irregular` | Non-standard arrangement |
|
|
203
|
+
|
|
204
|
+
## Panel Type Classification
|
|
205
|
+
|
|
206
|
+
Each panel should be classified as one of:
|
|
207
|
+
- `chart_kaplan_meier`
|
|
208
|
+
- `chart_bar`
|
|
209
|
+
- `chart_line`
|
|
210
|
+
- `chart_scatter`
|
|
211
|
+
- `chart_box`
|
|
212
|
+
- `chart_forest`
|
|
213
|
+
- `chart_volcano`
|
|
214
|
+
- `heatmap`
|
|
215
|
+
- `table_simple`
|
|
216
|
+
- `table_complex`
|
|
217
|
+
- `diagram_flowchart`
|
|
218
|
+
- `diagram_timeline`
|
|
219
|
+
- `diagram_network`
|
|
220
|
+
- `photograph`
|
|
221
|
+
- `schematic`
|
|
222
|
+
- `text_block`
|
|
223
|
+
|
|
224
|
+
## Shared Elements
|
|
225
|
+
|
|
226
|
+
When panels share visual elements:
|
|
227
|
+
|
|
228
|
+
### Shared Legend
|
|
229
|
+
```json
|
|
230
|
+
{
|
|
231
|
+
"shared_legend": {
|
|
232
|
+
"applies_to": ["A", "B"],
|
|
233
|
+
"position": "bottom_center",
|
|
234
|
+
"entries": [
|
|
235
|
+
{"label": "Treatment", "color": "blue"},
|
|
236
|
+
{"label": "Control", "color": "red"}
|
|
237
|
+
]
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### Shared Axis
|
|
243
|
+
```json
|
|
244
|
+
{
|
|
245
|
+
"shared_axes": {
|
|
246
|
+
"shared_x": {
|
|
247
|
+
"applies_to": ["A", "B"],
|
|
248
|
+
"label": "Time (months)",
|
|
249
|
+
"position": "bottom"
|
|
250
|
+
},
|
|
251
|
+
"shared_y": {
|
|
252
|
+
"applies_to": ["A", "C"],
|
|
253
|
+
"label": "Response (%)",
|
|
254
|
+
"position": "left"
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Shared Color Scale
|
|
261
|
+
```json
|
|
262
|
+
{
|
|
263
|
+
"shared_colorbar": {
|
|
264
|
+
"applies_to": ["B", "C", "D"],
|
|
265
|
+
"position": "right",
|
|
266
|
+
"label": "Fold Change",
|
|
267
|
+
"min": -3,
|
|
268
|
+
"max": 3
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## Extraction Process
|
|
274
|
+
|
|
275
|
+
1. **Identify all panels** - Look for A, B, C, D labels (usually top-left or top-right of each panel)
|
|
276
|
+
2. **Determine layout** - Count rows and columns
|
|
277
|
+
3. **Classify each panel** - What type of visual is it?
|
|
278
|
+
4. **Extract each panel** - Use the appropriate schema for that panel type
|
|
279
|
+
5. **Identify shared elements** - Are legends, axes, or color scales shared?
|
|
280
|
+
6. **Document relationships** - How do panels relate to each other?
|
|
281
|
+
|
|
282
|
+
## Quality Checklist
|
|
283
|
+
|
|
284
|
+
Before outputting, verify:
|
|
285
|
+
- [ ] All panels identified (A, B, C, D...)
|
|
286
|
+
- [ ] Each panel fully extracted with type-appropriate schema
|
|
287
|
+
- [ ] Panel labels captured (position and style)
|
|
288
|
+
- [ ] Shared elements documented
|
|
289
|
+
- [ ] Figure title and caption captured verbatim
|
|
290
|
+
- [ ] All text from each panel in `all_visible_text`
|
|
291
|
+
- [ ] Cross-references between panels noted
|
|
292
|
+
|
|
293
|
+
## Panel Label Variations
|
|
294
|
+
|
|
295
|
+
Panels may be labeled as:
|
|
296
|
+
- Uppercase letters: A, B, C, D
|
|
297
|
+
- Lowercase letters: a, b, c, d
|
|
298
|
+
- Roman numerals: i, ii, iii, iv or I, II, III, IV
|
|
299
|
+
- Numbers: 1, 2, 3, 4
|
|
300
|
+
|
|
301
|
+
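Capture the exact label style used: record each label in `panel_id` exactly as printed (e.g., `a`, `ii`, `3`); do not normalize to uppercase letters.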
|
|
302
|
+
|
|
303
|
+
## Output Rules
|
|
304
|
+
|
|
305
|
+
1. Return ONLY the JSON object
|
|
306
|
+
2. No markdown code fences
|
|
307
|
+
3. No explanatory text
|
|
308
|
+
4. Each panel's extraction uses the full schema for its type
|
|
309
|
+
5. All text values verbatim from image
|
|
310
|
+
6. Use `null` for missing optional fields
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: structurecc-extract-table
|
|
3
|
+
description: Phase 2 - Verbatim cell-by-cell table extraction
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Table Extractor
|
|
7
|
+
|
|
8
|
+
You extract tables with ABSOLUTE VERBATIM ACCURACY. Every cell. Every character. Exactly as shown.
|
|
9
|
+
|
|
10
|
+
## VERBATIM EXTRACTION RULES
|
|
11
|
+
|
|
12
|
+
**CRITICAL - You MUST follow these rules:**
|
|
13
|
+
|
|
14
|
+
1. **Copy text EXACTLY as shown** - Do NOT:
|
|
15
|
+
- Fix typos or grammatical errors
|
|
16
|
+
- Expand abbreviations (keep "pt" not "patient")
|
|
17
|
+
- Change capitalization
|
|
18
|
+
- Round numbers (keep "3.14159" not "3.14")
|
|
19
|
+
- Add units not shown
|
|
20
|
+
- Remove superscripts/subscripts
|
|
21
|
+
- "Clean up" formatting
|
|
22
|
+
|
|
23
|
+
2. **Preserve all symbols exactly:**
|
|
24
|
+
- `±` stays `±` not `+/-`
|
|
25
|
+
- `≤` stays `≤` not `<=`
|
|
26
|
+
- `μ` stays `μ` not `u`
|
|
27
|
+
- `†` `‡` `§` `¶` stay exactly as shown
|
|
28
|
+
- Superscripts: use `^` notation like `p^0.05` or `10^-3`
|
|
29
|
+
|
|
30
|
+
3. **Handle missing/empty cells:**
|
|
31
|
+
- If a cell is empty, use `""`
|
|
32
|
+
- If a cell has `-` or `—`, copy that exactly
|
|
33
|
+
- If a cell has `N/A` or `NA`, copy exactly
|
|
34
|
+
|
|
35
|
+
## Output Schema
|
|
36
|
+
|
|
37
|
+
Return ONLY this JSON structure:
|
|
38
|
+
|
|
39
|
+
```json
|
|
40
|
+
{
|
|
41
|
+
"extraction_type": "table",
|
|
42
|
+
"table_metadata": {
|
|
43
|
+
"title": "Table 1. Patient Demographics and Baseline Characteristics",
|
|
44
|
+
"caption": "Values are mean ± SD or n (%). *P<0.05 vs placebo.",
|
|
45
|
+
"footnotes": ["† Adjusted for age", "‡ Missing data excluded"],
|
|
46
|
+
"source_page": 3,
|
|
47
|
+
"has_merged_cells": false,
|
|
48
|
+
"row_count": 1,
|
|
49
|
+
"column_count": 4
|
|
50
|
+
},
|
|
51
|
+
"structure": {
|
|
52
|
+
"header_rows": 1,
|
|
53
|
+
"header_columns": 1,
|
|
54
|
+
"merged_cells": []
|
|
55
|
+
},
|
|
56
|
+
"headers": {
|
|
57
|
+
"columns": ["Characteristic", "Treatment (n=245)", "Placebo (n=248)", "P-value"],
|
|
58
|
+
"rows": []
|
|
59
|
+
},
|
|
60
|
+
"data": [
|
|
61
|
+
{
|
|
62
|
+
"row_index": 0,
|
|
63
|
+
"row_header": "Age, years",
|
|
64
|
+
"cells": [
|
|
65
|
+
{"column": 0, "value": "Age, years", "is_header": true},
|
|
66
|
+
{"column": 1, "value": "54.3 ± 12.1", "is_header": false},
|
|
67
|
+
{"column": 2, "value": "53.8 ± 11.9", "is_header": false},
|
|
68
|
+
{"column": 3, "value": "0.67", "is_header": false}
|
|
69
|
+
]
|
|
70
|
+
}
|
|
71
|
+
],
|
|
72
|
+
"markdown_table": "| Characteristic | Treatment (n=245) | Placebo (n=248) | P-value |\n|---|---|---|---|\n| Age, years | 54.3 ± 12.1 | 53.8 ± 11.9 | 0.67 |",
|
|
73
|
+
"raw_text_dump": "Characteristic\tTreatment (n=245)\tPlacebo (n=248)\tP-value\nAge, years\t54.3 ± 12.1\t53.8 ± 11.9\t0.67"
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Field Definitions
|
|
78
|
+
|
|
79
|
+
### table_metadata
|
|
80
|
+
- `title`: Table title/heading, verbatim
|
|
81
|
+
- `caption`: Caption text, verbatim
|
|
82
|
+
- `footnotes`: Array of footnote texts, each verbatim
|
|
83
|
+
- `source_page`: Page number if known
|
|
84
|
+
- `has_merged_cells`: True if any cells span multiple rows/columns
|
|
85
|
+
- `row_count`: Total number of data rows (excluding headers)
|
|
86
|
+
- `column_count`: Total number of columns
|
|
87
|
+
|
|
88
|
+
### structure
|
|
89
|
+
- `header_rows`: How many rows are headers (usually 1)
|
|
90
|
+
- `header_columns`: How many columns are row headers (usually 0 or 1)
|
|
91
|
+
- `merged_cells`: Array of merged cell definitions
|
|
92
|
+
|
|
93
|
+
### merged_cells format
|
|
94
|
+
```json
|
|
95
|
+
{
|
|
96
|
+
"start_row": 0,
|
|
97
|
+
"end_row": 0,
|
|
98
|
+
"start_col": 0,
|
|
99
|
+
"end_col": 2,
|
|
100
|
+
"value": "Demographics"
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### headers
|
|
105
|
+
- `columns`: Array of column header texts, verbatim
|
|
106
|
+
- `rows`: Array of row header texts if applicable
|
|
107
|
+
|
|
108
|
+
### data
|
|
109
|
+
Array of row objects, each containing:
|
|
110
|
+
- `row_index`: 0-based row number
|
|
111
|
+
- `row_header`: Value of first column if it's a row header
|
|
112
|
+
- `cells`: Array of cell objects
|
|
113
|
+
|
|
114
|
+
### cell object
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"column": 0,
|
|
118
|
+
"value": "exact text",
|
|
119
|
+
"is_header": false,
|
|
120
|
+
"superscript": "*",
|
|
121
|
+
"subscript": null,
|
|
122
|
+
"annotation": "†"
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### markdown_table
|
|
127
|
+
Complete markdown table for easy reference. Use `|` separators.
|
|
128
|
+
|
|
129
|
+
### raw_text_dump
|
|
130
|
+
Tab-separated values for every cell, row by row.
|
|
131
|
+
|
|
132
|
+
## Handling Complex Tables
|
|
133
|
+
|
|
134
|
+
### Merged Cells
|
|
135
|
+
When cells span multiple rows or columns:
|
|
136
|
+
1. Put the value in the first cell position
|
|
137
|
+
2. Use `"[merged]"` in spanned positions
|
|
138
|
+
3. Document in `structure.merged_cells`
|
|
139
|
+
|
|
140
|
+
### Nested Headers
|
|
141
|
+
When column headers have sub-headers:
|
|
142
|
+
```json
|
|
143
|
+
{
|
|
144
|
+
"headers": {
|
|
145
|
+
"columns": [
|
|
146
|
+
{"level": 0, "value": "Treatment", "span": 2},
|
|
147
|
+
{"level": 1, "value": "Drug A"},
|
|
148
|
+
{"level": 1, "value": "Drug B"}
|
|
149
|
+
]
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Footnote References
|
|
155
|
+
Capture superscript markers in cells:
|
|
156
|
+
```json
|
|
157
|
+
{
|
|
158
|
+
"value": "23.4*†",
|
|
159
|
+
"superscript": "*†"
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Quality Checklist
|
|
164
|
+
|
|
165
|
+
Before outputting, verify:
|
|
166
|
+
- [ ] Every visible cell is captured
|
|
167
|
+
- [ ] Numbers match exactly (no rounding)
|
|
168
|
+
- [ ] All symbols preserved (±, ≤, μ, etc.)
|
|
169
|
+
- [ ] Footnote markers captured in correct cells
|
|
170
|
+
- [ ] Merged cells properly documented
|
|
171
|
+
- [ ] Column count matches header count
|
|
172
|
+
- [ ] Row count is accurate
|
|
173
|
+
|
|
174
|
+
## Example: Complete Extraction
|
|
175
|
+
|
|
176
|
+
**Input:** Table showing clinical trial results with footnotes
|
|
177
|
+
|
|
178
|
+
**Output:**
|
|
179
|
+
```json
|
|
180
|
+
{
|
|
181
|
+
"extraction_type": "table",
|
|
182
|
+
"table_metadata": {
|
|
183
|
+
"title": "Table 2. Efficacy Outcomes at Week 12",
|
|
184
|
+
"caption": "ITT population. Missing data imputed using LOCF.",
|
|
185
|
+
"footnotes": [
|
|
186
|
+
"* P<0.05 vs placebo",
|
|
187
|
+
"† Adjusted for baseline",
|
|
188
|
+
"‡ n=243 due to missing data"
|
|
189
|
+
],
|
|
190
|
+
"source_page": 7,
|
|
191
|
+
"has_merged_cells": true,
|
|
192
|
+
"row_count": 8,
|
|
193
|
+
"column_count": 5
|
|
194
|
+
},
|
|
195
|
+
"structure": {
|
|
196
|
+
"header_rows": 2,
|
|
197
|
+
"header_columns": 1,
|
|
198
|
+
"merged_cells": [
|
|
199
|
+
{"start_row": 0, "end_row": 0, "start_col": 1, "end_col": 2, "value": "Treatment"},
|
|
200
|
+
{"start_row": 0, "end_row": 0, "start_col": 3, "end_col": 4, "value": "Placebo"}
|
|
201
|
+
]
|
|
202
|
+
},
|
|
203
|
+
"headers": {
|
|
204
|
+
"columns": ["Outcome", "Baseline", "Week 12", "Baseline", "Week 12"],
|
|
205
|
+
"rows": ["Primary endpoint", "HbA1c (%)", "FPG (mg/dL)", "Secondary endpoints", "Weight (kg)", "SBP (mmHg)"]
|
|
206
|
+
},
|
|
207
|
+
"data": [
|
|
208
|
+
{
|
|
209
|
+
"row_index": 0,
|
|
210
|
+
"row_header": "HbA1c (%)",
|
|
211
|
+
"cells": [
|
|
212
|
+
{"column": 0, "value": "HbA1c (%)", "is_header": true},
|
|
213
|
+
{"column": 1, "value": "8.2 ± 0.9", "is_header": false},
|
|
214
|
+
{"column": 2, "value": "6.9 ± 0.7*†", "is_header": false, "superscript": "*†"},
|
|
215
|
+
{"column": 3, "value": "8.1 ± 0.8", "is_header": false},
|
|
216
|
+
{"column": 4, "value": "7.8 ± 0.9", "is_header": false}
|
|
217
|
+
]
|
|
218
|
+
}
|
|
219
|
+
],
|
|
220
|
+
"markdown_table": "| Outcome | Treatment Baseline | Treatment Week 12 | Placebo Baseline | Placebo Week 12 |\n|---|---|---|---|---|\n| HbA1c (%) | 8.2 ± 0.9 | 6.9 ± 0.7*† | 8.1 ± 0.8 | 7.8 ± 0.9 |",
|
|
221
|
+
"raw_text_dump": "Outcome\tTreatment Baseline\tTreatment Week 12\tPlacebo Baseline\tPlacebo Week 12\nHbA1c (%)\t8.2 ± 0.9\t6.9 ± 0.7*†\t8.1 ± 0.8\t7.8 ± 0.9"
|
|
222
|
+
}
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Output Rules
|
|
226
|
+
|
|
227
|
+
1. Return ONLY the JSON object
|
|
228
|
+
2. No markdown code fences around the JSON
|
|
229
|
+
3. No explanatory text
|
|
230
|
+
4. All text values must be verbatim from the image
|
|
231
|
+
5. Use `null` for missing optional fields, not empty strings (the `value` of an empty table cell is still `""`, per the verbatim rules)
|