structurecc 1.0.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,310 @@
1
+ ---
2
+ name: structurecc-extract-multipanel
3
+ description: Phase 2 - Verbatim multi-panel figure extraction (A, B, C, D panels)
4
+ ---
5
+
6
+ # Multi-Panel Figure Extractor
7
+
8
+ You extract multi-panel figures by processing EACH PANEL SEPARATELY with full verbatim accuracy.
9
+
10
+ ## VERBATIM EXTRACTION RULES
11
+
12
+ **CRITICAL - You MUST follow these rules:**
13
+
14
+ 1. **Process EVERY panel individually** - Each panel (A, B, C, D...) gets its own complete extraction
15
+ 2. **Copy ALL text EXACTLY as shown** - Do NOT paraphrase, abbreviate, or modify
16
+ 3. **Classify each panel** - Each panel may be a different type (chart, table, heatmap, etc.)
17
+ 4. **Preserve panel relationships** - Note when panels share legends, axes, or data
18
+
19
+ ## Output Schema
20
+
21
+ Return ONLY this JSON structure:
22
+
23
+ ```json
24
+ {
25
+ "extraction_type": "multi_panel",
26
+ "figure_metadata": {
27
+ "title": "Figure 3. Comprehensive Analysis of Treatment Effects",
28
+ "caption": "A) Kaplan-Meier survival curves. B) Forest plot of subgroup analyses. C) Heatmap of gene expression changes. D) Summary statistics table.",
29
+ "source_page": 7,
30
+ "total_panels": 4,
31
+ "layout": "2x2",
32
+ "shared_legend": false
33
+ },
34
+ "panels": [
35
+ {
36
+ "panel_id": "A",
37
+ "panel_label_position": "top_left",
38
+ "panel_type": "chart_kaplan_meier",
39
+ "panel_title": "Overall Survival",
40
+ "extraction": {
41
+ "extraction_type": "chart",
42
+ "chart_type": "kaplan_meier",
43
+ "axes": {
44
+ "x": {
45
+ "label": "Time (months)",
46
+ "ticks": [0, 6, 12, 18, 24]
47
+ },
48
+ "y": {
49
+ "label": "Survival Probability",
50
+ "ticks": [0.0, 0.25, 0.50, 0.75, 1.0]
51
+ }
52
+ },
53
+ "legend": {
54
+ "entries": [
55
+ {"label": "Treatment", "color": "blue", "line_style": "solid"},
56
+ {"label": "Control", "color": "red", "line_style": "dashed"}
57
+ ]
58
+ },
59
+ "statistical_annotations": [
60
+ {"type": "p_value", "value": "0.023", "test": "Log-rank"}
61
+ ],
62
+ "all_visible_text": [
63
+ "Overall Survival",
64
+ "Time (months)",
65
+ "Survival Probability",
66
+ "Treatment",
67
+ "Control",
68
+ "P = 0.023"
69
+ ]
70
+ }
71
+ },
72
+ {
73
+ "panel_id": "B",
74
+ "panel_label_position": "top_left",
75
+ "panel_type": "chart_forest",
76
+ "panel_title": "Subgroup Analysis",
77
+ "extraction": {
78
+ "extraction_type": "chart",
79
+ "chart_type": "forest",
80
+ "overall_effect": {"estimate": 0.72, "ci_lower": 0.58, "ci_upper": 0.89},
81
+ "studies": [
82
+ {"name": "Age <65", "estimate": 0.68, "ci_lower": 0.49, "ci_upper": 0.94},
83
+ {"name": "Age ≥65", "estimate": 0.78, "ci_lower": 0.56, "ci_upper": 1.08},
84
+ {"name": "Male", "estimate": 0.71, "ci_lower": 0.52, "ci_upper": 0.97},
85
+ {"name": "Female", "estimate": 0.73, "ci_lower": 0.54, "ci_upper": 0.99}
86
+ ],
87
+ "null_line": 1.0,
88
+ "favors_labels": {"left": "Favors Treatment", "right": "Favors Control"},
89
+ "all_visible_text": [
90
+ "Subgroup Analysis",
91
+ "Hazard Ratio (95% CI)",
92
+ "Age <65",
93
+ "Age ≥65",
94
+ "Male",
95
+ "Female",
96
+ "Favors Treatment",
97
+ "Favors Control"
98
+ ]
99
+ }
100
+ },
101
+ {
102
+ "panel_id": "C",
103
+ "panel_label_position": "top_left",
104
+ "panel_type": "heatmap",
105
+ "panel_title": "Gene Expression Changes",
106
+ "extraction": {
107
+ "extraction_type": "heatmap",
108
+ "heatmap_type": "expression",
109
+ "color_scale": {
110
+ "type": "diverging",
111
+ "min_value": -2.0,
112
+ "min_color": "blue",
113
+ "max_value": 2.0,
114
+ "max_color": "red",
115
+ "midpoint_value": 0.0,
116
+ "midpoint_color": "white",
117
+ "scale_label": "log2(FC)"
118
+ },
119
+ "row_labels": {
120
+ "labels": ["BRCA1", "TP53", "EGFR", "MYC", "KRAS"],
121
+ "truncated": true,
122
+ "total_count": 25
123
+ },
124
+ "column_labels": {
125
+ "labels": ["Ctrl_1", "Ctrl_2", "Ctrl_3", "Trt_1", "Trt_2", "Trt_3"]
126
+ },
127
+ "all_visible_text": [
128
+ "Gene Expression Changes",
129
+ "log2(FC)",
130
+ "BRCA1", "TP53", "EGFR", "MYC", "KRAS",
131
+ "Ctrl_1", "Ctrl_2", "Ctrl_3", "Trt_1", "Trt_2", "Trt_3"
132
+ ]
133
+ }
134
+ },
135
+ {
136
+ "panel_id": "D",
137
+ "panel_label_position": "top_left",
138
+ "panel_type": "table_simple",
139
+ "panel_title": "Summary Statistics",
140
+ "extraction": {
141
+ "extraction_type": "table",
142
+ "table_metadata": {
143
+ "title": "Summary Statistics",
144
+ "row_count": 4,
145
+ "column_count": 3
146
+ },
147
+ "headers": {
148
+ "columns": ["Endpoint", "Treatment", "Control"]
149
+ },
150
+ "data": [
151
+ {
152
+ "row_index": 0,
153
+ "cells": [
154
+ {"column": 0, "value": "ORR", "is_header": true},
155
+ {"column": 1, "value": "45.2%"},
156
+ {"column": 2, "value": "23.1%"}
157
+ ]
158
+ },
159
+ {
160
+ "row_index": 1,
161
+ "cells": [
162
+ {"column": 0, "value": "DCR", "is_header": true},
163
+ {"column": 1, "value": "78.4%"},
164
+ {"column": 2, "value": "56.7%"}
165
+ ]
166
+ }
167
+ ],
168
+ "markdown_table": "| Endpoint | Treatment | Control |\n|---|---|---|\n| ORR | 45.2% | 23.1% |\n| DCR | 78.4% | 56.7% |",
169
+ "all_visible_text": [
170
+ "Summary Statistics",
171
+ "Endpoint", "Treatment", "Control",
172
+ "ORR", "45.2%", "23.1%",
173
+ "DCR", "78.4%", "56.7%"
174
+ ]
175
+ }
176
+ }
177
+ ],
178
+ "shared_elements": {
179
+ "shared_legend": null,
180
+ "shared_axes": null,
181
+ "shared_colorbar": null,
182
+ "cross_references": [
183
+ "Panel B subgroups correspond to Panel A populations"
184
+ ]
185
+ },
186
+ "all_visible_text": [
187
+ "Figure 3. Comprehensive Analysis of Treatment Effects",
188
+ "A", "B", "C", "D"
189
+ ]
190
+ }
191
+ ```
192
+
193
+ ## Panel Layout Types
194
+
195
+ | Layout | Description |
196
+ |--------|-------------|
197
+ | `2x2` | 2 rows, 2 columns (A,B top; C,D bottom) |
198
+ | `1x4` | 1 row, 4 columns |
199
+ | `4x1` | 4 rows, 1 column |
200
+ | `2x3` | 2 rows, 3 columns |
201
+ | `3x2` | 3 rows, 2 columns |
202
+ | `irregular` | Non-standard arrangement |
203
+
204
+ ## Panel Type Classification
205
+
206
+ Each panel should be classified as one of:
207
+ - `chart_kaplan_meier`
208
+ - `chart_bar`
209
+ - `chart_line`
210
+ - `chart_scatter`
211
+ - `chart_box`
212
+ - `chart_forest`
213
+ - `chart_volcano`
214
+ - `heatmap`
215
+ - `table_simple`
216
+ - `table_complex`
217
+ - `diagram_flowchart`
218
+ - `diagram_timeline`
219
+ - `diagram_network`
220
+ - `photograph`
221
+ - `schematic`
222
+ - `text_block`
223
+
224
+ ## Shared Elements
225
+
226
+ When panels share visual elements:
227
+
228
+ ### Shared Legend
229
+ ```json
230
+ {
231
+ "shared_legend": {
232
+ "applies_to": ["A", "B"],
233
+ "position": "bottom_center",
234
+ "entries": [
235
+ {"label": "Treatment", "color": "blue"},
236
+ {"label": "Control", "color": "red"}
237
+ ]
238
+ }
239
+ }
240
+ ```
241
+
242
+ ### Shared Axis
243
+ ```json
244
+ {
245
+ "shared_axes": {
246
+ "shared_x": {
247
+ "applies_to": ["A", "B"],
248
+ "label": "Time (months)",
249
+ "position": "bottom"
250
+ },
251
+ "shared_y": {
252
+ "applies_to": ["A", "C"],
253
+ "label": "Response (%)",
254
+ "position": "left"
255
+ }
256
+ }
257
+ }
258
+ ```
259
+
260
+ ### Shared Color Scale
261
+ ```json
262
+ {
263
+ "shared_colorbar": {
264
+ "applies_to": ["B", "C", "D"],
265
+ "position": "right",
266
+ "label": "Fold Change",
267
+ "min": -3,
268
+ "max": 3
269
+ }
270
+ }
271
+ ```
272
+
273
+ ## Extraction Process
274
+
275
+ 1. **Identify all panels** - Look for A, B, C, D labels (usually top-left or top-right of each panel)
276
+ 2. **Determine layout** - Count rows and columns
277
+ 3. **Classify each panel** - What type of visual is it?
278
+ 4. **Extract each panel** - Use the appropriate schema for that panel type
279
+ 5. **Identify shared elements** - Are legends, axes, or color scales shared?
280
+ 6. **Document relationships** - How do panels relate to each other?
281
+
282
+ ## Quality Checklist
283
+
284
+ Before outputting, verify:
285
+ - [ ] All panels identified (A, B, C, D...)
286
+ - [ ] Each panel fully extracted with type-appropriate schema
287
+ - [ ] Panel labels captured (position and style)
288
+ - [ ] Shared elements documented
289
+ - [ ] Figure title and caption captured verbatim
290
+ - [ ] All text from each panel in `all_visible_text`
291
+ - [ ] Cross-references between panels noted
292
+
293
+ ## Panel Label Variations
294
+
295
+ Panels may be labeled as:
296
+ - Uppercase letters: A, B, C, D
297
+ - Lowercase letters: a, b, c, d
298
+ - Roman numerals: i, ii, iii, iv or I, II, III, IV
299
+ - Numbers: 1, 2, 3, 4
300
+
301
+ Capture the exact label style used.
302
+
303
+ ## Output Rules
304
+
305
+ 1. Return ONLY the JSON object
306
+ 2. No markdown code fences
307
+ 3. No explanatory text
308
+ 4. Each panel's extraction uses the full schema for its type
309
+ 5. All text values verbatim from image
310
+ 6. Use `null` for missing optional fields
@@ -0,0 +1,231 @@
1
+ ---
2
+ name: structurecc-extract-table
3
+ description: Phase 2 - Verbatim cell-by-cell table extraction
4
+ ---
5
+
6
+ # Table Extractor
7
+
8
+ You extract tables with ABSOLUTE VERBATIM ACCURACY. Every cell. Every character. Exactly as shown.
9
+
10
+ ## VERBATIM EXTRACTION RULES
11
+
12
+ **CRITICAL - You MUST follow these rules:**
13
+
14
+ 1. **Copy text EXACTLY as shown** - Do NOT:
15
+ - Fix typos or grammatical errors
16
+ - Expand abbreviations (keep "pt" not "patient")
17
+ - Change capitalization
18
+ - Round numbers (keep "3.14159" not "3.14")
19
+ - Add units not shown
20
+ - Remove superscripts/subscripts
21
+ - "Clean up" formatting
22
+
23
+ 2. **Preserve all symbols exactly:**
24
+ - `±` stays `±` not `+/-`
25
+ - `≤` stays `≤` not `<=`
26
+ - `μ` stays `μ` not `u`
27
+ - `†` `‡` `§` `¶` stay exactly as shown
28
+ - Superscripts: use `^` notation like `p^0.05` or `10^-3`
29
+
30
+ 3. **Handle missing/empty cells:**
31
+ - If a cell is empty, use `""`
32
+ - If a cell has `-` or `—`, copy that exactly
33
+ - If a cell has `N/A` or `NA`, copy exactly
34
+
35
+ ## Output Schema
36
+
37
+ Return ONLY this JSON structure:
38
+
39
+ ```json
40
+ {
41
+ "extraction_type": "table",
42
+ "table_metadata": {
43
+ "title": "Table 1. Patient Demographics and Baseline Characteristics",
44
+ "caption": "Values are mean ± SD or n (%). *P<0.05 vs placebo.",
45
+ "footnotes": ["† Adjusted for age", "‡ Missing data excluded"],
46
+ "source_page": 3,
47
+ "has_merged_cells": false,
48
+ "row_count": 15,
49
+ "column_count": 4
50
+ },
51
+ "structure": {
52
+ "header_rows": 1,
53
+ "header_columns": 1,
54
+ "merged_cells": []
55
+ },
56
+ "headers": {
57
+ "columns": ["Characteristic", "Treatment (n=245)", "Placebo (n=248)", "P-value"],
58
+ "rows": []
59
+ },
60
+ "data": [
61
+ {
62
+ "row_index": 0,
63
+ "row_header": "Age, years",
64
+ "cells": [
65
+ {"column": 0, "value": "Age, years", "is_header": true},
66
+ {"column": 1, "value": "54.3 ± 12.1", "is_header": false},
67
+ {"column": 2, "value": "53.8 ± 11.9", "is_header": false},
68
+ {"column": 3, "value": "0.67", "is_header": false}
69
+ ]
70
+ }
71
+ ],
72
+ "markdown_table": "| Characteristic | Treatment (n=245) | Placebo (n=248) | P-value |\n|---|---|---|---|\n| Age, years | 54.3 ± 12.1 | 53.8 ± 11.9 | 0.67 |",
73
+ "raw_text_dump": "Characteristic\tTreatment (n=245)\tPlacebo (n=248)\tP-value\nAge, years\t54.3 ± 12.1\t53.8 ± 11.9\t0.67"
74
+ }
75
+ ```
76
+
77
+ ## Field Definitions
78
+
79
+ ### table_metadata
80
+ - `title`: Table title/heading, verbatim
81
+ - `caption`: Caption text, verbatim
82
+ - `footnotes`: Array of footnote texts, each verbatim
83
+ - `source_page`: Page number if known
84
+ - `has_merged_cells`: True if any cells span multiple rows/columns
85
+ - `row_count`: Total number of data rows (excluding headers)
86
+ - `column_count`: Total number of columns
87
+
88
+ ### structure
89
+ - `header_rows`: How many rows are headers (usually 1)
90
+ - `header_columns`: How many columns are row headers (usually 0 or 1)
91
+ - `merged_cells`: Array of merged cell definitions
92
+
93
+ ### merged_cells format
94
+ ```json
95
+ {
96
+ "start_row": 0,
97
+ "end_row": 0,
98
+ "start_col": 0,
99
+ "end_col": 2,
100
+ "value": "Demographics"
101
+ }
102
+ ```
103
+
104
+ ### headers
105
+ - `columns`: Array of column header texts, verbatim
106
+ - `rows`: Array of row header texts if applicable
107
+
108
+ ### data
109
+ Array of row objects, each containing:
110
+ - `row_index`: 0-based row number
111
+ - `row_header`: Value of first column if it's a row header
112
+ - `cells`: Array of cell objects
113
+
114
+ ### cell object
115
+ ```json
116
+ {
117
+ "column": 0,
118
+ "value": "exact text",
119
+ "is_header": false,
120
+ "superscript": "*",
121
+ "subscript": null,
122
+ "annotation": "†"
123
+ }
124
+ ```
125
+
126
+ ### markdown_table
127
+ Complete markdown table for easy reference. Use `|` separators.
128
+
129
+ ### raw_text_dump
130
+ Tab-separated values for every cell, row by row.
131
+
132
+ ## Handling Complex Tables
133
+
134
+ ### Merged Cells
135
+ When cells span multiple rows or columns:
136
+ 1. Put the value in the first cell position
137
+ 2. Use `"[merged]"` in spanned positions
138
+ 3. Document in `structure.merged_cells`
139
+
140
+ ### Nested Headers
141
+ When column headers have sub-headers:
142
+ ```json
143
+ {
144
+ "headers": {
145
+ "columns": [
146
+ {"level": 0, "value": "Treatment", "span": 2},
147
+ {"level": 1, "value": "Drug A"},
148
+ {"level": 1, "value": "Drug B"}
149
+ ]
150
+ }
151
+ }
152
+ ```
153
+
154
+ ### Footnote References
155
+ Capture superscript markers in cells:
156
+ ```json
157
+ {
158
+ "value": "23.4",
159
+ "superscript": "*†"
160
+ }
161
+ ```
162
+
163
+ ## Quality Checklist
164
+
165
+ Before outputting, verify:
166
+ - [ ] Every visible cell is captured
167
+ - [ ] Numbers match exactly (no rounding)
168
+ - [ ] All symbols preserved (±, ≤, μ, etc.)
169
+ - [ ] Footnote markers captured in correct cells
170
+ - [ ] Merged cells properly documented
171
+ - [ ] Column count matches header count
172
+ - [ ] Row count is accurate
173
+
174
+ ## Example: Complete Extraction
175
+
176
+ **Input:** Table showing clinical trial results with footnotes
177
+
178
+ **Output:**
179
+ ```json
180
+ {
181
+ "extraction_type": "table",
182
+ "table_metadata": {
183
+ "title": "Table 2. Efficacy Outcomes at Week 12",
184
+ "caption": "ITT population. Missing data imputed using LOCF.",
185
+ "footnotes": [
186
+ "* P<0.05 vs placebo",
187
+ "† Adjusted for baseline",
188
+ "‡ n=243 due to missing data"
189
+ ],
190
+ "source_page": 7,
191
+ "has_merged_cells": true,
192
+ "row_count": 8,
193
+ "column_count": 5
194
+ },
195
+ "structure": {
196
+ "header_rows": 2,
197
+ "header_columns": 1,
198
+ "merged_cells": [
199
+ {"start_row": 0, "end_row": 0, "start_col": 1, "end_col": 2, "value": "Treatment"},
200
+ {"start_row": 0, "end_row": 0, "start_col": 3, "end_col": 4, "value": "Placebo"}
201
+ ]
202
+ },
203
+ "headers": {
204
+ "columns": ["Outcome", "Baseline", "Week 12", "Baseline", "Week 12"],
205
+ "rows": ["Primary endpoint", "HbA1c (%)", "FPG (mg/dL)", "Secondary endpoints", "Weight (kg)", "SBP (mmHg)"]
206
+ },
207
+ "data": [
208
+ {
209
+ "row_index": 0,
210
+ "row_header": "HbA1c (%)",
211
+ "cells": [
212
+ {"column": 0, "value": "HbA1c (%)", "is_header": true},
213
+ {"column": 1, "value": "8.2 ± 0.9", "is_header": false},
214
+ {"column": 2, "value": "6.9 ± 0.7*†", "is_header": false, "superscript": "*†"},
215
+ {"column": 3, "value": "8.1 ± 0.8", "is_header": false},
216
+ {"column": 4, "value": "7.8 ± 0.9", "is_header": false}
217
+ ]
218
+ }
219
+ ],
220
+ "markdown_table": "| Outcome | Treatment Baseline | Treatment Week 12 | Placebo Baseline | Placebo Week 12 |\n|---|---|---|---|---|\n| HbA1c (%) | 8.2 ± 0.9 | 6.9 ± 0.7*† | 8.1 ± 0.8 | 7.8 ± 0.9 |",
221
+ "raw_text_dump": "Outcome\tTreatment Baseline\tTreatment Week 12\tPlacebo Baseline\tPlacebo Week 12\nHbA1c (%)\t8.2 ± 0.9\t6.9 ± 0.7*†\t8.1 ± 0.8\t7.8 ± 0.9"
222
+ }
223
+ ```
224
+
225
+ ## Output Rules
226
+
227
+ 1. Return ONLY the JSON object
228
+ 2. No markdown code fences around the JSON
229
+ 3. No explanatory text
230
+ 4. All text values must be verbatim from the image
231
+ 5. Use `null` for missing optional fields, not empty strings