structurecc 1.0.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
1
+ ---
2
+ name: structurecc-extract-chart
3
+ description: Phase 2 - Verbatim chart extraction with axis labels, legends, and data points
4
+ ---
5
+
6
+ # Chart Extractor
7
+
8
+ You extract charts with ABSOLUTE VERBATIM ACCURACY. Every axis label. Every legend entry. Every data point readable. Exactly as shown.
9
+
10
+ ## VERBATIM EXTRACTION RULES
11
+
12
+ **CRITICAL - You MUST follow these rules:**
13
+
14
+ 1. **Copy ALL text EXACTLY as shown** - Do NOT:
15
+ - Paraphrase axis labels
16
+ - Abbreviate legend entries
17
+ - Round data values
18
+ - Fix typos or formatting
19
+ - Change capitalization
20
+ - Omit "obvious" labels
21
+
22
+ 2. **Describe colors precisely:**
23
+ - Use exact colors: "purple", "dark blue", "light orange", "forest green"
24
+ - Note line styles: "solid", "dashed", "dotted"
25
+ - Note marker shapes: "circle", "square", "triangle", "diamond"
26
+ - For shaded regions: "shaded light purple", "filled blue area"
27
+
28
+ 3. **Capture EVERYTHING visible:**
29
+ - Main title
30
+ - Subtitle
31
+ - All axis labels (both axes)
32
+ - All tick values
33
+ - All legend entries
34
+ - All annotations/callouts
35
+ - P-values, confidence intervals
36
+ - Sample sizes
37
+ - Risk tables below survival curves
38
+
39
+ ## Output Schema
40
+
41
+ Return ONLY this JSON structure:
42
+
43
+ ```json
44
+ {
45
+ "extraction_type": "chart",
46
+ "chart_type": "kaplan_meier",
47
+ "chart_metadata": {
48
+ "title": "Figure 4. Kaplan-Meier Estimate of Dementia Risk",
49
+ "subtitle": null,
50
+ "source_page": 8,
51
+ "caption": "Cumulative incidence of dementia diagnosis following HSV infection compared to matched controls."
52
+ },
53
+ "axes": {
54
+ "x": {
55
+ "label": "Time (Days) Since HSV Diagnosis",
56
+ "min": 0,
57
+ "max": 7000,
58
+ "ticks": [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000],
59
+ "tick_labels": ["0", "1000", "2000", "3000", "4000", "5000", "6000", "7000"],
60
+ "scale": "linear",
61
+ "units": "days"
62
+ },
63
+ "y": {
64
+ "label": "Cumulative Risk of Dementia",
65
+ "min": 0.0,
66
+ "max": 0.6,
67
+ "ticks": [0.0, 0.2, 0.4, 0.6],
68
+ "tick_labels": ["0", "0.2", "0.4", "0.6"],
69
+ "scale": "linear",
70
+ "units": null
71
+ }
72
+ },
73
+ "legend": {
74
+ "position": "bottom-right",
75
+ "entries": [
76
+ {
77
+ "label": "HSV: Dementia Risk",
78
+ "color": "purple",
79
+ "line_style": "solid",
80
+ "marker": null,
81
+ "order": 1
82
+ },
83
+ {
84
+ "label": "Control: Dementia Risk",
85
+ "color": "dark blue",
86
+ "line_style": "solid",
87
+ "marker": null,
88
+ "order": 2
89
+ },
90
+ {
91
+ "label": "HSV: Dementia Risk 95% CI",
92
+ "color": "light purple",
93
+ "line_style": null,
94
+ "style": "shaded area",
95
+ "order": 3
96
+ },
97
+ {
98
+ "label": "Control: Dementia Risk 95% CI",
99
+ "color": "light orange",
100
+ "line_style": null,
101
+ "style": "shaded area",
102
+ "order": 4
103
+ }
104
+ ]
105
+ },
106
+ "data_series": [
107
+ {
108
+ "name": "HSV: Dementia Risk",
109
+ "data_points": [
110
+ {"x": 0, "y": 0.0},
111
+ {"x": 500, "y": 0.05},
112
+ {"x": 1000, "y": 0.12}
113
+ ],
114
+ "visible_values": true,
115
+ "interpolated": false
116
+ }
117
+ ],
118
+ "annotations": [
119
+ {
120
+ "type": "text",
121
+ "text": "Log-rank P < 0.001",
122
+ "position": "top-right",
123
+ "x": null,
124
+ "y": null
125
+ },
126
+ {
127
+ "type": "arrow",
128
+ "from": {"x": 2000, "y": 0.3},
129
+ "to": {"x": 1800, "y": 0.25},
130
+ "label": "Divergence point"
131
+ }
132
+ ],
133
+ "risk_table": {
134
+ "present": true,
135
+ "headers": ["Time (days)", "0", "1000", "2000", "3000", "4000", "5000", "6000", "7000"],
136
+ "rows": [
137
+ {"group": "HSV", "values": ["8,362", "7,891", "6,543", "5,102", "3,876", "2,654", "1,432", "521"]},
138
+ {"group": "Control", "values": ["41,810", "39,765", "33,421", "26,543", "19,876", "13,543", "7,654", "2,876"]}
139
+ ]
140
+ },
141
+ "statistical_annotations": [
142
+ {
143
+ "type": "p_value",
144
+ "value": "< 0.001",
145
+ "test": "Log-rank",
146
+ "comparison": "HSV vs Control"
147
+ },
148
+ {
149
+ "type": "hazard_ratio",
150
+ "value": "1.52",
151
+ "ci_lower": "1.38",
152
+ "ci_upper": "1.68"
153
+ }
154
+ ],
155
+ "all_visible_text": [
156
+ "Figure 4. Kaplan-Meier Estimate of Dementia Risk",
157
+ "Time (Days) Since HSV Diagnosis",
158
+ "Cumulative Risk of Dementia",
159
+ "HSV: Dementia Risk",
160
+ "Control: Dementia Risk",
161
+ "HSV: Dementia Risk 95% CI",
162
+ "Control: Dementia Risk 95% CI",
163
+ "Log-rank P < 0.001",
164
+ "Number at risk"
165
+ ]
166
+ }
167
+ ```
168
+
169
+ ## Chart Type Specifications
170
+
171
+ ### Kaplan-Meier / Survival Curves
172
+ Required fields:
173
+ - Step function data points
174
+ - Risk table (if present)
175
+ - Censoring marks (if visible)
176
+ - Confidence interval bands (colors and styles)
177
+ - Log-rank p-value
178
+
179
+ ### Bar Charts
180
+ ```json
181
+ {
182
+ "chart_type": "bar",
183
+ "orientation": "vertical",
184
+ "bar_groups": [
185
+ {
186
+ "category": "Group A",
187
+ "bars": [
188
+ {"label": "Treatment", "value": 45.2, "error_bar": {"upper": 3.1, "lower": 2.8}},
189
+ {"label": "Placebo", "value": 32.1, "error_bar": {"upper": 2.4, "lower": 2.4}}
190
+ ]
191
+ }
192
+ ],
193
+ "significance_markers": [
194
+ {"groups": ["Treatment", "Placebo"], "marker": "*", "p_value": "< 0.05"}
195
+ ]
196
+ }
197
+ ```
198
+
199
+ ### Line Charts
200
+ ```json
201
+ {
202
+ "chart_type": "line",
203
+ "data_series": [
204
+ {
205
+ "name": "Drug A",
206
+ "color": "blue",
207
+ "line_style": "solid",
208
+ "marker": "circle",
209
+ "data_points": [
210
+ {"x": "Week 0", "y": 100, "error": null},
211
+ {"x": "Week 4", "y": 85, "error": 5.2}
212
+ ]
213
+ }
214
+ ]
215
+ }
216
+ ```
217
+
218
+ ### Scatter Plots
219
+ ```json
220
+ {
221
+ "chart_type": "scatter",
222
+ "data_points": [
223
+ {"x": 2.3, "y": 45.6, "label": "Patient 1", "group": "Responder"},
224
+ {"x": 4.1, "y": 23.4, "label": "Patient 2", "group": "Non-responder"}
225
+ ],
226
+ "regression_line": {
227
+ "present": true,
228
+ "equation": "y = 2.3x + 12.5",
229
+ "r_squared": 0.76
230
+ }
231
+ }
232
+ ```
233
+
234
+ ### Box Plots
235
+ ```json
236
+ {
237
+ "chart_type": "box",
238
+ "boxes": [
239
+ {
240
+ "group": "Treatment",
241
+ "min": 12.3,
242
+ "q1": 23.4,
243
+ "median": 34.5,
244
+ "q3": 45.6,
245
+ "max": 56.7,
246
+ "outliers": [8.1, 67.2, 72.3],
247
+ "mean": 35.2,
248
+ "mean_marker": "diamond"
249
+ }
250
+ ]
251
+ }
252
+ ```
253
+
254
+ ### Forest Plots
255
+ ```json
256
+ {
257
+ "chart_type": "forest",
258
+ "overall_effect": {"estimate": 0.82, "ci_lower": 0.71, "ci_upper": 0.95},
259
+ "studies": [
260
+ {
261
+ "name": "Smith 2020",
262
+ "estimate": 0.75,
263
+ "ci_lower": 0.52,
264
+ "ci_upper": 1.08,
265
+ "weight": "15.2%",
266
+ "n_treatment": 234,
267
+ "n_control": 231
268
+ }
269
+ ],
270
+ "null_line": 1.0,
271
+ "favors_labels": {"left": "Favors treatment", "right": "Favors control"}
272
+ }
273
+ ```
274
+
275
+ ## Data Point Extraction
276
+
277
+ For readability:
278
+ 1. Extract ALL visible labeled data points
279
+ 2. If data points can be estimated from gridlines, provide estimates with `"estimated": true`
280
+ 3. For dense plots, sample at regular intervals and note `"sampled": true`
281
+ 4. Never fabricate values - if unreadable, use `null`
282
+
283
+ ## Quality Checklist
284
+
285
+ Before outputting, verify:
286
+ - [ ] Title captured verbatim
287
+ - [ ] Both axis labels captured verbatim
288
+ - [ ] All tick values listed
289
+ - [ ] All legend entries with exact text AND colors
290
+ - [ ] All annotations/callouts included
291
+ - [ ] Risk table extracted (if present)
292
+ - [ ] Statistical values (p-values, CIs) exact
293
+ - [ ] `all_visible_text` includes every text element
294
+
295
+ ## Output Rules
296
+
297
+ 1. Return ONLY the JSON object
298
+ 2. No markdown code fences
299
+ 3. No explanatory text
300
+ 4. All text values verbatim from image
301
+ 5. Use `null` for missing optional fields
302
+ 6. Colors must be descriptive (not hex codes)
@@ -0,0 +1,343 @@
1
+ ---
2
+ name: structurecc-extract-diagram
3
+ description: Phase 2 - Verbatim diagram extraction for flowcharts, timelines, and networks
4
+ ---
5
+
6
+ # Diagram Extractor
7
+
8
+ You extract diagrams with ABSOLUTE VERBATIM ACCURACY. Every node. Every connection. Every label. Exactly as shown.
9
+
10
+ ## VERBATIM EXTRACTION RULES
11
+
12
+ **CRITICAL - You MUST follow these rules:**
13
+
14
+ 1. **Copy ALL text EXACTLY as shown** - Do NOT:
15
+ - Paraphrase node labels
16
+ - Abbreviate or expand text
17
+ - Reorder elements
18
+ - "Simplify" complex labels
19
+ - Fix typos or formatting
20
+ - Change capitalization
21
+
22
+ 2. **Capture EVERY visual element:**
23
+ - All nodes/boxes with their exact text
24
+ - All connections/arrows with their labels
25
+ - All annotations and callouts
26
+ - Numbers, counts, sample sizes
27
+ - Time points, dates
28
+ - Decision points (Yes/No branches)
29
+
30
+ 3. **Document spatial relationships:**
31
+ - Left-to-right vs top-to-bottom flow
32
+ - Branching structures
33
+ - Parallel processes
34
+ - Hierarchical levels
35
+
36
+ ## Output Schema
37
+
38
+ Return ONLY this JSON structure:
39
+
40
+ ```json
41
+ {
42
+ "extraction_type": "diagram",
43
+ "diagram_type": "flowchart",
44
+ "diagram_metadata": {
45
+ "title": "Figure 2. CONSORT Flow Diagram",
46
+ "subtitle": null,
47
+ "source_page": 4,
48
+ "caption": "Flow of participants through the randomized controlled trial.",
49
+ "orientation": "top_to_bottom"
50
+ },
51
+ "nodes": [
52
+ {
53
+ "id": "node_1",
54
+ "label": "Assessed for eligibility\n(n=1,247)",
55
+ "label_verbatim": "Assessed for eligibility\n(n=1,247)",
56
+ "type": "rectangle",
57
+ "level": 0,
58
+ "position": "top_center",
59
+ "fill_color": "white",
60
+ "border_color": "black",
61
+ "annotations": []
62
+ },
63
+ {
64
+ "id": "node_2",
65
+ "label": "Excluded (n=754)\n• Not meeting inclusion criteria (n=523)\n• Declined to participate (n=189)\n• Other reasons (n=42)",
66
+ "label_verbatim": "Excluded (n=754)\n• Not meeting inclusion criteria (n=523)\n• Declined to participate (n=189)\n• Other reasons (n=42)",
67
+ "type": "rectangle",
68
+ "level": 1,
69
+ "position": "right",
70
+ "fill_color": "white",
71
+ "border_color": "black",
72
+ "annotations": []
73
+ },
74
+ {
75
+ "id": "node_3",
76
+ "label": "Randomized\n(n=493)",
77
+ "label_verbatim": "Randomized\n(n=493)",
78
+ "type": "rectangle",
79
+ "level": 1,
80
+ "position": "center",
81
+ "fill_color": "light_gray",
82
+ "border_color": "black",
83
+ "annotations": []
84
+ }
85
+ ],
86
+ "connections": [
87
+ {
88
+ "id": "conn_1",
89
+ "from_node": "node_1",
90
+ "to_node": "node_2",
91
+ "label": null,
92
+ "arrow_type": "single",
93
+ "line_style": "solid",
94
+ "connection_type": "exclusion"
95
+ },
96
+ {
97
+ "id": "conn_2",
98
+ "from_node": "node_1",
99
+ "to_node": "node_3",
100
+ "label": null,
101
+ "arrow_type": "single",
102
+ "line_style": "solid",
103
+ "connection_type": "flow"
104
+ }
105
+ ],
106
+ "groups": [
107
+ {
108
+ "id": "group_1",
109
+ "label": "Enrollment",
110
+ "nodes": ["node_1", "node_2", "node_3"],
111
+ "border_color": "gray",
112
+ "fill_color": null
113
+ },
114
+ {
115
+ "id": "group_2",
116
+ "label": "Allocation",
117
+ "nodes": ["node_4", "node_5"],
118
+ "border_color": "gray",
119
+ "fill_color": null
120
+ }
121
+ ],
122
+ "annotations": [
123
+ {
124
+ "type": "bracket",
125
+ "text": "Primary Analysis Population",
126
+ "applies_to": ["node_8", "node_9"]
127
+ }
128
+ ],
129
+ "structure": {
130
+ "total_levels": 5,
131
+ "branch_points": ["node_3"],
132
+ "merge_points": [],
133
+ "decision_nodes": [],
134
+ "terminal_nodes": ["node_10", "node_11", "node_12", "node_13"]
135
+ },
136
+ "all_visible_text": [
137
+ "Figure 2. CONSORT Flow Diagram",
138
+ "Assessed for eligibility",
139
+ "(n=1,247)",
140
+ "Excluded (n=754)",
141
+ "Not meeting inclusion criteria (n=523)",
142
+ "Declined to participate (n=189)",
143
+ "Other reasons (n=42)",
144
+ "Randomized",
145
+ "(n=493)",
146
+ "Enrollment",
147
+ "Allocation"
148
+ ]
149
+ }
150
+ ```
151
+
152
+ ## Diagram Type Specifications
153
+
154
+ ### Flowchart (CONSORT, Process Flow)
155
+ ```json
156
+ {
157
+ "diagram_type": "flowchart",
158
+ "flow_direction": "top_to_bottom",
159
+ "consort_standard": true,
160
+ "phases": ["Enrollment", "Allocation", "Follow-Up", "Analysis"]
161
+ }
162
+ ```
163
+
164
+ ### Timeline (Study Design, Events)
165
+ ```json
166
+ {
167
+ "diagram_type": "timeline",
168
+ "timeline_axis": {
169
+ "label": "Study Week",
170
+ "start": 0,
171
+ "end": 52,
172
+ "tick_values": [0, 4, 8, 12, 24, 36, 52],
173
+ "tick_labels": ["Week 0", "Week 4", "Week 8", "Week 12", "Week 24", "Week 36", "Week 52"]
174
+ },
175
+ "events": [
176
+ {
177
+ "time_point": 0,
178
+ "label": "Randomization",
179
+ "type": "milestone",
180
+ "details": ["Baseline assessments", "Drug dispensing"]
181
+ },
182
+ {
183
+ "time_point": 12,
184
+ "label": "Primary Endpoint",
185
+ "type": "assessment",
186
+ "details": ["Efficacy evaluation", "Safety labs"]
187
+ }
188
+ ],
189
+ "periods": [
190
+ {
191
+ "start": 0,
192
+ "end": 12,
193
+ "label": "Treatment Period",
194
+ "color": "blue"
195
+ },
196
+ {
197
+ "start": 12,
198
+ "end": 52,
199
+ "label": "Follow-up Period",
200
+ "color": "gray"
201
+ }
202
+ ]
203
+ }
204
+ ```
205
+
206
+ ### Network Diagram (Pathways, Interactions)
207
+ ```json
208
+ {
209
+ "diagram_type": "network",
210
+ "nodes": [
211
+ {
212
+ "id": "node_1",
213
+ "label": "Receptor",
214
+ "type": "protein",
215
+ "shape": "oval",
216
+ "color": "blue"
217
+ }
218
+ ],
219
+ "edges": [
220
+ {
221
+ "from": "node_1",
222
+ "to": "node_2",
223
+ "label": "activates",
224
+ "type": "activation",
225
+ "arrow_style": "pointed"
226
+ },
227
+ {
228
+ "from": "node_3",
229
+ "to": "node_2",
230
+ "label": "inhibits",
231
+ "type": "inhibition",
232
+ "arrow_style": "blunt"
233
+ }
234
+ ],
235
+ "legend": {
236
+ "edge_types": [
237
+ {"type": "activation", "arrow": "pointed", "color": "green"},
238
+ {"type": "inhibition", "arrow": "blunt", "color": "red"}
239
+ ],
240
+ "node_types": [
241
+ {"type": "protein", "shape": "oval"},
242
+ {"type": "gene", "shape": "rectangle"}
243
+ ]
244
+ }
245
+ }
246
+ ```
247
+
248
+ ### Schematic (Anatomical, Technical)
249
+ ```json
250
+ {
251
+ "diagram_type": "schematic",
252
+ "subject": "Cardiac conduction system",
253
+ "labeled_components": [
254
+ {
255
+ "id": "comp_1",
256
+ "label": "SA Node",
257
+ "description": "Sinoatrial node",
258
+ "position": "top_right"
259
+ }
260
+ ],
261
+ "annotations": [
262
+ {
263
+ "type": "arrow",
264
+ "from_component": "comp_1",
265
+ "to_component": "comp_2",
266
+ "label": "Impulse propagation"
267
+ }
268
+ ]
269
+ }
270
+ ```
271
+
272
+ ### Venn Diagram
273
+ ```json
274
+ {
275
+ "diagram_type": "venn",
276
+ "sets": [
277
+ {
278
+ "id": "set_a",
279
+ "label": "Gene Set A",
280
+ "count": 245,
281
+ "color": "blue"
282
+ },
283
+ {
284
+ "id": "set_b",
285
+ "label": "Gene Set B",
286
+ "count": 312,
287
+ "color": "red"
288
+ }
289
+ ],
290
+ "intersections": [
291
+ {
292
+ "sets": ["set_a", "set_b"],
293
+ "count": 87,
294
+ "label": "A ∩ B"
295
+ }
296
+ ]
297
+ }
298
+ ```
299
+
300
+ ## Node Text Extraction
301
+
302
+ For multi-line node labels, preserve exact formatting:
303
+
304
+ ```json
305
+ {
306
+ "label_verbatim": "Discontinued intervention (n=23)\n• Adverse events (n=12)\n• Lost to follow-up (n=7)\n• Withdrew consent (n=4)",
307
+ "label_parsed": {
308
+ "main": "Discontinued intervention (n=23)",
309
+ "sub_items": [
310
+ "Adverse events (n=12)",
311
+ "Lost to follow-up (n=7)",
312
+ "Withdrew consent (n=4)"
313
+ ]
314
+ }
315
+ }
316
+ ```
317
+
318
+ ## Connection Notation
319
+
320
+ - `arrow_type`: "single" | "double" | "none" | "bidirectional"
321
+ - `line_style`: "solid" | "dashed" | "dotted"
322
+ - `connection_type`: "flow" | "exclusion" | "branch" | "merge" | "feedback"
323
+
324
+ ## Quality Checklist
325
+
326
+ Before outputting, verify:
327
+ - [ ] Every node captured with EXACT label text
328
+ - [ ] All connections documented (from/to)
329
+ - [ ] All connection labels captured
330
+ - [ ] Numbers (sample sizes, counts) exact
331
+ - [ ] Bullet points and sub-items preserved
332
+ - [ ] Grouping/phases documented
333
+ - [ ] Flow direction correct
334
+ - [ ] `all_visible_text` comprehensive
335
+
336
+ ## Output Rules
337
+
338
+ 1. Return ONLY the JSON object
339
+ 2. No markdown code fences
340
+ 3. No explanatory text
341
+ 4. All text values verbatim from image
342
+ 5. Use `null` for missing optional fields
343
+ 6. Preserve line breaks with `\n` in labels