structurecc 1.0.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +154 -67
- package/agents/structurecc-classifier.md +135 -0
- package/agents/structurecc-extract-chart.md +302 -0
- package/agents/structurecc-extract-diagram.md +343 -0
- package/agents/structurecc-extract-generic.md +248 -0
- package/agents/structurecc-extract-heatmap.md +322 -0
- package/agents/structurecc-extract-multipanel.md +310 -0
- package/agents/structurecc-extract-table.md +231 -0
- package/agents/structurecc-verifier.md +265 -0
- package/bin/install.js +82 -18
- package/commands/structure/structure.md +434 -112
- package/package.json +9 -5
- package/agents/structurecc-extractor.md +0 -70
|
@@ -1,31 +1,55 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: structure
|
|
3
|
-
description: Extract structured data from PDFs and Word docs using AI agent swarms
|
|
3
|
+
description: Extract structured data from PDFs and Word docs using AI agent swarms with verbatim accuracy
|
|
4
4
|
arguments:
|
|
5
5
|
- name: path
|
|
6
6
|
description: Path to document (PDF, DOCX, or image)
|
|
7
7
|
required: true
|
|
8
8
|
---
|
|
9
9
|
|
|
10
|
-
# /structure - Agentic Document Extraction
|
|
10
|
+
# /structure - Agentic Document Extraction v2.0
|
|
11
11
|
|
|
12
|
-
Turn complex documents into structured markdown using
|
|
12
|
+
Turn complex documents into structured JSON + markdown using a 3-phase pipeline with verification.
|
|
13
13
|
|
|
14
|
-
## Overview
|
|
14
|
+
## Pipeline Overview
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
2
|
|
18
|
-
|
|
19
|
-
|
|
16
|
+
```
|
|
17
|
+
Image → [Phase 1: Classify] → [Phase 2: Extract] → [Phase 3: Verify] → Output
|
|
18
|
+
↑__________REVISION LOOP__________↓
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
1. **Classify** - Identify visual element type (table, chart, heatmap, diagram, etc.)
|
|
22
|
+
2. **Extract** - Use specialized extractor for that type with verbatim accuracy
|
|
23
|
+
3. **Verify** - Score extraction quality; trigger a revision if the overall score is below 0.90
|
|
20
24
|
|
|
21
|
-
## Step 1: Setup
|
|
25
|
+
## Step 1: Setup Output Directory
|
|
22
26
|
|
|
23
27
|
Create output directory next to the document:
|
|
24
28
|
```
|
|
25
29
|
<document_name>_extracted/
|
|
26
|
-
├── images/
|
|
27
|
-
├──
|
|
28
|
-
|
|
30
|
+
├── images/ # Raw extracted images
|
|
31
|
+
├── classifications/ # Phase 1: type detection results
|
|
32
|
+
├── extractions/ # Phase 2: JSON extractions
|
|
33
|
+
├── verifications/ # Phase 3: quality scores
|
|
34
|
+
├── elements/ # Final markdown per element
|
|
35
|
+
├── STRUCTURED.md # Combined markdown output
|
|
36
|
+
└── extraction_report.json # Quality metrics summary
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
import os
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from datetime import datetime
|
|
43
|
+
|
|
44
|
+
doc_path = "<document_path>"
source_doc = Path(doc_path)
doc_name = source_doc.stem

# The output tree is created as a sibling of the source document.
output_dir = source_doc.parent / f"{doc_name}_extracted"

# One folder per pipeline phase, plus the final per-element markdown.
phase_folders = ("images", "classifications", "extractions", "verifications", "elements")
for folder in phase_folders:
    target = output_dir / folder
    target.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {output_dir}")
|
|
29
53
|
```
|
|
30
54
|
|
|
31
55
|
## Step 2: Extract Images
|
|
@@ -34,12 +58,11 @@ Create output directory next to the document:
|
|
|
34
58
|
|
|
35
59
|
```python
|
|
36
60
|
import fitz
|
|
37
|
-
import
|
|
61
|
+
import json
|
|
38
62
|
|
|
39
63
|
pdf_path = "<document_path>"
|
|
40
|
-
output_dir = "<output_dir>"
|
|
41
|
-
images_dir =
|
|
42
|
-
os.makedirs(images_dir, exist_ok=True)
|
|
64
|
+
output_dir = Path("<output_dir>")
|
|
65
|
+
images_dir = output_dir / "images"
|
|
43
66
|
|
|
44
67
|
doc = fitz.open(pdf_path)
|
|
45
68
|
extracted = []
|
|
@@ -53,11 +76,22 @@ for page_num in range(len(doc)):
|
|
|
53
76
|
pix = fitz.Pixmap(fitz.csRGB, pix)
|
|
54
77
|
|
|
55
78
|
img_name = f"p{page_num + 1}_img{img_idx + 1}.png"
|
|
56
|
-
|
|
57
|
-
|
|
79
|
+
img_path = images_dir / img_name
|
|
80
|
+
pix.save(str(img_path))
|
|
81
|
+
extracted.append({
|
|
82
|
+
"id": f"element_{len(extracted) + 1:03d}",
|
|
83
|
+
"path": str(img_path),
|
|
84
|
+
"page": page_num + 1,
|
|
85
|
+
"name": img_name
|
|
86
|
+
})
|
|
58
87
|
pix = None
|
|
59
88
|
|
|
60
89
|
doc.close()
|
|
90
|
+
|
|
91
|
+
# Save image manifest
|
|
92
|
+
with open(output_dir / "image_manifest.json", "w") as f:
|
|
93
|
+
json.dump(extracted, f, indent=2)
|
|
94
|
+
|
|
61
95
|
print(f"Extracted {len(extracted)} images")
|
|
62
96
|
```
|
|
63
97
|
|
|
@@ -65,164 +99,434 @@ print(f"Extracted {len(extracted)} images")
|
|
|
65
99
|
|
|
66
100
|
```python
|
|
67
101
|
import json
from pathlib import Path
from zipfile import ZipFile

docx_path = "<document_path>"
output_dir = Path("<output_dir>")
images_dir = output_dir / "images"
# Harmless if Step 1 already created it; lets this snippet run on its own.
images_dir.mkdir(parents=True, exist_ok=True)

extracted = []
with ZipFile(docx_path, 'r') as z:
    for member in z.namelist():
        if not member.startswith('word/media/'):
            continue

        # Archive member names use "/" separators, so split on that rather
        # than relying on os.path (which this snippet no longer imports).
        name = member.rsplit('/', 1)[-1]
        if not name:
            # A bare "word/media/" directory entry has no file to copy.
            continue

        path = images_dir / name
        with z.open(member) as src, open(path, 'wb') as dst:
            dst.write(src.read())

        extracted.append({
            "id": f"element_{len(extracted) + 1:03d}",
            "path": str(path),
            "name": name
        })

# Save image manifest
with open(output_dir / "image_manifest.json", "w", encoding="utf-8") as f:
    json.dump(extracted, f, indent=2)

print(f"Extracted {len(extracted)} images")
|
|
86
126
|
```
|
|
87
127
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
Also extract main text:
|
|
91
|
-
- PDF: `page.get_text()` for each page
|
|
92
|
-
- DOCX: `textutil -convert txt "<path>" -stdout`
|
|
93
|
-
|
|
94
|
-
## Step 3: Spawn Agent Swarm
|
|
128
|
+
## Step 3: Phase 1 - Classification (Parallel)
|
|
95
129
|
|
|
96
|
-
**CRITICAL:** Launch ALL agents in ONE message
|
|
130
|
+
**CRITICAL:** Launch ALL classification agents in ONE message.
|
|
97
131
|
|
|
98
|
-
For EACH extracted image:
|
|
132
|
+
For EACH extracted image, spawn a classification agent:
|
|
99
133
|
|
|
100
134
|
```
|
|
101
135
|
Task(
|
|
102
136
|
subagent_type: "general-purpose",
|
|
103
|
-
|
|
137
|
+
model: "haiku", # Fast classification
|
|
138
|
+
description: "Classify element [N]",
|
|
104
139
|
prompt: """
|
|
105
|
-
You are
|
|
140
|
+
You are a visual element classifier. Read the agent instructions from:
|
|
141
|
+
~/.claude/agents/structurecc-classifier.md
|
|
106
142
|
|
|
107
143
|
**Image:** <full_path_to_image>
|
|
108
|
-
**
|
|
109
|
-
**Output:** Write to <output_dir>/
|
|
144
|
+
**Element ID:** <element_id>
|
|
145
|
+
**Output:** Write JSON to <output_dir>/classifications/<element_id>_class.json
|
|
146
|
+
|
|
147
|
+
Analyze the image and output the classification JSON as specified in the agent file.
|
|
148
|
+
"""
|
|
149
|
+
)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
For example, 10 images means 10 Task calls in ONE message, so they all run in parallel.
|
|
110
153
|
|
|
111
|
-
##
|
|
154
|
+
## Step 4: Phase 2 - Specialized Extraction (Parallel)
|
|
112
155
|
|
|
113
|
-
|
|
114
|
-
2. Identify what it contains (table, figure, chart, heatmap, diagram, etc.)
|
|
115
|
-
3. Extract ALL visible data - be exhaustive
|
|
116
|
-
4. Structure as clean markdown
|
|
156
|
+
After classifications complete, read each classification file and dispatch to the correct extractor.
|
|
117
157
|
|
|
118
|
-
|
|
158
|
+
**Extractor Routing:**
|
|
119
159
|
|
|
120
|
-
|
|
160
|
+
| Classification | Extractor Agent |
|
|
161
|
+
|---------------|-----------------|
|
|
162
|
+
| `table_simple`, `table_complex` | `structurecc-extract-table.md` |
|
|
163
|
+
| `chart_*` (all chart types) | `structurecc-extract-chart.md` |
|
|
164
|
+
| `heatmap` | `structurecc-extract-heatmap.md` |
|
|
165
|
+
| `diagram_*` (all diagram types) | `structurecc-extract-diagram.md` |
|
|
166
|
+
| `multi_panel` | `structurecc-extract-multipanel.md` |
|
|
167
|
+
| Everything else | `structurecc-extract-generic.md` |
|
|
121
168
|
|
|
122
|
-
|
|
123
|
-
# [Descriptive Title]
|
|
169
|
+
For EACH element, spawn the appropriate extractor:
|
|
124
170
|
|
|
125
|
-
|
|
126
|
-
|
|
171
|
+
```
|
|
172
|
+
Task(
|
|
173
|
+
subagent_type: "general-purpose",
|
|
174
|
+
model: "opus", # Best quality for extraction
|
|
175
|
+
description: "Extract element [N]",
|
|
176
|
+
prompt: """
|
|
177
|
+
You are extracting structured data from a visual element.
|
|
178
|
+
|
|
179
|
+
Read the agent instructions from:
|
|
180
|
+
~/.claude/agents/<appropriate_extractor>.md
|
|
127
181
|
|
|
128
|
-
|
|
182
|
+
**Image:** <full_path_to_image>
|
|
183
|
+
**Element ID:** <element_id>
|
|
184
|
+
**Classification:** <classification_type>
|
|
185
|
+
**Source:** Page <N> of <document_name>
|
|
186
|
+
**Output:** Write JSON to <output_dir>/extractions/<element_id>.json
|
|
129
187
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
188
|
+
Follow the extractor instructions EXACTLY. Output ONLY valid JSON.
|
|
189
|
+
Remember: VERBATIM extraction only. Copy text exactly as shown.
|
|
190
|
+
"""
|
|
191
|
+
)
|
|
192
|
+
```
|
|
135
193
|
|
|
136
|
-
|
|
194
|
+
Launch ALL extractions in ONE message for parallel processing.
|
|
137
195
|
|
|
138
|
-
|
|
196
|
+
## Step 5: Phase 3 - Verification (Parallel)
|
|
139
197
|
|
|
140
|
-
|
|
198
|
+
After extractions complete, verify each extraction:
|
|
141
199
|
|
|
142
|
-
[Confidence level, unclear items marked with [?]]
|
|
143
200
|
```
|
|
201
|
+
Task(
|
|
202
|
+
subagent_type: "general-purpose",
|
|
203
|
+
model: "sonnet", # Good balance for verification
|
|
204
|
+
description: "Verify element [N]",
|
|
205
|
+
prompt: """
|
|
206
|
+
You are verifying an extraction against its source image.
|
|
207
|
+
|
|
208
|
+
Read the agent instructions from:
|
|
209
|
+
~/.claude/agents/structurecc-verifier.md
|
|
144
210
|
|
|
145
|
-
|
|
211
|
+
**Source Image:** <full_path_to_image>
|
|
212
|
+
**Extraction JSON:** <output_dir>/extractions/<element_id>.json
|
|
213
|
+
**Element ID:** <element_id>
|
|
214
|
+
**Output:** Write JSON to <output_dir>/verifications/<element_id>_verify.json
|
|
215
|
+
|
|
216
|
+
Compare the extraction to the source image and produce a verification report.
|
|
146
217
|
"""
|
|
147
218
|
)
|
|
148
219
|
```
|
|
149
220
|
|
|
150
|
-
Launch
|
|
221
|
+
Launch ALL verifications in ONE message.
|
|
151
222
|
|
|
152
|
-
## Step
|
|
223
|
+
## Step 6: Revision Loop
|
|
153
224
|
|
|
154
|
-
|
|
225
|
+
After verifications complete, check for failures:
|
|
155
226
|
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
227
|
+
```python
|
|
228
|
+
import json
|
|
229
|
+
from pathlib import Path
|
|
230
|
+
|
|
231
|
+
output_dir = Path("<output_dir>")
verifications_dir = output_dir / "verifications"

# A failed element is re-extracted up to this many times, then sent to a human.
MAX_REVISIONS = 2


def triage_verifications(verifications_dir, max_revisions=MAX_REVISIONS):
    """Sort verification reports into passed / needs-revision / human-review.

    Args:
        verifications_dir: Directory holding ``*_verify.json`` reports.
        max_revisions: Revision attempts allowed before escalating.

    Returns:
        A ``(passed, needs_revision, needs_human_review)`` tuple. ``passed``
        and ``needs_human_review`` are lists of element IDs;
        ``needs_revision`` is a list of dicts with ``element_id``,
        ``feedback`` and ``score`` keys.
    """
    passed = []
    needs_revision = []
    needs_human_review = []

    # Sorted so the triage order is reproducible; directory listing order
    # is otherwise OS-dependent.
    for verify_file in sorted(Path(verifications_dir).glob("*_verify.json")):
        with open(verify_file, encoding="utf-8") as f:
            result = json.load(f)

        # Fall back to the file name so a report without an ID is still routed.
        element_id = result.get("element_id") or verify_file.name[: -len("_verify.json")]

        if result.get("needs_human_review"):
            needs_human_review.append(element_id)
            continue
        if result.get("pass"):
            passed.append(element_id)
            continue

        # `or {}` also covers an explicit JSON null, which .get(key, {}) does not.
        feedback = result.get("revision_feedback") or {}
        revision_num = feedback.get("revision_number") or 0
        if revision_num < max_revisions:
            needs_revision.append({
                "element_id": element_id,
                "feedback": feedback,
                "score": (result.get("scores") or {}).get("overall"),
            })
        else:
            needs_human_review.append(element_id)

    return passed, needs_revision, needs_human_review


passed, needs_revision, needs_human_review = triage_verifications(verifications_dir)

print(f"Passed: {len(passed)}")
print(f"Needs revision: {len(needs_revision)}")
print(f"Needs human review: {len(needs_human_review)}")
|
|
262
|
+
```
|
|
160
263
|
|
|
161
|
-
|
|
264
|
+
For elements needing revision, re-run extraction with specific feedback:
|
|
162
265
|
|
|
163
|
-
[Full text extracted from document, preserving structure]
|
|
164
266
|
```
|
|
267
|
+
Task(
|
|
268
|
+
subagent_type: "general-purpose",
|
|
269
|
+
model: "opus",
|
|
270
|
+
description: "Re-extract element [N] (revision)",
|
|
271
|
+
prompt: """
|
|
272
|
+
REVISION EXTRACTION - Previous extraction failed verification.
|
|
165
273
|
|
|
166
|
-
|
|
274
|
+
Read the agent instructions from:
|
|
275
|
+
~/.claude/agents/<appropriate_extractor>.md
|
|
167
276
|
|
|
168
|
-
|
|
277
|
+
**Image:** <full_path_to_image>
|
|
278
|
+
**Element ID:** <element_id>
|
|
279
|
+
**Previous Score:** <score>
|
|
280
|
+
**Output:** Write JSON to <output_dir>/extractions/<element_id>.json
|
|
169
281
|
|
|
170
|
-
|
|
282
|
+
SPECIFIC FIXES REQUIRED:
|
|
283
|
+
<list_specific_fixes_from_revision_feedback>
|
|
171
284
|
|
|
172
|
-
|
|
173
|
-
|
|
285
|
+
Focus on fixing these specific issues while preserving correct sections.
|
|
286
|
+
"""
|
|
287
|
+
)
|
|
288
|
+
```
|
|
174
289
|
|
|
175
|
-
|
|
176
|
-
**Extracted:** [date/time]
|
|
177
|
-
**Elements:** [N] visual elements processed
|
|
290
|
+
After re-extraction, re-verify. Max 2 revision attempts per element.
|
|
178
291
|
|
|
179
|
-
|
|
292
|
+
## Step 7: Generate Markdown Elements
|
|
180
293
|
|
|
181
|
-
|
|
294
|
+
After all verifications pass (or reach human review), convert JSON extractions to markdown:
|
|
182
295
|
|
|
183
|
-
|
|
296
|
+
```python
|
|
297
|
+
import json
|
|
298
|
+
from pathlib import Path
|
|
184
299
|
|
|
185
|
-
|
|
300
|
+
output_dir = Path("<output_dir>")
extractions_dir = output_dir / "extractions"
elements_dir = output_dir / "elements"

# Sorted so elements are converted in a reproducible order.
for extract_file in sorted(extractions_dir.glob("*.json")):
    element_id = extract_file.stem

    # Explicit UTF-8: extractions hold verbatim document text, and the
    # locale default encoding may not be able to represent it.
    with open(extract_file, encoding="utf-8") as f:
        extraction = json.load(f)

    # Convert to markdown based on type
    md_content = json_to_markdown(extraction)

    with open(elements_dir / f"{element_id}.md", "w", encoding="utf-8") as f:
        f.write(md_content)
|
|
315
|
+
```
|
|
196
316
|
|
|
197
|
-
|
|
317
|
+
**Markdown conversion function:**
|
|
198
318
|
|
|
199
|
-
|
|
319
|
+
```python
|
|
320
|
+
def json_to_markdown(extraction: dict) -> str:
    """Convert one JSON extraction to markdown via its type-specific converter.

    The converter is chosen from ``extraction["extraction_type"]``; unknown
    types go to ``generic_to_markdown``. Converters are looked up by name at
    call time, so a type whose converter has not been defined falls back to
    ``generic_to_markdown`` and, failing that, to a plain JSON dump instead
    of raising ``NameError``.

    Args:
        extraction: Parsed extraction JSON for a single element.

    Returns:
        Markdown text for the element.
    """
    import json

    converter_names = {
        "table": "table_to_markdown",
        "chart": "chart_to_markdown",
        "heatmap": "heatmap_to_markdown",
        "diagram": "diagram_to_markdown",
        "multi_panel": "multipanel_to_markdown",
    }

    ext_type = extraction.get("extraction_type")
    # Guard against a non-string (unhashable) type value from malformed JSON.
    type_key = ext_type if isinstance(ext_type, str) else None
    wanted = converter_names.get(type_key, "generic_to_markdown")

    available = globals()
    converter = available.get(wanted) or available.get("generic_to_markdown")
    if converter is not None:
        return converter(extraction)

    # Last resort: keep the data readable rather than losing the element.
    # The fence is built from single backticks so this source contains no
    # literal triple-backtick run that would end the enclosing markdown fence.
    fence = "`" * 3
    payload = json.dumps(extraction, indent=2, ensure_ascii=False)
    return f"# {type_key or 'Element'}\n\n{fence}json\n{payload}\n{fence}"
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def table_to_markdown(ext: dict) -> str:
    """Render a table extraction as markdown.

    Reads ``table_metadata`` (title, source_page, caption, footnotes) and the
    pre-rendered ``markdown_table`` string. Missing or ``null`` fields fall
    back to placeholders instead of printing ``None`` or raising.

    Args:
        ext: Parsed table extraction JSON.

    Returns:
        Markdown with a title, type/source lines, optional caption, the
        table under ``## Data`` and optional ``## Footnotes``.
    """
    # `or {}` covers both a missing key and an explicit JSON null.
    meta = ext.get("table_metadata") or {}

    page = meta.get("source_page")
    if page is None:
        page = "?"

    md = [
        f"# {meta.get('title') or 'Table'}",
        "\n**Type:** Table",
        f"**Source:** Page {page}",
    ]

    caption = meta.get("caption")
    if caption:
        md.append(f"\n> {caption}")

    md.append("\n## Data\n")
    # str.join() rejects None, so a null table becomes an empty string.
    md.append(ext.get("markdown_table") or "")

    footnotes = meta.get("footnotes")
    if footnotes:
        # A lone string would otherwise be iterated character by character.
        if isinstance(footnotes, str):
            footnotes = [footnotes]
        md.append("\n## Footnotes\n")
        md.extend(f"- {fn}" for fn in footnotes)

    return "\n".join(md)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def chart_to_markdown(ext: dict) -> str:
    """Render a chart extraction as markdown.

    Emits a title, chart type and source page, then an ``## Axes`` section
    and, when present, ``## Legend``, ``## Statistical Annotations`` and
    ``## Risk Table`` sections. Missing or ``null`` sections are treated as
    empty, and table cells are stringified so numeric values do not break
    the join.

    Args:
        ext: Parsed chart extraction JSON.

    Returns:
        Markdown text for the chart.
    """
    # `or {}` / `or []` cover both a missing key and an explicit JSON null.
    meta = ext.get("chart_metadata") or {}

    page = meta.get("source_page")
    if page is None:
        page = "?"

    md = [
        f"# {meta.get('title') or 'Chart'}",
        f"\n**Type:** {ext.get('chart_type') or 'Chart'}",
        f"**Source:** Page {page}",
    ]

    axes = ext.get("axes") or {}
    md.append("\n## Axes\n")
    for key, heading in (("x", "X-axis"), ("y", "Y-axis")):
        axis = axes.get(key)
        if not axis:
            continue
        md.append(f"- **{heading}:** {axis.get('label') or 'unlabeled'}")
        # Two-space indent so the range renders as a sub-bullet of the axis.
        md.append(f"  - Range: {axis.get('min')} to {axis.get('max')}")

    entries = (ext.get("legend") or {}).get("entries")
    if entries:
        md.append("\n## Legend\n")
        for entry in entries:
            style = entry.get("line_style") or entry.get("style", "")
            md.append(f"- **{entry.get('label', '')}**: {entry.get('color', '')} {style}")

    stats = ext.get("statistical_annotations") or []
    if stats:
        md.append("\n## Statistical Annotations\n")
        for stat in stats:
            md.append(f"- {stat.get('type', 'stat')}: {stat.get('value', '')}")

    risk = ext.get("risk_table") or {}
    if risk.get("present"):
        md.append("\n## Risk Table\n")
        # Headers and cells may be numbers (time points, at-risk counts);
        # str.join() only accepts strings.
        headers = [str(h) for h in risk.get("headers") or []]
        md.append("| " + " | ".join(headers) + " |")
        md.append("| " + " | ".join(["---"] * len(headers)) + " |")
        for row in risk.get("rows") or []:
            cells = [row.get("group", ""), *(row.get("values") or [])]
            md.append("| " + " | ".join(str(cell) for cell in cells) + " |")

    return "\n".join(md)
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
## Step 8: Generate Combined STRUCTURED.md
|
|
405
|
+
|
|
406
|
+
```python
|
|
407
|
+
from pathlib import Path
|
|
408
|
+
from datetime import datetime
|
|
409
|
+
|
|
410
|
+
output_dir = Path("<output_dir>")
elements_dir = output_dir / "elements"
doc_name = "<document_name>"

# Read all element files in order
element_files = sorted(elements_dir.glob("element_*.md"))

# Document header
sections = [
    f"# {doc_name} - Structured Extraction",
    f"\n**Original:** {doc_name}",
    f"**Extracted:** {datetime.now().isoformat()}",
    f"**Elements:** {len(element_files)} visual elements processed",
    "**Pipeline:** structurecc v2.0 (3-phase with verification)",
    "\n---\n",
]

# Add each element
for i, elem_file in enumerate(element_files, 1):
    # Explicit UTF-8: element files hold verbatim document text, and the
    # locale default encoding may not be able to represent it.
    content = elem_file.read_text(encoding="utf-8")

    sections.append(f"## Element {i}")
    sections.append(content)
    sections.append("\n---\n")

# Write combined file
with open(output_dir / "STRUCTURED.md", "w", encoding="utf-8") as f:
    f.write("\n".join(sections))
|
|
437
|
+
```
|
|
200
438
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
439
|
+
## Step 9: Generate Quality Report
|
|
440
|
+
|
|
441
|
+
```python
|
|
442
|
+
import json
from datetime import datetime  # needed for the report timestamp below
from pathlib import Path

output_dir = Path("<output_dir>")
verifications_dir = output_dir / "verifications"

report = {
    "document": "<document_name>",
    "timestamp": datetime.now().isoformat(),
    "pipeline_version": "2.0.0",
    "elements_total": 0,
    "elements_passed": 0,
    "elements_revised": 0,
    "elements_human_review": 0,
    "average_quality_score": 0.0,
    "element_details": []
}

scores = []
for verify_file in sorted(verifications_dir.glob("*_verify.json")):
    with open(verify_file, encoding="utf-8") as f:
        result = json.load(f)

    report["elements_total"] += 1

    # `or {}` / `or []` cover both a missing key and an explicit JSON null,
    # so one incomplete verification file does not abort the whole report.
    did_pass = bool(result.get("pass"))
    element_scores = result.get("scores") or {}

    detail = {
        "element_id": result.get("element_id") or verify_file.name[: -len("_verify.json")],
        "scores": element_scores,
        "status": "passed" if did_pass else "failed",
        "issues_count": len(result.get("issues") or [])
    }

    if result.get("needs_human_review"):
        detail["status"] = "human_review"
        report["elements_human_review"] += 1
    elif did_pass:
        report["elements_passed"] += 1

    feedback = result.get("revision_feedback") or {}
    if (feedback.get("revision_number") or 0) > 0:
        report["elements_revised"] += 1

    # Only numeric scores contribute to the average.
    overall = element_scores.get("overall")
    if isinstance(overall, (int, float)):
        scores.append(overall)
    report["element_details"].append(detail)

report["average_quality_score"] = sum(scores) / len(scores) if scores else 0.0

# Write report
with open(output_dir / "extraction_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"\nQuality Report:")
print(f" Total elements: {report['elements_total']}")
print(f" Passed: {report['elements_passed']}")
print(f" Revised: {report['elements_revised']}")
print(f" Human review: {report['elements_human_review']}")
print(f" Average score: {report['average_quality_score']:.2f}")
|
|
206
498
|
```
|
|
207
499
|
|
|
208
|
-
## Step
|
|
500
|
+
## Step 10: Display Results
|
|
209
501
|
|
|
210
502
|
```
|
|
211
|
-
|
|
212
|
-
║ EXTRACTION COMPLETE
|
|
213
|
-
|
|
214
|
-
║
|
|
215
|
-
║ Document: [name]
|
|
216
|
-
║ Output: [path]_extracted/
|
|
217
|
-
║
|
|
218
|
-
║
|
|
219
|
-
║
|
|
220
|
-
║
|
|
221
|
-
║
|
|
222
|
-
║
|
|
223
|
-
║
|
|
224
|
-
║
|
|
225
|
-
|
|
503
|
+
╔═══════════════════════════════════════════════════════════════════════════════╗
|
|
504
|
+
║ EXTRACTION COMPLETE ║
|
|
505
|
+
╠═══════════════════════════════════════════════════════════════════════════════╣
|
|
506
|
+
║ ║
|
|
507
|
+
║ Document: [name] ║
|
|
508
|
+
║ Output: [path]_extracted/ ║
|
|
509
|
+
║ Pipeline: structurecc v2.0 (3-phase with verification) ║
|
|
510
|
+
║ ║
|
|
511
|
+
║ QUALITY SUMMARY ║
|
|
512
|
+
║ ────────────────────────────────────────────────────── ║
|
|
513
|
+
║ Total elements: [N] ║
|
|
514
|
+
║ Passed (≥0.90): [N] ✓ ║
|
|
515
|
+
║ Revised: [N] ↻ ║
|
|
516
|
+
║ Human review: [N] ⚠ ║
|
|
517
|
+
║ Average score: [0.XX] ║
|
|
518
|
+
║ ║
|
|
519
|
+
║ FILES ║
|
|
520
|
+
║ ────────────────────────────────────────────────────── ║
|
|
521
|
+
║ images/ [N] extracted images ║
|
|
522
|
+
║ classifications/ [N] type classifications ║
|
|
523
|
+
║ extractions/ [N] JSON extractions ║
|
|
524
|
+
║ verifications/ [N] quality verifications ║
|
|
525
|
+
║ elements/ [N] markdown files ║
|
|
526
|
+
║ STRUCTURED.md Combined output ║
|
|
527
|
+
║ extraction_report.json Quality metrics ║
|
|
528
|
+
║ ║
|
|
529
|
+
╚═══════════════════════════════════════════════════════════════════════════════╝
|
|
226
530
|
```
|
|
227
531
|
|
|
228
532
|
Then open: `open "<output_dir>/STRUCTURED.md"`
|
|
@@ -237,6 +541,24 @@ pip3 install PyMuPDF --quiet
|
|
|
237
541
|
## Tips
|
|
238
542
|
|
|
239
543
|
- Use opus model for best extraction quality on complex visuals
|
|
240
|
-
-
|
|
241
|
-
-
|
|
242
|
-
-
|
|
544
|
+
- Classification uses haiku for speed (it's just triage)
|
|
545
|
+
- Verification uses sonnet for good balance
|
|
546
|
+
- Each phase runs in parallel for speed
|
|
547
|
+
- Max 2 revision attempts before human review
|
|
548
|
+
- Check `extraction_report.json` for quality metrics
|
|
549
|
+
- Individual JSON extractions preserved for programmatic use
|
|
550
|
+
|
|
551
|
+
## Troubleshooting
|
|
552
|
+
|
|
553
|
+
**Low quality scores:**
|
|
554
|
+
- Check source image quality
|
|
555
|
+
- Complex tables may need human review
|
|
556
|
+
- Handwritten text is challenging
|
|
557
|
+
|
|
558
|
+
**Revision loop stuck:**
|
|
559
|
+
- After 2 revisions, element goes to human review
|
|
560
|
+
- Check verifications/ for specific issues
|
|
561
|
+
|
|
562
|
+
**Missing elements:**
|
|
563
|
+
- Some PDFs render text as images - check image count
|
|
564
|
+
- Very small images may be logos/icons (expected)
|