natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
optimization/pdf_analyzer.py
CHANGED
@@ -9,65 +9,69 @@ Usage:
     python pdf_analyzer.py path/to/document.pdf [num_pages] [output_folder]
 """
 
-import sys
 import json
+import sys
 from pathlib import Path
+
 import natural_pdf as npdf
 from natural_pdf.elements.collections import ElementCollection
 
 
-def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True):
+def analyze_pdf(
+    pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True
+):
     """Analyze a PDF using Natural PDF's full capabilities"""
-
+
     pdf_file = Path(pdf_path)
     if not pdf_file.exists():
         print(f"❌ File not found: {pdf_path}")
         return
-
+
     # Create output folder structure
     base_output_dir = Path(output_folder)
     base_output_dir.mkdir(exist_ok=True)
-
+
     # If create_timestamp_folder=True, create a timestamped run folder for batch analysis
     if create_timestamp_folder:
         import datetime
+
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         run_output_dir = base_output_dir / f"run_{timestamp}"
         run_output_dir.mkdir(exist_ok=True)
     else:
         run_output_dir = base_output_dir
-
+
     # Create subfolder for this specific PDF within the run folder
     pdf_output_dir = run_output_dir / pdf_file.stem
     pdf_output_dir.mkdir(exist_ok=True)
-
+
     print(f"🔍 ANALYZING: {pdf_file.name}")
     print(f"📁 Output folder: {pdf_output_dir}")
     print("=" * 80)
-
+
     analysis_data = {
         "pdf_name": pdf_file.name,
         "pdf_path": str(pdf_file),
         "analysis_timestamp": None,
-        "pages": []
+        "pages": [],
     }
-
+
     try:
         # Load PDF
         pdf = npdf.PDF(str(pdf_file))
         total_pages = len(pdf.pages)
         pages_to_analyze = min(num_pages, total_pages)
-
+
         analysis_data["total_pages"] = total_pages
         analysis_data["pages_analyzed"] = pages_to_analyze
-
+
         print(f"📄 Total pages: {total_pages}")
         print(f"🔍 Analyzing first {pages_to_analyze} page(s)")
         print()
-
+
         for page_num in range(pages_to_analyze):
             page = pdf.pages[page_num]
-
+
             page_data = {
                 "page_number": page_num + 1,
                 "dimensions": {"width": page.width, "height": page.height},
@@ -77,30 +81,30 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 "analyze_layout": None,
                 "regions": None,
                 "elements_sample": None,
-                "image_path": None
+                "image_path": None,
             }
-
+
             print(f"📄 PAGE {page_num + 1}")
             print("-" * 60)
-
+
             # Basic page info
             print(f"📐 Dimensions: {page.width:.1f} x {page.height:.1f}")
-
+
             # 1. .describe() - Overview of elements
             print(f"\n🤖 PAGE.DESCRIBE():")
             try:
                 description = page.describe()
                 print(description)
                 page_data["describe"] = str(description)
-
+
                 # Save describe output to file
                 with open(pdf_output_dir / f"page_{page_num + 1}_describe.txt", "w") as f:
                     f.write(str(description))
-
+
             except Exception as e:
                 print(f"❌ describe() failed: {e}")
                 page_data["describe"] = f"ERROR: {e}"
-
+
             # 2. .extract_text() - Raw text extraction
             print(f"\n📝 PAGE.EXTRACT_TEXT():")
             try:
@@ -108,21 +112,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 if text:
                     print(f"Length: {len(text)} characters")
                     # Show first 300 chars
-                    preview = text[:300].replace('\n', '\\n')
+                    preview = text[:300].replace("\n", "\\n")
                     print(f"Preview: {preview}...")
-                    page_data["extract_text"] = {'length': len(text), 'preview': preview, 'full_text': text}
-
+                    page_data["extract_text"] = {
+                        "length": len(text),
+                        "preview": preview,
+                        "full_text": text,
+                    }
+
                     # Save full text to file
                     with open(pdf_output_dir / f"page_{page_num + 1}_text.txt", "w") as f:
                         f.write(text)
-
+
                 else:
                     print("No text extracted")
                     page_data["extract_text"] = {"length": 0, "preview": "", "full_text": ""}
             except Exception as e:
                 print(f"❌ extract_text() failed: {e}")
                 page_data["extract_text"] = f"ERROR: {e}"
-
+
             # 3. .extract_table() - Table extraction (returns List[List[str]])
             print(f"\n📊 PAGE.EXTRACT_TABLE():")
             try:
@@ -134,25 +142,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                     print("Sample data (first 3 rows):")
                     for i, row in enumerate(table_data[:3]):
                         print(f" Row {i+1}: {row}")
-
+
                     page_data["extract_table"] = {
                         "found": True,
                         "rows": rows,
                         "columns": cols,
-                        "data": table_data
+                        "data": table_data,
                     }
-
+
                     # Save table data as JSON
                     with open(pdf_output_dir / f"page_{page_num + 1}_table.json", "w") as f:
                         json.dump(table_data, f, indent=2)
-
+
                 else:
                     print("No table extracted")
                     page_data["extract_table"] = {"found": False}
             except Exception as e:
                 print(f"❌ extract_table() failed: {e}")
                 page_data["extract_table"] = f"ERROR: {e}"
-
+
             # 4. .analyze_layout() - Layout analysis
             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT():")
             try:
@@ -162,17 +170,19 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                     layout_info = []
                     for i, region in enumerate(layout[:5]):  # Show first 5
                         region_info = {
-                            "type": getattr(region, 'type', 'unknown'),
+                            "type": getattr(region, "type", "unknown"),
                             "bbox": [region.x0, region.top, region.x1, region.bottom],
-                            "confidence": getattr(region, 'confidence', 0),
+                            "confidence": getattr(region, "confidence", 0),
                         }
                         layout_info.append(region_info)
-                        print(
-                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+                        print(
+                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
+                        )
+
                     page_data["analyze_layout"] = {
                         "found": True,
                         "count": len(layout),
-                        "regions": layout_info
+                        "regions": layout_info,
                     }
                 else:
                     print("No layout regions found")
@@ -180,38 +190,40 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
             except Exception as e:
                 print(f"❌ analyze_layout() failed: {e}")
                 page_data["analyze_layout"] = f"ERROR: {e}"
-
+
             # 4b. .analyze_layout('tatr') - Table structure analysis (append to preserve YOLO results)
             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT('TATR') - Table Structure:")
             try:
-                tatr_layout = page.analyze_layout('tatr', existing='append')
+                tatr_layout = page.analyze_layout("tatr", existing="append")
                 if tatr_layout and len(tatr_layout) > 0:
                     print(f"TATR layout regions found: {len(tatr_layout)}")
                     tatr_info = []
                     for i, region in enumerate(tatr_layout[:5]):  # Show first 5
                         region_info = {
-                            "type": getattr(region, 'type', 'unknown'),
+                            "type": getattr(region, "type", "unknown"),
                             "bbox": [region.x0, region.top, region.x1, region.bottom],
-                            "confidence": getattr(region, 'confidence', 0),
+                            "confidence": getattr(region, "confidence", 0),
                         }
                         tatr_info.append(region_info)
-                        print(
-                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+                        print(
+                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
+                        )
+
                     page_data["analyze_layout_tatr"] = {
                         "found": True,
                         "count": len(tatr_layout),
-                        "regions": tatr_info
+                        "regions": tatr_info,
                     }
-
+
                     # Save TATR layout analysis to file
                     tatr_summary = f"TATR Layout Analysis\n{'='*50}\n"
                     tatr_summary += f"Found {len(tatr_layout)} regions:\n\n"
                     for i, region_info in enumerate(tatr_info):
                         tatr_summary += f"{i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})\n"
-
+
                     with open(pdf_output_dir / f"page_{page_num + 1}_tatr_layout.txt", "w") as f:
                         f.write(tatr_summary)
-
+
                     # Try to get detailed table structure
                     try:
                         table_structure = page.find_table_structure()
@@ -221,11 +233,14 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                             page_data["table_structure"] = {
                                 "found": True,
                                 "count": len(table_structure),
-                                "details": table_details[:1000] + ("..." if len(table_details) > 1000 else "")
+                                "details": table_details[:1000]
+                                + ("..." if len(table_details) > 1000 else ""),
                             }
-
+
                             # Save table structure to file
-                            with open(pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w") as f:
+                            with open(
+                                pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w"
+                            ) as f:
                                 f.write(table_details)
                         else:
                             page_data["table_structure"] = {"found": False}
@@ -240,171 +255,201 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 print(f"❌ analyze_layout('tatr') failed: {e}")
                 page_data["analyze_layout_tatr"] = f"ERROR: {e}"
                 page_data["table_structure"] = f"ERROR: {e}"
-
+
             # 5. Find regions by model and save separate + combined files
             print(f"\n📍 REGION ANALYSIS - By Model:")
             try:
-                all_regions = page.find_all('region')
+                all_regions = page.find_all("region")
                 if all_regions and len(all_regions) > 0:
                     print(f"Total regions found: {len(all_regions)}")
-
+
                     # Group regions by model/source
-                    yolo_regions = [r for r in all_regions if getattr(r, 'model', '') == '' or getattr(r, 'model', '') == 'yolo']
-                    tatr_regions = [r for r in all_regions if getattr(r, 'model', '') == 'tatr']
-                    other_regions = [r for r in all_regions if getattr(r, 'model', '') not in ['', 'yolo', 'tatr']]
-
+                    yolo_regions = [
+                        r
+                        for r in all_regions
+                        if getattr(r, "model", "") == "" or getattr(r, "model", "") == "yolo"
+                    ]
+                    tatr_regions = [r for r in all_regions if getattr(r, "model", "") == "tatr"]
+                    other_regions = [
+                        r
+                        for r in all_regions
+                        if getattr(r, "model", "") not in ["", "yolo", "tatr"]
+                    ]
+
                     print(f" YOLO regions: {len(yolo_regions)}")
                     print(f" TATR regions: {len(tatr_regions)}")
                     print(f" Other regions: {len(other_regions)}")
-
+
                     # Save separate files for each model
                     if yolo_regions:
                         yolo_inspect = str(ElementCollection(yolo_regions).inspect(limit=1000))
-                        with open(pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w") as f:
-                            f.write(f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}")
-
+                        with open(
+                            pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w"
+                        ) as f:
+                            f.write(
+                                f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}"
+                            )
+
                     if tatr_regions:
                         tatr_inspect = str(ElementCollection(tatr_regions).inspect(limit=1000))
-                        with open(pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w") as f:
-                            f.write(f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}")
-
-
+                        with open(
+                            pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w"
+                        ) as f:
+                            f.write(
+                                f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}"
+                            )
+
+                    # Combined regions inspect
                     all_inspect = str(all_regions.inspect(limit=1000))
                     print(f"Combined regions preview (first 500 chars):\n{all_inspect[:500]}...")
-
+
                     # Save combined regions file
                     with open(pdf_output_dir / f"page_{page_num + 1}_all_regions.txt", "w") as f:
                         f.write(f"All Layout Regions ({len(all_regions)} found)\n{'='*50}\n")
-                        f.write(f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n")
+                        f.write(
+                            f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n"
+                        )
                         f.write(all_inspect)
-
+
                     page_data["regions"] = {
                         "found": True,
                         "total_count": len(all_regions),
                         "yolo_count": len(yolo_regions),
                         "tatr_count": len(tatr_regions),
                         "other_count": len(other_regions),
-                        "inspect_preview": all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
+                        "inspect_preview": (
+                            all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
+                        ),
                     }
-
+
                 else:
                     print("No regions found")
                     page_data["regions"] = {"found": False}
             except Exception as e:
                 print(f"❌ region analysis failed: {e}")
                 page_data["regions"] = f"ERROR: {e}"
-
+
             # 6. General element inspection
             print(f"\n🔍 GENERAL ELEMENT INSPECTION:")
             try:
                 # Count different element types
-                all_elements = page.find_all('*')
+                all_elements = page.find_all("*")
                 if all_elements and len(all_elements) > 0:
                     print(f"Total elements: {len(all_elements)}")
-
+
                     # Full inspect output - shows complete breakdown
                     print(f"\nFull element breakdown (.inspect()):")
                     # Get string representation of inspect result (increased limit)
                     inspect_result = all_elements.inspect(limit=1000)
                     inspect_text = str(inspect_result)
                     print(inspect_text)
-
+
                     # Sample some elements for detailed inspection
                     sample_elements = all_elements[:10]  # First 10 elements
                     print(f"Sample of first 10 elements:")
                     elements_sample = []
                     for i, elem in enumerate(sample_elements):
-                        elem_type = getattr(elem, 'object_type', 'unknown')
-                        text_preview = getattr(elem, 'text', '')[:30] if hasattr(elem, 'text') else ''
+                        elem_type = getattr(elem, "object_type", "unknown")
+                        text_preview = (
+                            getattr(elem, "text", "")[:30] if hasattr(elem, "text") else ""
+                        )
                         elem_info = {
                             "type": elem_type,
                             "text": text_preview,
                             "x0": elem.x0,
-                            "top": elem.top
+                            "top": elem.top,
                         }
                         elements_sample.append(elem_info)
-                        print(
-                            f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})")
+                        print(
+                            f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})"
+                        )
+
                     page_data["elements_sample"] = {
                         "total_count": len(all_elements),
                         "full_inspect": inspect_text,
-                        "sample": elements_sample
+                        "sample": elements_sample,
                     }
-
+
                     # Save full inspect to file
-                    with open(pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w") as f:
+                    with open(
+                        pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w"
+                    ) as f:
                         f.write(inspect_text)
-
+
                 else:
                     print("No elements found")
                     page_data["elements_sample"] = {"total_count": 0, "sample": []}
             except Exception as e:
                 print(f"❌ element inspection failed: {e}")
                 page_data["elements_sample"] = f"ERROR: {e}"
-
+
             # 7. Render page as image
             print(f"\n🖼️ RENDERING PAGE AS IMAGE:")
             try:
                 img = page.to_image(resolution=144)
                 print(f"Image: {img.width}x{img.height} pixels")
-
+
                 # Save image in output folder
                 img_filename = f"page_{page_num + 1}.png"
                 img_path = pdf_output_dir / img_filename
                 img.save(str(img_path))
                 print(f"Saved: {img_path}")
                 page_data["image_path"] = str(img_path)
-
+
             except Exception as e:
                 print(f"❌ image rendering failed: {e}")
                 page_data["image_path"] = f"ERROR: {e}"
-
+
             analysis_data["pages"].append(page_data)
-
+
             if page_num < pages_to_analyze - 1:
                 print("\n" + "=" * 80 + "\n")
-
+
         # Save complete analysis data as JSON
         import datetime
+
         analysis_data["analysis_timestamp"] = datetime.datetime.now().isoformat()
-
+
         summary_file = pdf_output_dir / "analysis_summary.json"
         with open(summary_file, "w") as f:
             json.dump(analysis_data, f, indent=2)
-
+
         print(f"\n✅ ANALYSIS COMPLETE")
         print(f"📊 Summary: Analyzed {pages_to_analyze} page(s) of {pdf_file.name}")
         print(f"📁 All results saved to: {pdf_output_dir}")
         print(f"📋 Summary JSON: {summary_file}")
-
+
     except Exception as e:
         print(f"❌ CRITICAL ERROR: {e}")
         import traceback
+
         traceback.print_exc()
 
 
 def main():
     """Main function"""
     if len(sys.argv) < 2:
-        print("Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]")
+        print(
+            "Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]"
+        )
         print("Example: python pdf_analyzer.py bad-pdfs/submissions/Focus.pdf 2 analysis_results")
         print(" python pdf_analyzer.py Focus.pdf 1 my_analysis --no-timestamp")
         sys.exit(1)
-
+
     pdf_path = sys.argv[1]
     num_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 1
     output_folder = "analysis_results"
     create_timestamp_folder = True
-
+
     # Parse remaining arguments
     for arg in sys.argv[3:]:
         if arg == "--no-timestamp":
             create_timestamp_folder = False
         elif not arg.startswith("--"):
             output_folder = arg
-
+
     analyze_pdf(pdf_path, num_pages, output_folder, create_timestamp_folder)
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()