natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -9,65 +9,69 @@ Usage:
9
9
  python pdf_analyzer.py path/to/document.pdf [num_pages] [output_folder]
10
10
  """
11
11
 
12
- import sys
13
12
  import json
13
+ import sys
14
14
  from pathlib import Path
15
+
15
16
  import natural_pdf as npdf
16
17
  from natural_pdf.elements.collections import ElementCollection
17
18
 
18
19
 
19
- def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True):
20
+ def analyze_pdf(
21
+ pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True
22
+ ):
20
23
  """Analyze a PDF using Natural PDF's full capabilities"""
21
-
24
+
22
25
  pdf_file = Path(pdf_path)
23
26
  if not pdf_file.exists():
24
27
  print(f"❌ File not found: {pdf_path}")
25
28
  return
26
-
29
+
27
30
  # Create output folder structure
28
31
  base_output_dir = Path(output_folder)
29
32
  base_output_dir.mkdir(exist_ok=True)
30
-
33
+
31
34
  # If create_timestamp_folder=True, create a timestamped run folder for batch analysis
32
35
  if create_timestamp_folder:
33
36
  import datetime
37
+
34
38
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
35
39
  run_output_dir = base_output_dir / f"run_{timestamp}"
36
40
  run_output_dir.mkdir(exist_ok=True)
37
41
  else:
38
42
  run_output_dir = base_output_dir
39
-
43
+
40
44
  # Create subfolder for this specific PDF within the run folder
41
45
  pdf_output_dir = run_output_dir / pdf_file.stem
42
46
  pdf_output_dir.mkdir(exist_ok=True)
43
-
47
+
44
48
  print(f"🔍 ANALYZING: {pdf_file.name}")
45
49
  print(f"📁 Output folder: {pdf_output_dir}")
46
50
  print("=" * 80)
47
-
51
+
48
52
  analysis_data = {
49
53
  "pdf_name": pdf_file.name,
50
54
  "pdf_path": str(pdf_file),
51
55
  "analysis_timestamp": None,
52
- "pages": []
56
+ "pages": [],
53
57
  }
54
-
58
+
55
59
  try:
56
60
  # Load PDF
57
61
  pdf = npdf.PDF(str(pdf_file))
58
62
  total_pages = len(pdf.pages)
59
63
  pages_to_analyze = min(num_pages, total_pages)
60
-
64
+
61
65
  analysis_data["total_pages"] = total_pages
62
66
  analysis_data["pages_analyzed"] = pages_to_analyze
63
-
67
+
64
68
  print(f"📄 Total pages: {total_pages}")
65
69
  print(f"🔍 Analyzing first {pages_to_analyze} page(s)")
66
70
  print()
67
-
71
+
68
72
  for page_num in range(pages_to_analyze):
69
73
  page = pdf.pages[page_num]
70
-
74
+
71
75
  page_data = {
72
76
  "page_number": page_num + 1,
73
77
  "dimensions": {"width": page.width, "height": page.height},
@@ -77,30 +81,30 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
77
81
  "analyze_layout": None,
78
82
  "regions": None,
79
83
  "elements_sample": None,
80
- "image_path": None
84
+ "image_path": None,
81
85
  }
82
-
86
+
83
87
  print(f"📄 PAGE {page_num + 1}")
84
88
  print("-" * 60)
85
-
89
+
86
90
  # Basic page info
87
91
  print(f"📐 Dimensions: {page.width:.1f} x {page.height:.1f}")
88
-
92
+
89
93
  # 1. .describe() - Overview of elements
90
94
  print(f"\n🤖 PAGE.DESCRIBE():")
91
95
  try:
92
96
  description = page.describe()
93
97
  print(description)
94
98
  page_data["describe"] = str(description)
95
-
99
+
96
100
  # Save describe output to file
97
101
  with open(pdf_output_dir / f"page_{page_num + 1}_describe.txt", "w") as f:
98
102
  f.write(str(description))
99
-
103
+
100
104
  except Exception as e:
101
105
  print(f"❌ describe() failed: {e}")
102
106
  page_data["describe"] = f"ERROR: {e}"
103
-
107
+
104
108
  # 2. .extract_text() - Raw text extraction
105
109
  print(f"\n📝 PAGE.EXTRACT_TEXT():")
106
110
  try:
@@ -108,21 +112,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
108
112
  if text:
109
113
  print(f"Length: {len(text)} characters")
110
114
  # Show first 300 chars
111
- preview = text[:300].replace('\n', '\\n')
115
+ preview = text[:300].replace("\n", "\\n")
112
116
  print(f"Preview: {preview}...")
113
- page_data["extract_text"] = {"length": len(text), "preview": preview, "full_text": text}
114
-
117
+ page_data["extract_text"] = {
118
+ "length": len(text),
119
+ "preview": preview,
120
+ "full_text": text,
121
+ }
122
+
115
123
  # Save full text to file
116
124
  with open(pdf_output_dir / f"page_{page_num + 1}_text.txt", "w") as f:
117
125
  f.write(text)
118
-
126
+
119
127
  else:
120
128
  print("No text extracted")
121
129
  page_data["extract_text"] = {"length": 0, "preview": "", "full_text": ""}
122
130
  except Exception as e:
123
131
  print(f"❌ extract_text() failed: {e}")
124
132
  page_data["extract_text"] = f"ERROR: {e}"
125
-
133
+
126
134
  # 3. .extract_table() - Table extraction (returns List[List[str]])
127
135
  print(f"\n📊 PAGE.EXTRACT_TABLE():")
128
136
  try:
@@ -134,25 +142,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
134
142
  print("Sample data (first 3 rows):")
135
143
  for i, row in enumerate(table_data[:3]):
136
144
  print(f" Row {i+1}: {row}")
137
-
145
+
138
146
  page_data["extract_table"] = {
139
147
  "found": True,
140
148
  "rows": rows,
141
149
  "columns": cols,
142
- "data": table_data
150
+ "data": table_data,
143
151
  }
144
-
152
+
145
153
  # Save table data as JSON
146
154
  with open(pdf_output_dir / f"page_{page_num + 1}_table.json", "w") as f:
147
155
  json.dump(table_data, f, indent=2)
148
-
156
+
149
157
  else:
150
158
  print("No table extracted")
151
159
  page_data["extract_table"] = {"found": False}
152
160
  except Exception as e:
153
161
  print(f"❌ extract_table() failed: {e}")
154
162
  page_data["extract_table"] = f"ERROR: {e}"
155
-
163
+
156
164
  # 4. .analyze_layout() - Layout analysis
157
165
  print(f"\n🏗️ PAGE.ANALYZE_LAYOUT():")
158
166
  try:
@@ -162,17 +170,19 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
162
170
  layout_info = []
163
171
  for i, region in enumerate(layout[:5]): # Show first 5
164
172
  region_info = {
165
- "type": getattr(region, 'type', 'unknown'),
173
+ "type": getattr(region, "type", "unknown"),
166
174
  "bbox": [region.x0, region.top, region.x1, region.bottom],
167
- "confidence": getattr(region, 'confidence', 0)
175
+ "confidence": getattr(region, "confidence", 0),
168
176
  }
169
177
  layout_info.append(region_info)
170
- print(f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
171
-
178
+ print(
179
+ f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
180
+ )
181
+
172
182
  page_data["analyze_layout"] = {
173
183
  "found": True,
174
184
  "count": len(layout),
175
- "regions": layout_info
185
+ "regions": layout_info,
176
186
  }
177
187
  else:
178
188
  print("No layout regions found")
@@ -180,38 +190,40 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
180
190
  except Exception as e:
181
191
  print(f"❌ analyze_layout() failed: {e}")
182
192
  page_data["analyze_layout"] = f"ERROR: {e}"
183
-
193
+
184
194
  # 4b. .analyze_layout('tatr') - Table structure analysis (append to preserve YOLO results)
185
195
  print(f"\n🏗️ PAGE.ANALYZE_LAYOUT('TATR') - Table Structure:")
186
196
  try:
187
- tatr_layout = page.analyze_layout('tatr', existing="append")
197
+ tatr_layout = page.analyze_layout("tatr", existing="append")
188
198
  if tatr_layout and len(tatr_layout) > 0:
189
199
  print(f"TATR layout regions found: {len(tatr_layout)}")
190
200
  tatr_info = []
191
201
  for i, region in enumerate(tatr_layout[:5]): # Show first 5
192
202
  region_info = {
193
- "type": getattr(region, 'type', 'unknown'),
203
+ "type": getattr(region, "type", "unknown"),
194
204
  "bbox": [region.x0, region.top, region.x1, region.bottom],
195
- "confidence": getattr(region, 'confidence', 0)
205
+ "confidence": getattr(region, "confidence", 0),
196
206
  }
197
207
  tatr_info.append(region_info)
198
- print(f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
199
-
208
+ print(
209
+ f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
210
+ )
211
+
200
212
  page_data["analyze_layout_tatr"] = {
201
213
  "found": True,
202
214
  "count": len(tatr_layout),
203
- "regions": tatr_info
215
+ "regions": tatr_info,
204
216
  }
205
-
217
+
206
218
  # Save TATR layout analysis to file
207
219
  tatr_summary = f"TATR Layout Analysis\n{'='*50}\n"
208
220
  tatr_summary += f"Found {len(tatr_layout)} regions:\n\n"
209
221
  for i, region_info in enumerate(tatr_info):
210
222
  tatr_summary += f"{i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})\n"
211
-
223
+
212
224
  with open(pdf_output_dir / f"page_{page_num + 1}_tatr_layout.txt", "w") as f:
213
225
  f.write(tatr_summary)
214
-
226
+
215
227
  # Try to get detailed table structure
216
228
  try:
217
229
  table_structure = page.find_table_structure()
@@ -221,11 +233,14 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
221
233
  page_data["table_structure"] = {
222
234
  "found": True,
223
235
  "count": len(table_structure),
224
- "details": table_details[:1000] + ("..." if len(table_details) > 1000 else "")
236
+ "details": table_details[:1000]
237
+ + ("..." if len(table_details) > 1000 else ""),
225
238
  }
226
-
239
+
227
240
  # Save table structure to file
228
- with open(pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w") as f:
241
+ with open(
242
+ pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w"
243
+ ) as f:
229
244
  f.write(table_details)
230
245
  else:
231
246
  page_data["table_structure"] = {"found": False}
@@ -240,171 +255,201 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
240
255
  print(f"❌ analyze_layout('tatr') failed: {e}")
241
256
  page_data["analyze_layout_tatr"] = f"ERROR: {e}"
242
257
  page_data["table_structure"] = f"ERROR: {e}"
243
-
258
+
244
259
  # 5. Find regions by model and save separate + combined files
245
260
  print(f"\n📍 REGION ANALYSIS - By Model:")
246
261
  try:
247
- all_regions = page.find_all('region')
262
+ all_regions = page.find_all("region")
248
263
  if all_regions and len(all_regions) > 0:
249
264
  print(f"Total regions found: {len(all_regions)}")
250
-
265
+
251
266
  # Group regions by model/source
252
- yolo_regions = [r for r in all_regions if getattr(r, 'model', '') == '' or getattr(r, 'model', '') == 'yolo']
253
- tatr_regions = [r for r in all_regions if getattr(r, 'model', '') == 'tatr']
254
- other_regions = [r for r in all_regions if getattr(r, 'model', '') not in ['', 'yolo', 'tatr']]
255
-
267
+ yolo_regions = [
268
+ r
269
+ for r in all_regions
270
+ if getattr(r, "model", "") == "" or getattr(r, "model", "") == "yolo"
271
+ ]
272
+ tatr_regions = [r for r in all_regions if getattr(r, "model", "") == "tatr"]
273
+ other_regions = [
274
+ r
275
+ for r in all_regions
276
+ if getattr(r, "model", "") not in ["", "yolo", "tatr"]
277
+ ]
278
+
256
279
  print(f" YOLO regions: {len(yolo_regions)}")
257
280
  print(f" TATR regions: {len(tatr_regions)}")
258
281
  print(f" Other regions: {len(other_regions)}")
259
-
282
+
260
283
  # Save separate files for each model
261
284
  if yolo_regions:
262
285
  yolo_inspect = str(ElementCollection(yolo_regions).inspect(limit=1000))
263
- with open(pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w") as f:
264
- f.write(f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}")
265
-
286
+ with open(
287
+ pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w"
288
+ ) as f:
289
+ f.write(
290
+ f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}"
291
+ )
292
+
266
293
  if tatr_regions:
267
294
  tatr_inspect = str(ElementCollection(tatr_regions).inspect(limit=1000))
268
- with open(pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w") as f:
269
- f.write(f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}")
270
-
271
- # Combined regions inspect
295
+ with open(
296
+ pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w"
297
+ ) as f:
298
+ f.write(
299
+ f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}"
300
+ )
301
+
302
+ # Combined regions inspect
272
303
  all_inspect = str(all_regions.inspect(limit=1000))
273
304
  print(f"Combined regions preview (first 500 chars):\n{all_inspect[:500]}...")
274
-
305
+
275
306
  # Save combined regions file
276
307
  with open(pdf_output_dir / f"page_{page_num + 1}_all_regions.txt", "w") as f:
277
308
  f.write(f"All Layout Regions ({len(all_regions)} found)\n{'='*50}\n")
278
- f.write(f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n")
309
+ f.write(
310
+ f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n"
311
+ )
279
312
  f.write(all_inspect)
280
-
313
+
281
314
  page_data["regions"] = {
282
315
  "found": True,
283
316
  "total_count": len(all_regions),
284
317
  "yolo_count": len(yolo_regions),
285
318
  "tatr_count": len(tatr_regions),
286
319
  "other_count": len(other_regions),
287
- "inspect_preview": all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
320
+ "inspect_preview": (
321
+ all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
322
+ ),
288
323
  }
289
-
324
+
290
325
  else:
291
326
  print("No regions found")
292
327
  page_data["regions"] = {"found": False}
293
328
  except Exception as e:
294
329
  print(f"❌ region analysis failed: {e}")
295
330
  page_data["regions"] = f"ERROR: {e}"
296
-
331
+
297
332
  # 6. General element inspection
298
333
  print(f"\n🔍 GENERAL ELEMENT INSPECTION:")
299
334
  try:
300
335
  # Count different element types
301
- all_elements = page.find_all('*')
336
+ all_elements = page.find_all("*")
302
337
  if all_elements and len(all_elements) > 0:
303
338
  print(f"Total elements: {len(all_elements)}")
304
-
339
+
305
340
  # Full inspect output - shows complete breakdown
306
341
  print(f"\nFull element breakdown (.inspect()):")
307
342
  # Get string representation of inspect result (increased limit)
308
343
  inspect_result = all_elements.inspect(limit=1000)
309
344
  inspect_text = str(inspect_result)
310
345
  print(inspect_text)
311
-
346
+
312
347
  # Sample some elements for detailed inspection
313
348
  sample_elements = all_elements[:10] # First 10 elements
314
349
  print(f"Sample of first 10 elements:")
315
350
  elements_sample = []
316
351
  for i, elem in enumerate(sample_elements):
317
- elem_type = getattr(elem, 'object_type', 'unknown')
318
- text_preview = getattr(elem, 'text', '')[:30] if hasattr(elem, 'text') else ''
352
+ elem_type = getattr(elem, "object_type", "unknown")
353
+ text_preview = (
354
+ getattr(elem, "text", "")[:30] if hasattr(elem, "text") else ""
355
+ )
319
356
  elem_info = {
320
357
  "type": elem_type,
321
358
  "text": text_preview,
322
359
  "x0": elem.x0,
323
- "top": elem.top
360
+ "top": elem.top,
324
361
  }
325
362
  elements_sample.append(elem_info)
326
- print(f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})")
327
-
363
+ print(
364
+ f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})"
365
+ )
366
+
328
367
  page_data["elements_sample"] = {
329
368
  "total_count": len(all_elements),
330
369
  "full_inspect": inspect_text,
331
- "sample": elements_sample
370
+ "sample": elements_sample,
332
371
  }
333
-
372
+
334
373
  # Save full inspect to file
335
- with open(pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w") as f:
374
+ with open(
375
+ pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w"
376
+ ) as f:
336
377
  f.write(inspect_text)
337
-
378
+
338
379
  else:
339
380
  print("No elements found")
340
381
  page_data["elements_sample"] = {"total_count": 0, "sample": []}
341
382
  except Exception as e:
342
383
  print(f"❌ element inspection failed: {e}")
343
384
  page_data["elements_sample"] = f"ERROR: {e}"
344
-
385
+
345
386
  # 7. Render page as image
346
387
  print(f"\n🖼️ RENDERING PAGE AS IMAGE:")
347
388
  try:
348
389
  img = page.to_image(resolution=144)
349
390
  print(f"Image: {img.width}x{img.height} pixels")
350
-
391
+
351
392
  # Save image in output folder
352
393
  img_filename = f"page_{page_num + 1}.png"
353
394
  img_path = pdf_output_dir / img_filename
354
395
  img.save(str(img_path))
355
396
  print(f"Saved: {img_path}")
356
397
  page_data["image_path"] = str(img_path)
357
-
398
+
358
399
  except Exception as e:
359
400
  print(f"❌ image rendering failed: {e}")
360
401
  page_data["image_path"] = f"ERROR: {e}"
361
-
402
+
362
403
  analysis_data["pages"].append(page_data)
363
-
404
+
364
405
  if page_num < pages_to_analyze - 1:
365
406
  print("\n" + "=" * 80 + "\n")
366
-
407
+
367
408
  # Save complete analysis data as JSON
368
409
  import datetime
410
+
369
411
  analysis_data["analysis_timestamp"] = datetime.datetime.now().isoformat()
370
-
412
+
371
413
  summary_file = pdf_output_dir / "analysis_summary.json"
372
414
  with open(summary_file, "w") as f:
373
415
  json.dump(analysis_data, f, indent=2)
374
-
416
+
375
417
  print(f"\n✅ ANALYSIS COMPLETE")
376
418
  print(f"📊 Summary: Analyzed {pages_to_analyze} page(s) of {pdf_file.name}")
377
419
  print(f"📁 All results saved to: {pdf_output_dir}")
378
420
  print(f"📋 Summary JSON: {summary_file}")
379
-
421
+
380
422
  except Exception as e:
381
423
  print(f"❌ CRITICAL ERROR: {e}")
382
424
  import traceback
425
+
383
426
  traceback.print_exc()
384
427
 
385
428
 
386
429
  def main():
387
430
  """Main function"""
388
431
  if len(sys.argv) < 2:
389
- print("Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]")
432
+ print(
433
+ "Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]"
434
+ )
390
435
  print("Example: python pdf_analyzer.py bad-pdfs/submissions/Focus.pdf 2 analysis_results")
391
436
  print(" python pdf_analyzer.py Focus.pdf 1 my_analysis --no-timestamp")
392
437
  sys.exit(1)
393
-
438
+
394
439
  pdf_path = sys.argv[1]
395
440
  num_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 1
396
441
  output_folder = "analysis_results"
397
442
  create_timestamp_folder = True
398
-
443
+
399
444
  # Parse remaining arguments
400
445
  for arg in sys.argv[3:]:
401
446
  if arg == "--no-timestamp":
402
447
  create_timestamp_folder = False
403
448
  elif not arg.startswith("--"):
404
449
  output_folder = arg
405
-
450
+
406
451
  analyze_pdf(pdf_path, num_pages, output_folder, create_timestamp_folder)
407
452
 
408
453
 
409
454
  if __name__ == "__main__":
410
- main()
455
+ main()