natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
optimization/pdf_analyzer.py (new file)
@@ -0,0 +1,410 @@
+ #!/usr/bin/env python3
+ """
+ PDF Analysis Tool
+
+ Analyzes a PDF using Natural PDF's capabilities to understand what it can actually extract.
+ This shows what Natural PDF sees vs. what users are struggling with.
+
+ Usage:
+     python pdf_analyzer.py path/to/document.pdf [num_pages] [output_folder]
+ """
+
+ import sys
+ import json
+ from pathlib import Path
+ import natural_pdf as npdf
+ from natural_pdf.elements.collections import ElementCollection
+
+
+ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True):
+     """Analyze a PDF using Natural PDF's full capabilities"""
+
+     pdf_file = Path(pdf_path)
+     if not pdf_file.exists():
+         print(f"❌ File not found: {pdf_path}")
+         return
+
+     # Create output folder structure
+     base_output_dir = Path(output_folder)
+     base_output_dir.mkdir(exist_ok=True)
+
+     # If create_timestamp_folder=True, create a timestamped run folder for batch analysis
+     if create_timestamp_folder:
+         import datetime
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         run_output_dir = base_output_dir / f"run_{timestamp}"
+         run_output_dir.mkdir(exist_ok=True)
+     else:
+         run_output_dir = base_output_dir
+
+     # Create subfolder for this specific PDF within the run folder
+     pdf_output_dir = run_output_dir / pdf_file.stem
+     pdf_output_dir.mkdir(exist_ok=True)
+
+     print(f"🔍 ANALYZING: {pdf_file.name}")
+     print(f"📁 Output folder: {pdf_output_dir}")
+     print("=" * 80)
+
+     analysis_data = {
+         "pdf_name": pdf_file.name,
+         "pdf_path": str(pdf_file),
+         "analysis_timestamp": None,
+         "pages": []
+     }
+
+     try:
+         # Load PDF
+         pdf = npdf.PDF(str(pdf_file))
+         total_pages = len(pdf.pages)
+         pages_to_analyze = min(num_pages, total_pages)
+
+         analysis_data["total_pages"] = total_pages
+         analysis_data["pages_analyzed"] = pages_to_analyze
+
+         print(f"📄 Total pages: {total_pages}")
+         print(f"🔍 Analyzing first {pages_to_analyze} page(s)")
+         print()
+
+         for page_num in range(pages_to_analyze):
+             page = pdf.pages[page_num]
+
+             page_data = {
+                 "page_number": page_num + 1,
+                 "dimensions": {"width": page.width, "height": page.height},
+                 "describe": None,
+                 "extract_text": None,
+                 "extract_table": None,
+                 "analyze_layout": None,
+                 "regions": None,
+                 "elements_sample": None,
+                 "image_path": None
+             }
+
+             print(f"📄 PAGE {page_num + 1}")
+             print("-" * 60)
+
+             # Basic page info
+             print(f"📐 Dimensions: {page.width:.1f} x {page.height:.1f}")
+
+             # 1. .describe() - Overview of elements
+             print(f"\n🤖 PAGE.DESCRIBE():")
+             try:
+                 description = page.describe()
+                 print(description)
+                 page_data["describe"] = str(description)
+
+                 # Save describe output to file
+                 with open(pdf_output_dir / f"page_{page_num + 1}_describe.txt", "w") as f:
+                     f.write(str(description))
+
+             except Exception as e:
+                 print(f"❌ describe() failed: {e}")
+                 page_data["describe"] = f"ERROR: {e}"
+
+             # 2. .extract_text() - Raw text extraction
+             print(f"\n📝 PAGE.EXTRACT_TEXT():")
+             try:
+                 text = page.extract_text()
+                 if text:
+                     print(f"Length: {len(text)} characters")
+                     # Show first 300 chars
+                     preview = text[:300].replace('\n', '\\n')
+                     print(f"Preview: {preview}...")
+                     page_data["extract_text"] = {"length": len(text), "preview": preview, "full_text": text}
+
+                     # Save full text to file
+                     with open(pdf_output_dir / f"page_{page_num + 1}_text.txt", "w") as f:
+                         f.write(text)
+
+                 else:
+                     print("No text extracted")
+                     page_data["extract_text"] = {"length": 0, "preview": "", "full_text": ""}
+             except Exception as e:
+                 print(f"❌ extract_text() failed: {e}")
+                 page_data["extract_text"] = f"ERROR: {e}"
+
+             # 3. .extract_table() - Table extraction (returns List[List[str]])
+             print(f"\n📊 PAGE.EXTRACT_TABLE():")
+             try:
+                 table_data = page.extract_table()  # This returns List[List[Optional[str]]]
+                 if table_data and len(table_data) > 0:
+                     rows = len(table_data)
+                     cols = len(table_data[0]) if table_data[0] else 0
+                     print(f"Table found: {rows} rows x {cols} columns")
+                     print("Sample data (first 3 rows):")
+                     for i, row in enumerate(table_data[:3]):
+                         print(f"  Row {i+1}: {row}")
+
+                     page_data["extract_table"] = {
+                         "found": True,
+                         "rows": rows,
+                         "columns": cols,
+                         "data": table_data
+                     }
+
+                     # Save table data as JSON
+                     with open(pdf_output_dir / f"page_{page_num + 1}_table.json", "w") as f:
+                         json.dump(table_data, f, indent=2)
+
+                 else:
+                     print("No table extracted")
+                     page_data["extract_table"] = {"found": False}
+             except Exception as e:
+                 print(f"❌ extract_table() failed: {e}")
+                 page_data["extract_table"] = f"ERROR: {e}"
+
+             # 4. .analyze_layout() - Layout analysis
+             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT():")
+             try:
+                 layout = page.analyze_layout()
+                 if layout and len(layout) > 0:
+                     print(f"Layout regions found: {len(layout)}")
+                     layout_info = []
+                     for i, region in enumerate(layout[:5]):  # Show first 5
+                         region_info = {
+                             "type": getattr(region, 'type', 'unknown'),
+                             "bbox": [region.x0, region.top, region.x1, region.bottom],
+                             "confidence": getattr(region, 'confidence', 0)
+                         }
+                         layout_info.append(region_info)
+                         print(f"  {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+
+                     page_data["analyze_layout"] = {
+                         "found": True,
+                         "count": len(layout),
+                         "regions": layout_info
+                     }
+                 else:
+                     print("No layout regions found")
+                     page_data["analyze_layout"] = {"found": False}
+             except Exception as e:
+                 print(f"❌ analyze_layout() failed: {e}")
+                 page_data["analyze_layout"] = f"ERROR: {e}"
+
+             # 4b. .analyze_layout('tatr') - Table structure analysis (append to preserve YOLO results)
+             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT('TATR') - Table Structure:")
+             try:
+                 tatr_layout = page.analyze_layout('tatr', existing="append")
+                 if tatr_layout and len(tatr_layout) > 0:
+                     print(f"TATR layout regions found: {len(tatr_layout)}")
+                     tatr_info = []
+                     for i, region in enumerate(tatr_layout[:5]):  # Show first 5
+                         region_info = {
+                             "type": getattr(region, 'type', 'unknown'),
+                             "bbox": [region.x0, region.top, region.x1, region.bottom],
+                             "confidence": getattr(region, 'confidence', 0)
+                         }
+                         tatr_info.append(region_info)
+                         print(f"  {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+
+                     page_data["analyze_layout_tatr"] = {
+                         "found": True,
+                         "count": len(tatr_layout),
+                         "regions": tatr_info
+                     }
+
+                     # Save TATR layout analysis to file
+                     tatr_summary = f"TATR Layout Analysis\n{'='*50}\n"
+                     tatr_summary += f"Found {len(tatr_layout)} regions:\n\n"
+                     for i, region_info in enumerate(tatr_info):
+                         tatr_summary += f"{i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})\n"
+
+                     with open(pdf_output_dir / f"page_{page_num + 1}_tatr_layout.txt", "w") as f:
+                         f.write(tatr_summary)
+
+                     # Try to get detailed table structure
+                     try:
+                         table_structure = page.find_table_structure()
+                         if table_structure:
+                             print(f"Table structure found with {len(table_structure)} elements")
+                             table_details = str(table_structure)
+                             page_data["table_structure"] = {
+                                 "found": True,
+                                 "count": len(table_structure),
+                                 "details": table_details[:1000] + ("..." if len(table_details) > 1000 else "")
+                             }
+
+                             # Save table structure to file
+                             with open(pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w") as f:
+                                 f.write(table_details)
+                         else:
+                             page_data["table_structure"] = {"found": False}
+                     except Exception as te:
+                         print(f"Table structure detection failed: {te}")
+                         page_data["table_structure"] = f"ERROR: {te}"
+                 else:
+                     print("No TATR layout regions found")
+                     page_data["analyze_layout_tatr"] = {"found": False}
+                     page_data["table_structure"] = {"found": False}
+             except Exception as e:
+                 print(f"❌ analyze_layout('tatr') failed: {e}")
+                 page_data["analyze_layout_tatr"] = f"ERROR: {e}"
+                 page_data["table_structure"] = f"ERROR: {e}"
+
+             # 5. Find regions by model and save separate + combined files
+             print(f"\n📍 REGION ANALYSIS - By Model:")
+             try:
+                 all_regions = page.find_all('region')
+                 if all_regions and len(all_regions) > 0:
+                     print(f"Total regions found: {len(all_regions)}")
+
+                     # Group regions by model/source
+                     yolo_regions = [r for r in all_regions if getattr(r, 'model', '') == '' or getattr(r, 'model', '') == 'yolo']
+                     tatr_regions = [r for r in all_regions if getattr(r, 'model', '') == 'tatr']
+                     other_regions = [r for r in all_regions if getattr(r, 'model', '') not in ['', 'yolo', 'tatr']]
+
+                     print(f"  YOLO regions: {len(yolo_regions)}")
+                     print(f"  TATR regions: {len(tatr_regions)}")
+                     print(f"  Other regions: {len(other_regions)}")
+
+                     # Save separate files for each model
+                     if yolo_regions:
+                         yolo_inspect = str(ElementCollection(yolo_regions).inspect(limit=1000))
+                         with open(pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w") as f:
+                             f.write(f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}")
+
+                     if tatr_regions:
+                         tatr_inspect = str(ElementCollection(tatr_regions).inspect(limit=1000))
+                         with open(pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w") as f:
+                             f.write(f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}")
+
+                     # Combined regions inspect
+                     all_inspect = str(all_regions.inspect(limit=1000))
+                     print(f"Combined regions preview (first 500 chars):\n{all_inspect[:500]}...")
+
+                     # Save combined regions file
+                     with open(pdf_output_dir / f"page_{page_num + 1}_all_regions.txt", "w") as f:
+                         f.write(f"All Layout Regions ({len(all_regions)} found)\n{'='*50}\n")
+                         f.write(f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n")
+                         f.write(all_inspect)
+
+                     page_data["regions"] = {
+                         "found": True,
+                         "total_count": len(all_regions),
+                         "yolo_count": len(yolo_regions),
+                         "tatr_count": len(tatr_regions),
+                         "other_count": len(other_regions),
+                         "inspect_preview": all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
+                     }
+
+                 else:
+                     print("No regions found")
+                     page_data["regions"] = {"found": False}
+             except Exception as e:
+                 print(f"❌ region analysis failed: {e}")
+                 page_data["regions"] = f"ERROR: {e}"
+
+             # 6. General element inspection
+             print(f"\n🔍 GENERAL ELEMENT INSPECTION:")
+             try:
+                 # Count different element types
+                 all_elements = page.find_all('*')
+                 if all_elements and len(all_elements) > 0:
+                     print(f"Total elements: {len(all_elements)}")
+
+                     # Full inspect output - shows complete breakdown
+                     print(f"\nFull element breakdown (.inspect()):")
+                     # Get string representation of inspect result (increased limit)
+                     inspect_result = all_elements.inspect(limit=1000)
+                     inspect_text = str(inspect_result)
+                     print(inspect_text)
+
+                     # Sample some elements for detailed inspection
+                     sample_elements = all_elements[:10]  # First 10 elements
+                     print(f"Sample of first 10 elements:")
+                     elements_sample = []
+                     for i, elem in enumerate(sample_elements):
+                         elem_type = getattr(elem, 'object_type', 'unknown')
+                         text_preview = (getattr(elem, 'text', '') or '')[:30]  # tolerate missing or None text
+                         elem_info = {
+                             "type": elem_type,
+                             "text": text_preview,
+                             "x0": elem.x0,
+                             "top": elem.top
+                         }
+                         elements_sample.append(elem_info)
+                         print(f"  {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})")
+
+                     page_data["elements_sample"] = {
+                         "total_count": len(all_elements),
+                         "full_inspect": inspect_text,
+                         "sample": elements_sample
+                     }
+
+                     # Save full inspect to file
+                     with open(pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w") as f:
+                         f.write(inspect_text)
+
+                 else:
+                     print("No elements found")
+                     page_data["elements_sample"] = {"total_count": 0, "sample": []}
+             except Exception as e:
+                 print(f"❌ element inspection failed: {e}")
+                 page_data["elements_sample"] = f"ERROR: {e}"
+
+             # 7. Render page as image
+             print(f"\n🖼️ RENDERING PAGE AS IMAGE:")
+             try:
+                 img = page.to_image(resolution=144)
+                 print(f"Image: {img.width}x{img.height} pixels")
+
+                 # Save image in output folder
+                 img_filename = f"page_{page_num + 1}.png"
+                 img_path = pdf_output_dir / img_filename
+                 img.save(str(img_path))
+                 print(f"Saved: {img_path}")
+                 page_data["image_path"] = str(img_path)
+
+             except Exception as e:
+                 print(f"❌ image rendering failed: {e}")
+                 page_data["image_path"] = f"ERROR: {e}"
+
+ analysis_data["pages"].append(page_data)
363
+
364
+ if page_num < pages_to_analyze - 1:
365
+ print("\n" + "=" * 80 + "\n")
366
+
367
+ # Save complete analysis data as JSON
368
+ import datetime
369
+ analysis_data["analysis_timestamp"] = datetime.datetime.now().isoformat()
370
+
371
+ summary_file = pdf_output_dir / "analysis_summary.json"
372
+ with open(summary_file, "w") as f:
373
+ json.dump(analysis_data, f, indent=2)
374
+
375
+ print(f"\n✅ ANALYSIS COMPLETE")
376
+ print(f"📊 Summary: Analyzed {pages_to_analyze} page(s) of {pdf_file.name}")
377
+ print(f"📁 All results saved to: {pdf_output_dir}")
378
+ print(f"📋 Summary JSON: {summary_file}")
379
+
380
+ except Exception as e:
381
+ print(f"❌ CRITICAL ERROR: {e}")
382
+ import traceback
383
+ traceback.print_exc()
384
+
385
+
386
+ def main():
+     """Main function"""
+     if len(sys.argv) < 2:
+         print("Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]")
+         print("Example: python pdf_analyzer.py bad-pdfs/submissions/Focus.pdf 2 analysis_results")
+         print("         python pdf_analyzer.py Focus.pdf 1 my_analysis --no-timestamp")
+         sys.exit(1)
+
+     pdf_path = sys.argv[1]
+     num_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 1
+     output_folder = "analysis_results"
+     create_timestamp_folder = True
+
+     # Parse remaining arguments
+     for arg in sys.argv[3:]:
+         if arg == "--no-timestamp":
+             create_timestamp_folder = False
+         elif not arg.startswith("--"):
+             output_folder = arg
+
+     analyze_pdf(pdf_path, num_pages, output_folder, create_timestamp_folder)
+
+
+ if __name__ == "__main__":
+     main()
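
For orientation, the new optimization/pdf_analyzer.py entry point can also be driven from Python rather than the command line. A minimal sketch, assuming the installed wheel makes the `optimization` directory importable and using a placeholder filename `document.pdf`:

    # Illustrative sketch only -- not part of the diff above.
    # Assumes `optimization` is importable from the installed wheel.
    from optimization.pdf_analyzer import analyze_pdf

    # Analyze the first two pages and write text dumps, table JSON,
    # layout summaries, page PNGs, and analysis_summary.json into
    # my_results/document/ (no timestamped run folder).
    analyze_pdf("document.pdf", num_pages=2, output_folder="my_results",
                create_timestamp_folder=False)

Passing create_timestamp_folder=True (the default) instead nests the per-PDF folder under a run_YYYYMMDD_HHMMSS directory, matching the script's batch-analysis behavior.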