natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
bad_pdf_analysis/analyze_specific_pages_direct.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+"""
+Direct Natural PDF analysis targeting specific pages.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+import natural_pdf as npdf
+import re
+from datetime import datetime
+
+def analyze_specific_pages_direct(pdf_path, target_pages, output_folder):
+    """Directly analyze specific pages using Natural PDF"""
+
+    print(f"🔍 Analyzing {pdf_path}")
+    print(f"📍 Target pages: {target_pages}")
+
+    pdf = npdf.PDF(pdf_path)
+    results = {}
+
+    for page_num in target_pages:
+        if page_num > len(pdf.pages):
+            print(f"❌ Page {page_num} not found - document only has {len(pdf.pages)} pages")
+            continue
+
+        print(f"\n📄 Analyzing page {page_num}...")
+        page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+        page_data = {
+            "page_number": page_num,
+            "dimensions": {
+                "width": page.width,
+                "height": page.height
+            }
+        }
+
+        # Get page description
+        try:
+            description = page.describe()
+            page_data["describe"] = description
+            print(f"✅ Page description: {len(description)} characters")
+        except Exception as e:
+            print(f"❌ Page description failed: {e}")
+            page_data["describe"] = f"ERROR: {e}"
+
+        # Extract text
+        try:
+            text = page.extract_text()
+            page_data["extract_text"] = {
+                "length": len(text),
+                "preview": text[:200] + "..." if len(text) > 200 else text,
+                "full_text": text
+            }
+            print(f"✅ Text extraction: {len(text)} characters")
+        except Exception as e:
+            print(f"❌ Text extraction failed: {e}")
+            page_data["extract_text"] = f"ERROR: {e}"
+
+        # Try table extraction
+        try:
+            table_data = page.extract_table()
+            if table_data and len(table_data) > 0:
+                page_data["extract_table"] = {
+                    "found": True,
+                    "rows": len(table_data),
+                    "columns": len(table_data[0]) if table_data else 0,
+                    "data": table_data[:5]  # First 5 rows only
+                }
+                print(f"✅ Table found: {len(table_data)} rows × {len(table_data[0]) if table_data else 0} columns")
+            else:
+                page_data["extract_table"] = {"found": False}
+                print("ℹ️ No table found with standard extraction")
+        except Exception as e:
+            print(f"❌ Table extraction failed: {e}")
+            page_data["extract_table"] = f"ERROR: {e}"
+
+        # Try layout analysis
+        try:
+            page.analyze_layout('yolo', existing='replace')
+            layout_regions = page.find_all('region')
+            if layout_regions and len(layout_regions) > 0:
+                page_data["analyze_layout"] = {
+                    "found": True,
+                    "count": len(layout_regions),
+                    "regions": []
+                }
+                for region in layout_regions[:10]:  # First 10 regions
+                    try:
+                        page_data["analyze_layout"]["regions"].append({
+                            "type": region.type if hasattr(region, 'type') else 'unknown',
+                            "bbox": [region.x0, region.y0, region.x1, region.y1],
+                            "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
+                        })
+                    except:
+                        pass
+                print(f"✅ Layout analysis: {len(layout_regions)} regions")
+            else:
+                page_data["analyze_layout"] = {"found": False}
+                print("ℹ️ No layout regions found")
+        except Exception as e:
+            print(f"❌ Layout analysis failed: {e}")
+            page_data["analyze_layout"] = f"ERROR: {e}"
+
+        # Try TATR analysis
+        try:
+            page.analyze_layout('tatr', existing='append')
+            tatr_regions = page.find_all('region')
+            tatr_count = len([r for r in tatr_regions if hasattr(r, 'type') and 'table' in str(r.type).lower()])
+            if tatr_count > 0:
+                page_data["analyze_layout_tatr"] = {
+                    "found": True,
+                    "count": tatr_count,
+                    "regions": []
+                }
+                for region in tatr_regions[:25]:  # First 25 regions
+                    try:
+                        if hasattr(region, 'type') and 'table' in str(region.type).lower():
+                            page_data["analyze_layout_tatr"]["regions"].append({
+                                "type": str(region.type),
+                                "bbox": [region.x0, region.y0, region.x1, region.y1],
+                                "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
+                            })
+                    except:
+                        pass
+                print(f"✅ TATR analysis: {tatr_count} table regions")
+            else:
+                page_data["analyze_layout_tatr"] = {"found": False}
+                print("ℹ️ No TATR table regions found")
+        except Exception as e:
+            print(f"❌ TATR analysis failed: {e}")
+            page_data["analyze_layout_tatr"] = f"ERROR: {e}"
+
+        # Save page image
+        try:
+            page_image_path = os.path.join(output_folder, f"page_{page_num}.png")
+            page.save_image(page_image_path, resolution=144)
+            page_data["image_path"] = page_image_path
+            print(f"✅ Page image saved: {page_image_path}")
+        except Exception as e:
+            print(f"❌ Page image save failed: {e}")
+            page_data["image_path"] = f"ERROR: {e}"
+
+        results[page_num] = page_data
+
+    return results
+
+def create_enhanced_analysis_report(pdf_path, target_pages, analysis_results, output_folder):
+    """Create enhanced analysis report"""
+
+    pdf_name = Path(pdf_path).name
+
+    # Determine what the user was looking for
+    user_goal = f"Analysis of pages {target_pages}"
+    if len(target_pages) == 1:
+        user_goal = f"Analysis of page {target_pages[0]}"
+
+    report = f"""# Enhanced PDF Analysis Report - {pdf_name.replace('.pdf', '')}
+
+## Analysis Overview
+
+**PDF File:** {pdf_name}
+**Target Pages:** {target_pages}
+**Pages Successfully Analyzed:** {list(analysis_results.keys())}
+**Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+---
+
+## Page-by-Page Analysis Results
+
+"""
+
+    for page_num in sorted(analysis_results.keys()):
+        page_data = analysis_results[page_num]
+
+        report += f"""### Page {page_num}
+
+**Dimensions:** {page_data.get('dimensions', {}).get('width', 'Unknown')} × {page_data.get('dimensions', {}).get('height', 'Unknown')} points
+
+**Content Analysis:**
+"""
+
+        # Text analysis
+        if isinstance(page_data.get('extract_text'), dict):
+            text_info = page_data['extract_text']
+            report += f"- **Text Content:** {text_info.get('length', 0)} characters extracted\n"
+            if text_info.get('preview'):
+                report += f"- **Content Preview:** {text_info['preview']}\n"
+
+        # Table analysis
+        if isinstance(page_data.get('extract_table'), dict):
+            table_info = page_data['extract_table']
+            if table_info.get('found'):
+                report += f"- **Table Found:** {table_info.get('rows', 0)} rows × {table_info.get('columns', 0)} columns\n"
+            else:
+                report += "- **Table Status:** No standard table structure detected\n"
+
+        # Layout analysis
+        if isinstance(page_data.get('analyze_layout'), dict):
+            layout_info = page_data['analyze_layout']
+            if layout_info.get('found'):
+                report += f"- **Layout Regions:** {layout_info.get('count', 0)} regions detected\n"
+
+                # Show region types
+                region_types = {}
+                for region in layout_info.get('regions', []):
+                    region_type = region.get('type', 'unknown')
+                    region_types[region_type] = region_types.get(region_type, 0) + 1
+
+                if region_types:
+                    report += f"- **Region Types:** {dict(region_types)}\n"
+
+        # TATR analysis
+        if isinstance(page_data.get('analyze_layout_tatr'), dict):
+            tatr_info = page_data['analyze_layout_tatr']
+            if tatr_info.get('found'):
+                report += f"- **TATR Table Analysis:** {tatr_info.get('count', 0)} table regions detected\n"
+
+        # Image
+        if page_data.get('image_path') and not page_data['image_path'].startswith('ERROR'):
+            report += f"- **Visual:** Page image saved as `page_{page_num}.png`\n"
+
+        report += "\n"
+
+    # Analysis summary
+    report += """---
+
+## Analysis Summary
+
+### What We Found
+"""
+
+    # Summarize findings across all pages
+    total_text_chars = 0
+    pages_with_tables = 0
+    total_layout_regions = 0
+    total_tatr_regions = 0
+
+    for page_data in analysis_results.values():
+        if isinstance(page_data.get('extract_text'), dict):
+            total_text_chars += page_data['extract_text'].get('length', 0)
+
+        if isinstance(page_data.get('extract_table'), dict) and page_data['extract_table'].get('found'):
+            pages_with_tables += 1
+
+        if isinstance(page_data.get('analyze_layout'), dict) and page_data['analyze_layout'].get('found'):
+            total_layout_regions += page_data['analyze_layout'].get('count', 0)
+
+        if isinstance(page_data.get('analyze_layout_tatr'), dict) and page_data['analyze_layout_tatr'].get('found'):
+            total_tatr_regions += page_data['analyze_layout_tatr'].get('count', 0)
+
+    report += f"""
+- **Total Text Content:** {total_text_chars:,} characters across {len(analysis_results)} pages
+- **Table Detection:** {pages_with_tables} out of {len(analysis_results)} pages have detectable tables
+- **Layout Analysis:** {total_layout_regions} total layout regions detected
+- **TATR Analysis:** {total_tatr_regions} table-specific regions detected
+"""
+
+    # Add recommendations
+    report += """
+### Natural PDF Extraction Approach
+
+Based on the actual content found on these pages:
+
+```python
+import natural_pdf as npdf
+
+def extract_from_specific_pages(pdf_path, target_pages):
+    \"\"\"Extract data from specific pages with targeted approach\"\"\"
+    pdf = npdf.PDF(pdf_path)
+    results = []
+
+    for page_num in target_pages:
+        if page_num <= len(pdf.pages):
+            page = pdf.pages[page_num - 1]
+
+            # Use layout analysis for better structure detection
+            page.analyze_layout('tatr', existing='append')
+
+            # Try table extraction first
+            table_data = page.extract_table()
+            if table_data:
+                results.append({
+                    'page': page_num,
+                    'type': 'table',
+                    'data': table_data
+                })
+            else:
+                # Use spatial navigation for complex layouts
+                all_text = page.find_all('text')
+                results.append({
+                    'page': page_num,
+                    'type': 'text_elements',
+                    'elements': all_text
+                })
+
+    return results
+
+# Extract from your specific pages
+"""
+
+    if len(target_pages) == 1:
+        report += f"results = extract_from_specific_pages('{pdf_name}', [{target_pages[0]}])\n"
+    else:
+        report += f"results = extract_from_specific_pages('{pdf_name}', {target_pages})\n"
+
+    report += "```\n"
+
+    # Save the report
+    report_path = os.path.join(output_folder, f"{pdf_name.replace('.pdf', '')}_enhanced_analysis.md")
+    with open(report_path, 'w', encoding='utf-8') as f:
+        f.write(report)
+
+    print(f"✅ Enhanced analysis report saved: {report_path}")
+    return report_path
+
+def main():
+    """Re-analyze specific documents with page targeting"""
+
+    # Documents that need re-analysis with specific pages
+    documents_to_reanalyze = [
+        {
+            'folder': 'ODX1DW8_The large table on page 179',
+            'file': 'ODX1DW8.pdf',
+            'pages': [178, 179, 180],  # Page 179 ± 1 for safety
+            'reason': 'User requested page 179, original analysis used page 1'
+        },
+        {
+            'folder': 'eqrZ5yq_The long table _Annex 6_ spanning across pages fro',
+            'file': 'eqrZ5yq.pdf',
+            'pages': [89, 90, 91, 92],  # Multi-page table range
+            'reason': 'User requested pages 89-92, original analysis used page 1'
+        }
+    ]
+
+    base_path = "/Users/soma/Development/natural-pdf/bad_pdf_analysis"
+
+    for doc in documents_to_reanalyze:
+        print(f"\n{'='*80}")
+        print(f"🔄 Re-analyzing {doc['file']}")
+        print(f"📋 Reason: {doc['reason']}")
+        print(f"{'='*80}")
+
+        folder_path = os.path.join(base_path, doc['folder'])
+        pdf_path = os.path.join(folder_path, doc['file'])
+        output_folder = os.path.join(folder_path, 'enhanced_analysis')
+
+        if not os.path.exists(pdf_path):
+            print(f"❌ PDF not found: {pdf_path}")
+            continue
+
+        # Create output folder
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Run direct analysis on specific pages
+        try:
+            analysis_results = analyze_specific_pages_direct(pdf_path, doc['pages'], output_folder)
+
+            if analysis_results:
+                # Save analysis results as JSON
+                results_file = os.path.join(output_folder, "enhanced_analysis_results.json")
+                with open(results_file, 'w') as f:
+                    json.dump({
+                        "pdf_path": pdf_path,
+                        "target_pages": doc['pages'],
+                        "analysis_timestamp": datetime.now().isoformat(),
+                        "results": analysis_results
+                    }, f, indent=2)
+
+                # Create enhanced report
+                create_enhanced_analysis_report(pdf_path, doc['pages'], analysis_results, output_folder)
+
+                print(f"\n✅ Successfully analyzed {len(analysis_results)} pages from {doc['file']}")
+            else:
+                print(f"❌ No results obtained for {doc['file']}")
+
+        except Exception as e:
+            print(f"❌ Analysis failed for {doc['file']}: {e}")
+
+if __name__ == "__main__":
+    main()
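
The script above is wired to two hard-coded documents in `main()`, but its two functions can also be driven directly. A minimal sketch, assuming the file is importable as a module and with the path, page numbers, and output folder below as placeholders:

```python
# Hypothetical driver for the functions defined in the new file above;
# pdf_path, target_pages, and output_folder are placeholders, not values from the diff.
import os

from analyze_specific_pages_direct import (
    analyze_specific_pages_direct,
    create_enhanced_analysis_report,
)

pdf_path = "example.pdf"        # placeholder: any local PDF
target_pages = [1, 2]           # placeholder: 1-based page numbers
output_folder = "enhanced_analysis"

os.makedirs(output_folder, exist_ok=True)  # both functions write into this folder
results = analyze_specific_pages_direct(pdf_path, target_pages, output_folder)
report_path = create_enhanced_analysis_report(pdf_path, target_pages, results, output_folder)
print(f"Report written to {report_path}")
```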
natural_pdf/analyzers/layout/layout_analyzer.py
@@ -83,10 +83,9 @@ class LayoutAnalyzer:
             f" Rendering page {self._page.number} to image for initial layout detection..."
         )
         try:
-            layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.0)
-            layout_resolution = layout_scale * 72
+            layout_resolution = getattr(self._page._parent, "_config", {}).get("layout_image_resolution", 72)
             std_res_page_image = self._page.to_image(
-                resolution=layout_resolution, include_highlights=False, scale=1.0
+                resolution=layout_resolution, include_highlights=False
             )
             if not std_res_page_image:
                 raise ValueError("Initial page rendering returned None")
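
This hunk swaps the old scale knob (`layout_image_scale`, a multiplier on 72 DPI) for a direct `layout_image_resolution` setting that defaults to 72 DPI. A minimal sketch of opting into a sharper layout render, assuming `_config` on the parent `PDF` object is the plain dict the `getattr` fallback implies (it is a private attribute, so this is illustrative rather than a documented API):

```python
import natural_pdf as npdf

pdf = npdf.PDF("example.pdf")  # placeholder path

# 0.1.28 (old, hypothetical usage): a scale factor relative to 72 DPI
# pdf._config["layout_image_scale"] = 2.0      # effectively 144 DPI

# 0.1.30: the render resolution is configured directly, in DPI
pdf._config["layout_image_resolution"] = 144   # read via getattr in the hunk above

pdf.pages[0].analyze_layout("yolo")
```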
natural_pdf/analyzers/layout/layout_manager.py
@@ -220,3 +220,47 @@ class LayoutManager:
             logger.debug(f"Layout engine '{name}' check failed: {e}")
             pass
         return available
+
+    def cleanup_detector(self, detector_name: Optional[str] = None) -> int:
+        """
+        Cleanup layout detector instances to free memory.
+
+        Args:
+            detector_name: Specific detector to cleanup, or None to cleanup all detectors
+
+        Returns:
+            Number of detectors cleaned up
+        """
+        cleaned_count = 0
+
+        if detector_name:
+            # Cleanup specific detector
+            detector_name = detector_name.lower()
+            if detector_name in self._detector_instances:
+                detector = self._detector_instances.pop(detector_name)
+                if hasattr(detector, 'cleanup'):
+                    try:
+                        detector.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Detector {detector_name} cleanup method failed: {e}")
+
+                logger.info(f"Cleaned up layout detector: {detector_name}")
+                cleaned_count = 1
+        else:
+            # Cleanup all detectors
+            for name, detector in list(self._detector_instances.items()):
+                if hasattr(detector, 'cleanup'):
+                    try:
+                        detector.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Detector {name} cleanup method failed: {e}")
+
+            # Clear all caches
+            detector_count = len(self._detector_instances)
+            self._detector_instances.clear()
+
+            if detector_count > 0:
+                logger.info(f"Cleaned up {detector_count} layout detectors")
+                cleaned_count = detector_count
+
+        return cleaned_count
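
A usage sketch for the new `cleanup_detector` method. How the `LayoutManager` instance is obtained is outside this hunk, so the `manager` variable below is assumed:

```python
# manager: an existing LayoutManager instance (acquisition not shown in this diff)
freed = manager.cleanup_detector("yolo")  # drop one cached detector by name
print(f"Released {freed} detector(s)")

freed = manager.cleanup_detector()        # or clear every cached detector instance
print(f"Released {freed} detector(s)")
```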
natural_pdf/analyzers/layout/surya.py
@@ -189,7 +189,7 @@ class SuryaLayoutDetector(LayoutDetector):
             "surya_table_rec_dpi", 192
         )
         high_res_page_image = self._page_ref.to_image(
-            resolution=high_res_dpi, include_highlights=False, scale=1.0
+            resolution=high_res_dpi, include_highlights=False
         )

         # Render high-res page ONCE
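
As in the `LayoutAnalyzer` hunk, the redundant `scale=1.0` argument is dropped so that `resolution` alone sizes the rendered image. A minimal sketch of the equivalent call, assuming `Page.to_image` on the public API accepts the same keywords used here and returns a PIL image:

```python
import natural_pdf as npdf

page = npdf.PDF("example.pdf").pages[0]  # placeholder path
img = page.to_image(resolution=192, include_highlights=False)  # 192 DPI, no highlight overlay
img.save("page_0.png")
```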