natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the changes between publicly available package versions released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (52)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +45 -1
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/layout/yolo.py +2 -2
  9. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  10. natural_pdf/classification/manager.py +67 -0
  11. natural_pdf/core/element_manager.py +556 -25
  12. natural_pdf/core/highlighting_service.py +98 -43
  13. natural_pdf/core/page.py +86 -20
  14. natural_pdf/core/pdf.py +0 -2
  15. natural_pdf/describe/base.py +40 -9
  16. natural_pdf/describe/elements.py +11 -6
  17. natural_pdf/elements/base.py +134 -20
  18. natural_pdf/elements/collections.py +43 -11
  19. natural_pdf/elements/image.py +43 -0
  20. natural_pdf/elements/region.py +64 -19
  21. natural_pdf/elements/text.py +89 -11
  22. natural_pdf/flows/collections.py +4 -4
  23. natural_pdf/flows/region.py +17 -2
  24. natural_pdf/ocr/engine_paddle.py +1 -1
  25. natural_pdf/ocr/ocr_factory.py +8 -8
  26. natural_pdf/ocr/ocr_manager.py +51 -1
  27. natural_pdf/selectors/parser.py +27 -7
  28. natural_pdf/tables/__init__.py +5 -0
  29. natural_pdf/tables/result.py +101 -0
  30. natural_pdf/utils/bidi_mirror.py +36 -0
  31. natural_pdf/utils/visualization.py +15 -1
  32. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  33. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
  34. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  35. optimization/memory_comparison.py +172 -0
  36. optimization/pdf_analyzer.py +410 -0
  37. optimization/performance_analysis.py +397 -0
  38. optimization/test_cleanup_methods.py +155 -0
  39. optimization/test_memory_fix.py +162 -0
  40. tools/bad_pdf_eval/__init__.py +1 -0
  41. tools/bad_pdf_eval/analyser.py +302 -0
  42. tools/bad_pdf_eval/collate_summaries.py +130 -0
  43. tools/bad_pdf_eval/eval_suite.py +116 -0
  44. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  45. tools/bad_pdf_eval/llm_enrich.py +273 -0
  46. tools/bad_pdf_eval/reporter.py +17 -0
  47. tools/bad_pdf_eval/utils.py +127 -0
  48. tools/rtl_smoke_test.py +80 -0
  49. natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
  50. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  51. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  52. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
bad_pdf_analysis/analyze_final_10.py
@@ -0,0 +1,552 @@
+ #!/usr/bin/env python3
+ """
+ Analyze final 10 PDF documents with enhanced Natural PDF capability awareness
+ Focus on testing existing capabilities and identifying real gaps
+ """
+
+ import os
+ import sys
+ import json
+ import time
+ from datetime import datetime
+ import natural_pdf as npdf
+
+ # Add the project root to the path
+ sys.path.append('/Users/soma/Development/natural-pdf')
+
+ def detailed_pdf_analysis(pdf_path, document_name, target_pages=None):
+     """Enhanced analysis leveraging discovered Natural PDF capabilities"""
+     print(f"\n{'='*80}")
+     print(f"🔍 DETAILED ANALYSIS: {document_name}")
+     print(f"📁 Path: {pdf_path}")
+     if target_pages:
+         print(f"📍 Target pages: {target_pages}")
+     print(f"{'='*80}")
+
+     try:
+         pdf = npdf.PDF(pdf_path)
+         total_pages = len(pdf.pages)
+         print(f"📄 Total pages in document: {total_pages}")
+
+         # Determine which pages to analyze
+         if target_pages:
+             pages_to_analyze = [p for p in target_pages if p <= total_pages]
+             if len(pages_to_analyze) != len(target_pages):
+                 print(f"⚠️ Some target pages exceed document length, analyzing: {pages_to_analyze}")
+         else:
+             # Analyze first page but also sample a middle page for diversity
+             pages_to_analyze = [1]
+             if total_pages > 10:
+                 pages_to_analyze.append(total_pages // 2)
+
+         results = {
+             'document': document_name,
+             'total_pages': total_pages,
+             'analyzed_pages': pages_to_analyze,
+             'analysis_date': datetime.now().isoformat(),
+             'pages': {},
+             'capabilities_tested': {},
+             'challenges_identified': [],
+             'natural_pdf_gaps': []
+         }
+
+         for page_num in pages_to_analyze:
+             print(f"\n📄 DEEP ANALYSIS: Page {page_num}")
+             page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+             page_results = {
+                 'page_number': page_num,
+                 'dimensions': f"{page.width} × {page.height} points",
+                 'tests_performed': {}
+             }
+
+             # === TEXT EXTRACTION ANALYSIS ===
+             print("🔤 Text Extraction Analysis...")
+             try:
+                 text_content = page.extract_text()
+                 page_results['text_length'] = len(text_content)
+                 page_results['text_preview'] = text_content[:200] + "..." if len(text_content) > 200 else text_content
+
+                 # Character-level analysis for dense text detection
+                 chars = page.chars
+                 char_count = len(chars)
+                 page_results['character_count'] = char_count
+
+                 # Detect potential dense text issues (character overlap)
+                 if char_count > 100:
+                     overlap_count = 0
+                     for i, char in enumerate(chars[:100]):  # Sample first 100 chars
+                         for j, other_char in enumerate(chars[i+1:i+21]):  # Check next 20
+                             if abs(char.x0 - other_char.x0) < 2:  # Very close x positions
+                                 overlap_count += 1
+
+                     overlap_ratio = overlap_count / min(100, char_count)
+                     page_results['dense_text_detected'] = overlap_ratio > 0.3
+                     page_results['character_overlap_ratio'] = overlap_ratio
+
+                     if overlap_ratio > 0.3:
+                         results['challenges_identified'].append({
+                             'type': 'dense_text',
+                             'page': page_num,
+                             'severity': 'high' if overlap_ratio > 0.5 else 'medium',
+                             'details': f'Character overlap ratio: {overlap_ratio:.2f}'
+                         })
+
+                 print(f"✅ Text: {len(text_content)} chars, {char_count} character elements")
+                 if page_results.get('dense_text_detected'):
+                     print(f"⚠️ Dense text detected (overlap ratio: {page_results['character_overlap_ratio']:.2f})")
+
+             except Exception as e:
+                 page_results['text_error'] = str(e)
+                 print(f"❌ Text extraction failed: {e}")
+
+             # === ADVANCED TABLE DETECTION ===
+             print("📊 Advanced Table Detection...")
+             try:
+                 # Standard table extraction
+                 table_data = page.extract_table()
+                 if table_data and len(table_data) > 0:
+                     rows = len(table_data)
+                     cols = max(len(row) for row in table_data) if table_data else 0
+                     page_results['standard_table'] = f"{rows} rows × {cols} columns"
+                     print(f"✅ Standard table: {rows} rows × {cols} columns")
+
+                     # Test unruled table detection using discovered line detection capability
+                     print("🔍 Testing line detection for unruled tables...")
+                     try:
+                         # Use projection profiling (no OpenCV required)
+                         page.detect_lines(
+                             resolution=144,
+                             source_label="analysis_test",
+                             method="projection",
+                             horizontal=True,
+                             vertical=True,
+                             peak_threshold_h=0.3,  # Lower threshold for subtle lines
+                             peak_threshold_v=0.3,
+                             replace=True
+                         )
+
+                         # Check detected lines
+                         detected_lines = [line for line in page._element_mgr.lines
+                                           if getattr(line, 'source', None) == 'analysis_test']
+
+                         h_lines = [l for l in detected_lines if l.is_horizontal]
+                         v_lines = [l for l in detected_lines if l.is_vertical]
+
+                         page_results['line_detection'] = {
+                             'horizontal_lines': len(h_lines),
+                             'vertical_lines': len(v_lines),
+                             'total_lines': len(detected_lines)
+                         }
+
+                         print(f"✅ Line detection: {len(h_lines)} horizontal, {len(v_lines)} vertical")
+
+                         # Test table structure from lines
+                         if len(detected_lines) > 0:
+                             page.detect_table_structure_from_lines(
+                                 source_label="analysis_test",
+                                 ignore_outer_regions=True,
+                                 cell_padding=0.5
+                             )
+
+                             # Check created table regions
+                             table_regions = [r for r in page._element_mgr.regions
+                                              if getattr(r, 'region_type', None) == 'table']
+                             cell_regions = [r for r in page._element_mgr.regions
+                                             if getattr(r, 'region_type', None) == 'table_cell']
+
+                             page_results['table_from_lines'] = {
+                                 'table_regions': len(table_regions),
+                                 'cell_regions': len(cell_regions)
+                             }
+
+                             print(f"✅ Table from lines: {len(table_regions)} tables, {len(cell_regions)} cells")
+
+                         results['capabilities_tested']['line_detection'] = True
+                         results['capabilities_tested']['table_from_lines'] = True
+
+                     except Exception as e:
+                         page_results['line_detection_error'] = str(e)
+                         print(f"❌ Line detection failed: {e}")
+                         results['natural_pdf_gaps'].append({
+                             'capability': 'line_detection',
+                             'error': str(e),
+                             'page': page_num
+                         })
+
+                 else:
+                     page_results['standard_table'] = "No table detected"
+                     print("ℹ️ No standard table detected")
+
+             except Exception as e:
+                 page_results['table_error'] = str(e)
+                 print(f"❌ Table extraction failed: {e}")
+
+             # === LAYOUT ANALYSIS COMPARISON ===
+             print("🏗️ Layout Analysis Comparison...")
+             try:
+                 # YOLO analysis
+                 yolo_start = time.time()
+                 page.analyze_layout('yolo', existing='replace')
+                 yolo_time = time.time() - yolo_start
+
+                 yolo_regions = page.find_all('region')
+                 page_results['yolo_analysis'] = {
+                     'regions': len(yolo_regions),
+                     'processing_time': yolo_time
+                 }
+
+                 # Categorize YOLO regions
+                 yolo_types = {}
+                 for region in yolo_regions:
+                     region_type = getattr(region, 'type', 'unknown')
+                     yolo_types[region_type] = yolo_types.get(region_type, 0) + 1
+
+                 page_results['yolo_types'] = yolo_types
+                 print(f"✅ YOLO: {len(yolo_regions)} regions in {yolo_time:.2f}s - {yolo_types}")
+
+                 # TATR analysis
+                 tatr_start = time.time()
+                 page.analyze_layout('tatr', existing='append')
+                 tatr_time = time.time() - tatr_start
+
+                 tatr_regions = page.find_all('region[type="table"]')
+                 page_results['tatr_analysis'] = {
+                     'table_regions': len(tatr_regions),
+                     'processing_time': tatr_time
+                 }
+                 print(f"✅ TATR: {len(tatr_regions)} table regions in {tatr_time:.2f}s")
+
+                 results['capabilities_tested']['yolo_analysis'] = True
+                 results['capabilities_tested']['tatr_analysis'] = True
+
+             except Exception as e:
+                 page_results['layout_error'] = str(e)
+                 print(f"❌ Layout analysis failed: {e}")
+
+             # === ADVANCED SELECTOR TESTING ===
+             print("🎯 Advanced Selector Testing...")
+             try:
+                 # Test complex selectors
+                 selector_tests = {
+                     'large_text': 'text[size>12]',
+                     'small_text': 'text[size<8]',
+                     'bold_text': 'text:bold',
+                     'colored_rects': 'rect[fill]',
+                     'thin_lines': 'rect[height<3]',  # Potential underlines
+                     'wide_elements': f'*[width>{page.width * 0.7}]',  # Page-spanning elements
+                 }
+
+                 for test_name, selector in selector_tests.items():
+                     try:
+                         elements = page.find_all(selector)
+                         page_results[f'selector_{test_name}'] = len(elements)
+                         print(f"✅ {test_name}: {len(elements)} elements")
+
+                         # Special analysis for thin lines (potential formatting)
+                         if test_name == 'thin_lines' and len(elements) > 0:
+                             # Check if these might be text formatting
+                             text_elements = page.find_all('text')
+                             formatting_candidates = 0
+
+                             for thin_rect in elements[:10]:  # Sample first 10
+                                 # Check if there's text above this thin rect
+                                 for text_elem in text_elements[:20]:  # Sample text elements
+                                     if (abs(text_elem.bottom - thin_rect.top) < 5 and  # Below text
+                                             thin_rect.x0 <= text_elem.x1 and thin_rect.x1 >= text_elem.x0):  # Overlaps horizontally
+                                         formatting_candidates += 1
+                                         break
+
+                             if formatting_candidates > 0:
+                                 page_results['potential_text_formatting'] = formatting_candidates
+                                 print(f"🎯 Potential text formatting: {formatting_candidates} underline candidates")
+
+                                 results['challenges_identified'].append({
+                                     'type': 'text_formatting',
+                                     'page': page_num,
+                                     'severity': 'medium',
+                                     'details': f'{formatting_candidates} potential underlines detected'
+                                 })
+
+                     except Exception as e:
+                         page_results[f'selector_{test_name}_error'] = str(e)
+                         print(f"❌ Selector {test_name} failed: {e}")
+
+                 results['capabilities_tested']['advanced_selectors'] = True
+
+             except Exception as e:
+                 print(f"❌ Selector testing failed: {e}")
+
+             # === SAVE PAGE IMAGE ===
+             try:
+                 folder_name = document_name.replace('/', '_').replace('\\', '_')
+                 analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/detailed_analysis_final"
+                 os.makedirs(analysis_dir, exist_ok=True)
+
+                 image_path = f"{analysis_dir}/page_{page_num}.png"
+                 page_image = page.to_image(resolution=144)
+                 page_image.save(image_path)
+                 page_results['image_saved'] = image_path
+                 print(f"✅ Page image saved: page_{page_num}.png")
+             except Exception as e:
+                 page_results['image_error'] = str(e)
+                 print(f"❌ Image save failed: {e}")
+
+             results['pages'][page_num] = page_results
+
+         # === GENERATE COMPREHENSIVE INSIGHTS ===
+         insights = generate_comprehensive_insights(results)
+         results['comprehensive_insights'] = insights
+
+         # === SAVE RESULTS ===
+         try:
+             folder_name = document_name.replace('/', '_').replace('\\', '_')
+             analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/detailed_analysis_final"
+             os.makedirs(analysis_dir, exist_ok=True)
+
+             results_path = f"{analysis_dir}/detailed_analysis_results.json"
+             with open(results_path, 'w', encoding='utf-8') as f:
+                 json.dump(results, f, indent=2, ensure_ascii=False)
+             print(f"✅ Detailed analysis saved: {results_path}")
+
+             # Generate detailed markdown report
+             markdown_path = f"{analysis_dir}/{document_name}_detailed_analysis.md"
+             generate_detailed_markdown(results, markdown_path)
+             print(f"✅ Detailed markdown report saved: {markdown_path}")
+
+         except Exception as e:
+             print(f"❌ Failed to save results: {e}")
+
+         return results
+
+     except Exception as e:
+         print(f"❌ Failed to analyze {document_name}: {e}")
+         return None
+
+ def generate_comprehensive_insights(results):
+     """Generate comprehensive insights from detailed analysis"""
+     insights = {
+         'document_complexity': 'low',
+         'processing_recommendations': [],
+         'natural_pdf_effectiveness': {},
+         'priority_issues': []
+     }
+
+     # Analyze document complexity
+     total_chars = sum(page.get('character_count', 0) for page in results['pages'].values())
+     max_regions = max(page.get('yolo_analysis', {}).get('regions', 0) for page in results['pages'].values())
+
+     if total_chars > 5000 or max_regions > 15:
+         insights['document_complexity'] = 'high'
+     elif total_chars > 2000 or max_regions > 8:
+         insights['document_complexity'] = 'medium'
+
+     # Analyze Natural PDF effectiveness
+     capabilities_tested = results.get('capabilities_tested', {})
+     working_capabilities = [k for k, v in capabilities_tested.items() if v]
+     insights['natural_pdf_effectiveness']['working_capabilities'] = working_capabilities
+
+     # Priority issues
+     for challenge in results.get('challenges_identified', []):
+         if challenge['severity'] == 'high':
+             insights['priority_issues'].append(challenge)
+
+     # Processing recommendations
+     if any(page.get('dense_text_detected') for page in results['pages'].values()):
+         insights['processing_recommendations'].append('Use pdfplumber parameters for dense text handling')
+
+     if any(page.get('line_detection', {}).get('total_lines', 0) > 0 for page in results['pages'].values()):
+         insights['processing_recommendations'].append('Leverage existing line detection for table structure')
+
+     return insights
+
+ def generate_detailed_markdown(results, output_path):
+     """Generate detailed markdown report"""
+
+     content = f"""# Detailed PDF Analysis Report - {results['document']}
+
+ ## Executive Summary
+
+ **Document:** {results['document']}
+ **Complexity:** {results.get('comprehensive_insights', {}).get('document_complexity', 'unknown').upper()}
+ **Pages Analyzed:** {len(results['pages'])}
+ **Analysis Date:** {results['analysis_date']}
+
+ ### Key Findings
+
+ """
+
+     # Add priority issues
+     priority_issues = results.get('comprehensive_insights', {}).get('priority_issues', [])
+     if priority_issues:
+         content += "#### 🚨 Priority Issues\n\n"
+         for issue in priority_issues:
+             content += f"- **{issue['type'].title()}** (Page {issue['page']}): {issue['details']}\n"
+         content += "\n"
+
+     # Add working capabilities
+     working_caps = results.get('comprehensive_insights', {}).get('natural_pdf_effectiveness', {}).get('working_capabilities', [])
+     if working_caps:
+         content += "#### ✅ Natural PDF Capabilities Confirmed\n\n"
+         for cap in working_caps:
+             content += f"- {cap.replace('_', ' ').title()}\n"
+         content += "\n"
+
+     content += "---\n\n## Detailed Page Analysis\n\n"
+
+     for page_num, page_data in results['pages'].items():
+         content += f"### Page {page_num}\n\n"
+         content += f"**Dimensions:** {page_data.get('dimensions', 'Unknown')}\n\n"
+
+         # Text analysis
+         if 'text_length' in page_data:
+             content += f"**Text Analysis:**\n"
+             content += f"- Content: {page_data['text_length']} characters, {page_data.get('character_count', 0)} elements\n"
+             if page_data.get('dense_text_detected'):
+                 content += f"- ⚠️ Dense text detected (overlap ratio: {page_data.get('character_overlap_ratio', 0):.2f})\n"
+             content += "\n"
+
+         # Table analysis
+         if 'standard_table' in page_data:
+             content += f"**Table Analysis:**\n"
+             content += f"- Standard extraction: {page_data['standard_table']}\n"
+             if 'line_detection' in page_data:
+                 ld = page_data['line_detection']
+                 content += f"- Line detection: {ld['horizontal_lines']} horizontal, {ld['vertical_lines']} vertical\n"
+             if 'table_from_lines' in page_data:
+                 tfl = page_data['table_from_lines']
+                 content += f"- Table from lines: {tfl['table_regions']} tables, {tfl['cell_regions']} cells\n"
+             content += "\n"
+
+         # Layout analysis
+         if 'yolo_analysis' in page_data:
+             ya = page_data['yolo_analysis']
+             content += f"**Layout Analysis:**\n"
+             content += f"- YOLO: {ya['regions']} regions in {ya['processing_time']:.2f}s\n"
+             if 'yolo_types' in page_data:
+                 types_str = ", ".join([f"{k}: {v}" for k, v in page_data['yolo_types'].items()])
+                 content += f" - Types: {types_str}\n"
+             if 'tatr_analysis' in page_data:
+                 ta = page_data['tatr_analysis']
+                 content += f"- TATR: {ta['table_regions']} table regions in {ta['processing_time']:.2f}s\n"
+             content += "\n"
+
+         # Selector testing
+         selector_keys = [k for k in page_data.keys() if k.startswith('selector_')]
+         if selector_keys:
+             content += f"**Advanced Selector Testing:**\n"
+             for key in selector_keys:
+                 if not key.endswith('_error'):
+                     clean_name = key.replace('selector_', '').replace('_', ' ').title()
+                     content += f"- {clean_name}: {page_data[key]} elements\n"
+
+             if page_data.get('potential_text_formatting'):
+                 content += f"- 🎯 Text formatting candidates: {page_data['potential_text_formatting']}\n"
+             content += "\n"
+
+         content += "\n"
+
+     # Add comprehensive recommendations
+     content += """---
+
+ ## Natural PDF Integration Recommendations
+
+ Based on this detailed analysis:
+
+ ```python
+ import natural_pdf as npdf
+
+ def process_document_optimally(pdf_path):
+     \"\"\"Optimized processing based on analysis findings\"\"\"
+     pdf = npdf.PDF(pdf_path)
+     results = []
+
+     for page_num, page in enumerate(pdf.pages, 1):
+         # Use discovered line detection capability
+         page.detect_lines(
+             resolution=144,
+             method="projection",  # No OpenCV required
+             horizontal=True,
+             vertical=True,
+             peak_threshold_h=0.3,
+             peak_threshold_v=0.3
+         )
+
+         # Create table structure from detected lines
+         page.detect_table_structure_from_lines(
+             source_label="detected",
+             ignore_outer_regions=True,
+             cell_padding=0.5
+         )
+
+         # Extract using multiple methods
+         standard_table = page.extract_table()
+         line_based_tables = page.find_all('region[type="table"]')
+
+         results.append({
+             'page': page_num,
+             'standard_table': standard_table,
+             'line_based_tables': len(line_based_tables)
+         })
+
+     return results
+ ```
+
+ """
+
+     with open(output_path, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+ def main():
+     """Analyze final 10 PDF documents with detailed capability testing"""
+
+     # Select diverse documents focusing on different challenge types
+     documents_to_analyze = [
+         # Text formatting challenges
+         ("Y5G72LB_We are trying to get specific information such as ", "Y5G72LB.pdf", None),
+         ("Pd1KBb1_the data table _of election results_", "Pd1KBb1.pdf", None),
+         ("Pd9WVDb_We want a spreadsheet showing all the columns sepa", "Pd9WVDb.pdf", None),
+
+         # Complex table structures
+         ("eqQ4N7q_election results data table", "eqQ4N7q.pdf", None),
+         ("eqQ4NoQ_data table", "eqQ4NoQ.pdf", None),
+         ("ODXl8aR_0. ISO code of the business_ business name_ contac", "ODXl8aR.pdf", None),
+
+         # Multi-language and script challenges
+         ("1A4PPW1_The arabic text", "1A4PPW1.pdf", None),
+         ("lbODDK6_The text in Ethiopian.", "lbODDK6.pdf", None),
+
+         # Dense content and specialized formats
+         ("2EAOEvb_The text_ without beeing divided in 2 columns and ", "2EAOEvb.pdf", None),
+         ("OD49rjM_Just being able to make sense of any of it. It_s b", "OD49rjM.pdf", None),
+     ]
+
+     analysis_results = []
+
+     print(f"🚀 Starting detailed analysis of {len(documents_to_analyze)} documents...")
+     print(f"🔬 Testing discovered Natural PDF capabilities:")
+     print(f" - Line detection (projection profiling)")
+     print(f" - Table structure from lines")
+     print(f" - Advanced selectors")
+     print(f" - Character-level dense text detection")
+
+     for folder_name, pdf_filename, target_pages in documents_to_analyze:
+         pdf_path = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/{pdf_filename}"
+
+         if os.path.exists(pdf_path):
+             result = detailed_pdf_analysis(pdf_path, folder_name, target_pages)
+             if result:
+                 analysis_results.append(result)
+         else:
+             print(f"❌ PDF not found: {pdf_path}")
+
+     print(f"\n{'='*80}")
+     print(f"✅ DETAILED ANALYSIS COMPLETE!")
+     print(f"📊 Processed {len(analysis_results)} documents")
+     print(f"🔬 Tested Natural PDF capabilities extensively")
+     print(f"{'='*80}")
+
+     return analysis_results
+
+ if __name__ == "__main__":
+     main()