natural-pdf 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +42 -3
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +14 -18
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
@@ -1,552 +0,0 @@
-#!/usr/bin/env python3
-"""
-Analyze final 10 PDF documents with enhanced Natural PDF capability awareness
-Focus on testing existing capabilities and identifying real gaps
-"""
-
-import os
-import sys
-import json
-import time
-from datetime import datetime
-import natural_pdf as npdf
-
-# Add the project root to the path
-sys.path.append('/Users/soma/Development/natural-pdf')
-
-def detailed_pdf_analysis(pdf_path, document_name, target_pages=None):
-    """Enhanced analysis leveraging discovered Natural PDF capabilities"""
-    print(f"\n{'='*80}")
-    print(f"🔍 DETAILED ANALYSIS: {document_name}")
-    print(f"📁 Path: {pdf_path}")
-    if target_pages:
-        print(f"📍 Target pages: {target_pages}")
-    print(f"{'='*80}")
-
-    try:
-        pdf = npdf.PDF(pdf_path)
-        total_pages = len(pdf.pages)
-        print(f"📄 Total pages in document: {total_pages}")
-
-        # Determine which pages to analyze
-        if target_pages:
-            pages_to_analyze = [p for p in target_pages if p <= total_pages]
-            if len(pages_to_analyze) != len(target_pages):
-                print(f"⚠️ Some target pages exceed document length, analyzing: {pages_to_analyze}")
-        else:
-            # Analyze first page but also sample a middle page for diversity
-            pages_to_analyze = [1]
-            if total_pages > 10:
-                pages_to_analyze.append(total_pages // 2)
-
-        results = {
-            'document': document_name,
-            'total_pages': total_pages,
-            'analyzed_pages': pages_to_analyze,
-            'analysis_date': datetime.now().isoformat(),
-            'pages': {},
-            'capabilities_tested': {},
-            'challenges_identified': [],
-            'natural_pdf_gaps': []
-        }
-
-        for page_num in pages_to_analyze:
-            print(f"\n📄 DEEP ANALYSIS: Page {page_num}")
-            page = pdf.pages[page_num - 1] # Convert to 0-based index
-
-            page_results = {
-                'page_number': page_num,
-                'dimensions': f"{page.width} × {page.height} points",
-                'tests_performed': {}
-            }
-
-            # === TEXT EXTRACTION ANALYSIS ===
-            print("🔤 Text Extraction Analysis...")
-            try:
-                text_content = page.extract_text()
-                page_results['text_length'] = len(text_content)
-                page_results['text_preview'] = text_content[:200] + "..." if len(text_content) > 200 else text_content
-
-                # Character-level analysis for dense text detection
-                chars = page.chars
-                char_count = len(chars)
-                page_results['character_count'] = char_count
-
-                # Detect potential dense text issues (character overlap)
-                if char_count > 100:
-                    overlap_count = 0
-                    for i, char in enumerate(chars[:100]): # Sample first 100 chars
-                        for j, other_char in enumerate(chars[i+1:i+21]): # Check next 20
-                            if abs(char.x0 - other_char.x0) < 2: # Very close x positions
-                                overlap_count += 1
-
-                    overlap_ratio = overlap_count / min(100, char_count)
-                    page_results['dense_text_detected'] = overlap_ratio > 0.3
-                    page_results['character_overlap_ratio'] = overlap_ratio
-
-                    if overlap_ratio > 0.3:
-                        results['challenges_identified'].append({
-                            'type': 'dense_text',
-                            'page': page_num,
-                            'severity': 'high' if overlap_ratio > 0.5 else 'medium',
-                            'details': f'Character overlap ratio: {overlap_ratio:.2f}'
-                        })
-
-                print(f"✅ Text: {len(text_content)} chars, {char_count} character elements")
-                if page_results.get('dense_text_detected'):
-                    print(f"⚠️ Dense text detected (overlap ratio: {page_results['character_overlap_ratio']:.2f})")
-
-            except Exception as e:
-                page_results['text_error'] = str(e)
-                print(f"❌ Text extraction failed: {e}")
-
-            # === ADVANCED TABLE DETECTION ===
-            print("📊 Advanced Table Detection...")
-            try:
-                # Standard table extraction
-                table_data = page.extract_table()
-                if table_data and len(table_data) > 0:
-                    rows = len(table_data)
-                    cols = max(len(row) for row in table_data) if table_data else 0
-                    page_results['standard_table'] = f"{rows} rows × {cols} columns"
-                    print(f"✅ Standard table: {rows} rows × {cols} columns")
-
-                    # Test unruled table detection using discovered line detection capability
-                    print("🔍 Testing line detection for unruled tables...")
-                    try:
-                        # Use projection profiling (no OpenCV required)
-                        page.detect_lines(
-                            resolution=144,
-                            source_label="analysis_test",
-                            method="projection",
-                            horizontal=True,
-                            vertical=True,
-                            peak_threshold_h=0.3, # Lower threshold for subtle lines
-                            peak_threshold_v=0.3,
-                            replace=True
-                        )
-
-                        # Check detected lines
-                        detected_lines = [line for line in page._element_mgr.lines
-                                          if getattr(line, 'source', None) == 'analysis_test']
-
-                        h_lines = [l for l in detected_lines if l.is_horizontal]
-                        v_lines = [l for l in detected_lines if l.is_vertical]
-
-                        page_results['line_detection'] = {
-                            'horizontal_lines': len(h_lines),
-                            'vertical_lines': len(v_lines),
-                            'total_lines': len(detected_lines)
-                        }
-
-                        print(f"✅ Line detection: {len(h_lines)} horizontal, {len(v_lines)} vertical")
-
-                        # Test table structure from lines
-                        if len(detected_lines) > 0:
-                            page.detect_table_structure_from_lines(
-                                source_label="analysis_test",
-                                ignore_outer_regions=True,
-                                cell_padding=0.5
-                            )
-
-                            # Check created table regions
-                            table_regions = [r for r in page._element_mgr.regions
-                                             if getattr(r, 'region_type', None) == 'table']
-                            cell_regions = [r for r in page._element_mgr.regions
-                                            if getattr(r, 'region_type', None) == 'table_cell']
-
-                            page_results['table_from_lines'] = {
-                                'table_regions': len(table_regions),
-                                'cell_regions': len(cell_regions)
-                            }
-
-                            print(f"✅ Table from lines: {len(table_regions)} tables, {len(cell_regions)} cells")
-
-                        results['capabilities_tested']['line_detection'] = True
-                        results['capabilities_tested']['table_from_lines'] = True
-
-                    except Exception as e:
-                        page_results['line_detection_error'] = str(e)
-                        print(f"❌ Line detection failed: {e}")
-                        results['natural_pdf_gaps'].append({
-                            'capability': 'line_detection',
-                            'error': str(e),
-                            'page': page_num
-                        })
-
-                else:
-                    page_results['standard_table'] = "No table detected"
-                    print("ℹ️ No standard table detected")
-
-            except Exception as e:
-                page_results['table_error'] = str(e)
-                print(f"❌ Table extraction failed: {e}")
-
-            # === LAYOUT ANALYSIS COMPARISON ===
-            print("🏗️ Layout Analysis Comparison...")
-            try:
-                # YOLO analysis
-                yolo_start = time.time()
-                page.analyze_layout('yolo', existing='replace')
-                yolo_time = time.time() - yolo_start
-
-                yolo_regions = page.find_all('region')
-                page_results['yolo_analysis'] = {
-                    'regions': len(yolo_regions),
-                    'processing_time': yolo_time
-                }
-
-                # Categorize YOLO regions
-                yolo_types = {}
-                for region in yolo_regions:
-                    region_type = getattr(region, 'type', 'unknown')
-                    yolo_types[region_type] = yolo_types.get(region_type, 0) + 1
-
-                page_results['yolo_types'] = yolo_types
-                print(f"✅ YOLO: {len(yolo_regions)} regions in {yolo_time:.2f}s - {yolo_types}")
-
-                # TATR analysis
-                tatr_start = time.time()
-                page.analyze_layout('tatr', existing='append')
-                tatr_time = time.time() - tatr_start
-
-                tatr_regions = page.find_all('region[type="table"]')
-                page_results['tatr_analysis'] = {
-                    'table_regions': len(tatr_regions),
-                    'processing_time': tatr_time
-                }
-                print(f"✅ TATR: {len(tatr_regions)} table regions in {tatr_time:.2f}s")
-
-                results['capabilities_tested']['yolo_analysis'] = True
-                results['capabilities_tested']['tatr_analysis'] = True
-
-            except Exception as e:
-                page_results['layout_error'] = str(e)
-                print(f"❌ Layout analysis failed: {e}")
-
-            # === ADVANCED SELECTOR TESTING ===
-            print("🎯 Advanced Selector Testing...")
-            try:
-                # Test complex selectors
-                selector_tests = {
-                    'large_text': 'text[size>12]',
-                    'small_text': 'text[size<8]',
-                    'bold_text': 'text:bold',
-                    'colored_rects': 'rect[fill]',
-                    'thin_lines': 'rect[height<3]', # Potential underlines
-                    'wide_elements': f'*[width>{page.width * 0.7}]', # Page-spanning elements
-                }
-
-                for test_name, selector in selector_tests.items():
-                    try:
-                        elements = page.find_all(selector)
-                        page_results[f'selector_{test_name}'] = len(elements)
-                        print(f"✅ {test_name}: {len(elements)} elements")
-
-                        # Special analysis for thin lines (potential formatting)
-                        if test_name == 'thin_lines' and len(elements) > 0:
-                            # Check if these might be text formatting
-                            text_elements = page.find_all('text')
-                            formatting_candidates = 0
-
-                            for thin_rect in elements[:10]: # Sample first 10
-                                # Check if there's text above this thin rect
-                                for text_elem in text_elements[:20]: # Sample text elements
-                                    if (abs(text_elem.bottom - thin_rect.top) < 5 and # Below text
-                                        thin_rect.x0 <= text_elem.x1 and thin_rect.x1 >= text_elem.x0): # Overlaps horizontally
-                                        formatting_candidates += 1
-                                        break
-
-                            if formatting_candidates > 0:
-                                page_results['potential_text_formatting'] = formatting_candidates
-                                print(f"🎯 Potential text formatting: {formatting_candidates} underline candidates")
-
-                                results['challenges_identified'].append({
-                                    'type': 'text_formatting',
-                                    'page': page_num,
-                                    'severity': 'medium',
-                                    'details': f'{formatting_candidates} potential underlines detected'
-                                })
-
-                    except Exception as e:
-                        page_results[f'selector_{test_name}_error'] = str(e)
-                        print(f"❌ Selector {test_name} failed: {e}")
-
-                results['capabilities_tested']['advanced_selectors'] = True
-
-            except Exception as e:
-                print(f"❌ Selector testing failed: {e}")
-
-            # === SAVE PAGE IMAGE ===
-            try:
-                folder_name = document_name.replace('/', '_').replace('\\', '_')
-                analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/detailed_analysis_final"
-                os.makedirs(analysis_dir, exist_ok=True)
-
-                image_path = f"{analysis_dir}/page_{page_num}.png"
-                page_image = page.to_image(resolution=144)
-                page_image.save(image_path)
-                page_results['image_saved'] = image_path
-                print(f"✅ Page image saved: page_{page_num}.png")
-            except Exception as e:
-                page_results['image_error'] = str(e)
-                print(f"❌ Image save failed: {e}")
-
-            results['pages'][page_num] = page_results
-
-        # === GENERATE COMPREHENSIVE INSIGHTS ===
-        insights = generate_comprehensive_insights(results)
-        results['comprehensive_insights'] = insights
-
-        # === SAVE RESULTS ===
-        try:
-            folder_name = document_name.replace('/', '_').replace('\\', '_')
-            analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/detailed_analysis_final"
-            os.makedirs(analysis_dir, exist_ok=True)
-
-            results_path = f"{analysis_dir}/detailed_analysis_results.json"
-            with open(results_path, 'w', encoding='utf-8') as f:
-                json.dump(results, f, indent=2, ensure_ascii=False)
-            print(f"✅ Detailed analysis saved: {results_path}")
-
-            # Generate detailed markdown report
-            markdown_path = f"{analysis_dir}/{document_name}_detailed_analysis.md"
-            generate_detailed_markdown(results, markdown_path)
-            print(f"✅ Detailed markdown report saved: {markdown_path}")
-
-        except Exception as e:
-            print(f"❌ Failed to save results: {e}")
-
-        return results
-
-    except Exception as e:
-        print(f"❌ Failed to analyze {document_name}: {e}")
-        return None
-
-def generate_comprehensive_insights(results):
-    """Generate comprehensive insights from detailed analysis"""
-    insights = {
-        'document_complexity': 'low',
-        'processing_recommendations': [],
-        'natural_pdf_effectiveness': {},
-        'priority_issues': []
-    }
-
-    # Analyze document complexity
-    total_chars = sum(page.get('character_count', 0) for page in results['pages'].values())
-    max_regions = max(page.get('yolo_analysis', {}).get('regions', 0) for page in results['pages'].values())
-
-    if total_chars > 5000 or max_regions > 15:
-        insights['document_complexity'] = 'high'
-    elif total_chars > 2000 or max_regions > 8:
-        insights['document_complexity'] = 'medium'
-
-    # Analyze Natural PDF effectiveness
-    capabilities_tested = results.get('capabilities_tested', {})
-    working_capabilities = [k for k, v in capabilities_tested.items() if v]
-    insights['natural_pdf_effectiveness']['working_capabilities'] = working_capabilities
-
-    # Priority issues
-    for challenge in results.get('challenges_identified', []):
-        if challenge['severity'] == 'high':
-            insights['priority_issues'].append(challenge)
-
-    # Processing recommendations
-    if any(page.get('dense_text_detected') for page in results['pages'].values()):
-        insights['processing_recommendations'].append('Use pdfplumber parameters for dense text handling')
-
-    if any(page.get('line_detection', {}).get('total_lines', 0) > 0 for page in results['pages'].values()):
-        insights['processing_recommendations'].append('Leverage existing line detection for table structure')
-
-    return insights
-
-def generate_detailed_markdown(results, output_path):
-    """Generate detailed markdown report"""
-
-    content = f"""# Detailed PDF Analysis Report - {results['document']}
-
-## Executive Summary
-
-**Document:** {results['document']}
-**Complexity:** {results.get('comprehensive_insights', {}).get('document_complexity', 'unknown').upper()}
-**Pages Analyzed:** {len(results['pages'])}
-**Analysis Date:** {results['analysis_date']}
-
-### Key Findings
-
-"""
-
-    # Add priority issues
-    priority_issues = results.get('comprehensive_insights', {}).get('priority_issues', [])
-    if priority_issues:
-        content += "#### 🚨 Priority Issues\n\n"
-        for issue in priority_issues:
-            content += f"- **{issue['type'].title()}** (Page {issue['page']}): {issue['details']}\n"
-        content += "\n"
-
-    # Add working capabilities
-    working_caps = results.get('comprehensive_insights', {}).get('natural_pdf_effectiveness', {}).get('working_capabilities', [])
-    if working_caps:
-        content += "#### ✅ Natural PDF Capabilities Confirmed\n\n"
-        for cap in working_caps:
-            content += f"- {cap.replace('_', ' ').title()}\n"
-        content += "\n"
-
-    content += "---\n\n## Detailed Page Analysis\n\n"
-
-    for page_num, page_data in results['pages'].items():
-        content += f"### Page {page_num}\n\n"
-        content += f"**Dimensions:** {page_data.get('dimensions', 'Unknown')}\n\n"
-
-        # Text analysis
-        if 'text_length' in page_data:
-            content += f"**Text Analysis:**\n"
-            content += f"- Content: {page_data['text_length']} characters, {page_data.get('character_count', 0)} elements\n"
-            if page_data.get('dense_text_detected'):
-                content += f"- ⚠️ Dense text detected (overlap ratio: {page_data.get('character_overlap_ratio', 0):.2f})\n"
-            content += "\n"
-
-        # Table analysis
-        if 'standard_table' in page_data:
-            content += f"**Table Analysis:**\n"
-            content += f"- Standard extraction: {page_data['standard_table']}\n"
-            if 'line_detection' in page_data:
-                ld = page_data['line_detection']
-                content += f"- Line detection: {ld['horizontal_lines']} horizontal, {ld['vertical_lines']} vertical\n"
-            if 'table_from_lines' in page_data:
-                tfl = page_data['table_from_lines']
-                content += f"- Table from lines: {tfl['table_regions']} tables, {tfl['cell_regions']} cells\n"
-            content += "\n"
-
-        # Layout analysis
-        if 'yolo_analysis' in page_data:
-            ya = page_data['yolo_analysis']
-            content += f"**Layout Analysis:**\n"
-            content += f"- YOLO: {ya['regions']} regions in {ya['processing_time']:.2f}s\n"
-            if 'yolo_types' in page_data:
-                types_str = ", ".join([f"{k}: {v}" for k, v in page_data['yolo_types'].items()])
-                content += f" - Types: {types_str}\n"
-            if 'tatr_analysis' in page_data:
-                ta = page_data['tatr_analysis']
-                content += f"- TATR: {ta['table_regions']} table regions in {ta['processing_time']:.2f}s\n"
-            content += "\n"
-
-        # Selector testing
-        selector_keys = [k for k in page_data.keys() if k.startswith('selector_')]
-        if selector_keys:
-            content += f"**Advanced Selector Testing:**\n"
-            for key in selector_keys:
-                if not key.endswith('_error'):
-                    clean_name = key.replace('selector_', '').replace('_', ' ').title()
-                    content += f"- {clean_name}: {page_data[key]} elements\n"
-
-            if page_data.get('potential_text_formatting'):
-                content += f"- 🎯 Text formatting candidates: {page_data['potential_text_formatting']}\n"
-            content += "\n"
-
-        content += "\n"
-
-    # Add comprehensive recommendations
-    content += """---
-
-## Natural PDF Integration Recommendations
-
-Based on this detailed analysis:
-
-```python
-import natural_pdf as npdf
-
-def process_document_optimally(pdf_path):
-    \"\"\"Optimized processing based on analysis findings\"\"\"
-    pdf = npdf.PDF(pdf_path)
-    results = []
-
-    for page_num, page in enumerate(pdf.pages, 1):
-        # Use discovered line detection capability
-        page.detect_lines(
-            resolution=144,
-            method="projection", # No OpenCV required
-            horizontal=True,
-            vertical=True,
-            peak_threshold_h=0.3,
-            peak_threshold_v=0.3
-        )
-
-        # Create table structure from detected lines
-        page.detect_table_structure_from_lines(
-            source_label="detected",
-            ignore_outer_regions=True,
-            cell_padding=0.5
-        )
-
-        # Extract using multiple methods
-        standard_table = page.extract_table()
-        line_based_tables = page.find_all('region[type="table"]')
-
-        results.append({
-            'page': page_num,
-            'standard_table': standard_table,
-            'line_based_tables': len(line_based_tables)
-        })
-
-    return results
-```
-
-"""
-
-    with open(output_path, 'w', encoding='utf-8') as f:
-        f.write(content)
-
-def main():
-    """Analyze final 10 PDF documents with detailed capability testing"""
-
-    # Select diverse documents focusing on different challenge types
-    documents_to_analyze = [
-        # Text formatting challenges
-        ("Y5G72LB_We are trying to get specific information such as ", "Y5G72LB.pdf", None),
-        ("Pd1KBb1_the data table _of election results_", "Pd1KBb1.pdf", None),
-        ("Pd9WVDb_We want a spreadsheet showing all the columns sepa", "Pd9WVDb.pdf", None),
-
-        # Complex table structures
-        ("eqQ4N7q_election results data table", "eqQ4N7q.pdf", None),
-        ("eqQ4NoQ_data table", "eqQ4NoQ.pdf", None),
-        ("ODXl8aR_0. ISO code of the business_ business name_ contac", "ODXl8aR.pdf", None),
-
-        # Multi-language and script challenges
-        ("1A4PPW1_The arabic text", "1A4PPW1.pdf", None),
-        ("lbODDK6_The text in Ethiopian.", "lbODDK6.pdf", None),
-
-        # Dense content and specialized formats
-        ("2EAOEvb_The text_ without beeing divided in 2 columns and ", "2EAOEvb.pdf", None),
-        ("OD49rjM_Just being able to make sense of any of it. It_s b", "OD49rjM.pdf", None),
-    ]
-
-    analysis_results = []
-
-    print(f"🚀 Starting detailed analysis of {len(documents_to_analyze)} documents...")
-    print(f"🔬 Testing discovered Natural PDF capabilities:")
-    print(f" - Line detection (projection profiling)")
-    print(f" - Table structure from lines")
-    print(f" - Advanced selectors")
-    print(f" - Character-level dense text detection")
-
-    for folder_name, pdf_filename, target_pages in documents_to_analyze:
-        pdf_path = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/{pdf_filename}"
-
-        if os.path.exists(pdf_path):
-            result = detailed_pdf_analysis(pdf_path, folder_name, target_pages)
-            if result:
-                analysis_results.append(result)
-        else:
-            print(f"❌ PDF not found: {pdf_path}")
-
-    print(f"\n{'='*80}")
-    print(f"✅ DETAILED ANALYSIS COMPLETE!")
-    print(f"📊 Processed {len(analysis_results)} documents")
-    print(f"🔬 Tested Natural PDF capabilities extensively")
-    print(f"{'='*80}")
-
-    return analysis_results
-
-if __name__ == "__main__":
-    main()