natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +99 -40
- natural_pdf/core/page.py +76 -3
- natural_pdf/core/pdf.py +38 -3
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +270 -14
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/RECORD +14 -18
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt +0 -0
@@ -1,300 +0,0 @@
-#!/usr/bin/env python3
-"""
-Analyze 10 more PDF documents from the bad PDF collection
-"""
-
-import os
-import sys
-import json
-from datetime import datetime
-import natural_pdf as npdf
-
-# Add the project root to the path
-sys.path.append('/Users/soma/Development/natural-pdf')
-
-def analyze_pdf_document(pdf_path, document_name, target_pages=None):
-    """Analyze a specific PDF document with enhanced reporting"""
-    print(f"\n{'='*80}")
-    print(f"🔍 Analyzing {document_name}")
-    print(f"📁 Path: {pdf_path}")
-    if target_pages:
-        print(f"📍 Target pages: {target_pages}")
-    print(f"{'='*80}")
-
-    try:
-        pdf = npdf.PDF(pdf_path)
-        total_pages = len(pdf.pages)
-        print(f"📄 Total pages in document: {total_pages}")
-
-        # Determine which pages to analyze
-        if target_pages:
-            pages_to_analyze = [p for p in target_pages if p <= total_pages]
-            if len(pages_to_analyze) != len(target_pages):
-                print(f"⚠️ Some target pages exceed document length, analyzing: {pages_to_analyze}")
-        else:
-            # Default to first page if no specific pages requested
-            pages_to_analyze = [1] if total_pages > 0 else []
-
-        results = {
-            'document': document_name,
-            'total_pages': total_pages,
-            'analyzed_pages': pages_to_analyze,
-            'analysis_date': datetime.now().isoformat(),
-            'pages': {}
-        }
-
-        for page_num in pages_to_analyze:
-            print(f"\n📄 Analyzing page {page_num}...")
-            page = pdf.pages[page_num - 1]  # Convert to 0-based index
-
-            page_results = {
-                'page_number': page_num,
-                'dimensions': f"{page.width} × {page.height} points"
-            }
-
-            # Extract text
-            try:
-                text_content = page.extract_text()
-                page_results['text_length'] = len(text_content)
-                page_results['text_preview'] = text_content[:200] + "..." if len(text_content) > 200 else text_content
-                print(f"✅ Text extraction: {len(text_content)} characters")
-            except Exception as e:
-                page_results['text_error'] = str(e)
-                print(f"❌ Text extraction failed: {e}")
-
-            # Try table extraction
-            try:
-                table_data = page.extract_table()
-                if table_data and len(table_data) > 0:
-                    rows = len(table_data)
-                    cols = max(len(row) for row in table_data) if table_data else 0
-                    page_results['table'] = f"{rows} rows × {cols} columns"
-                    page_results['table_sample'] = table_data[:3] if len(table_data) >= 3 else table_data
-                    print(f"✅ Table found: {rows} rows × {cols} columns")
-                else:
-                    page_results['table'] = "No table detected"
-                    print("ℹ️ No table detected")
-            except Exception as e:
-                page_results['table_error'] = str(e)
-                print(f"❌ Table extraction failed: {e}")
-
-            # Layout analysis with YOLO
-            try:
-                page.analyze_layout('yolo')
-                yolo_regions = page.find_all('region')
-                page_results['yolo_regions'] = len(yolo_regions)
-                print(f"✅ YOLO layout analysis: {len(yolo_regions)} regions")
-            except Exception as e:
-                page_results['yolo_error'] = str(e)
-                print(f"❌ YOLO analysis failed: {e}")
-
-            # Layout analysis with TATR (table-specific)
-            try:
-                page.analyze_layout('tatr', existing='append')
-                tatr_regions = page.find_all('region[type="table"]')
-                page_results['tatr_regions'] = len(tatr_regions)
-                print(f"✅ TATR analysis: {len(tatr_regions)} table regions")
-            except Exception as e:
-                page_results['tatr_error'] = str(e)
-                print(f"❌ TATR analysis failed: {e}")
-
-            # Save page image
-            try:
-                folder_name = document_name.replace('/', '_').replace('\\', '_')
-                analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/enhanced_analysis_10"
-                os.makedirs(analysis_dir, exist_ok=True)
-
-                image_path = f"{analysis_dir}/page_{page_num}.png"
-                page_image = page.to_image(resolution=144)
-                page_image.save(image_path)
-                page_results['image_saved'] = image_path
-                print(f"✅ Page image saved: page_{page_num}.png")
-            except Exception as e:
-                page_results['image_error'] = str(e)
-                print(f"❌ Image save failed: {e}")
-
-            results['pages'][page_num] = page_results
-
-        # Generate analysis summary
-        analysis_insights = generate_analysis_insights(results)
-        results['insights'] = analysis_insights
-
-        # Save results to JSON
-        try:
-            folder_name = document_name.replace('/', '_').replace('\\', '_')
-            analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/enhanced_analysis_10"
-            os.makedirs(analysis_dir, exist_ok=True)
-
-            results_path = f"{analysis_dir}/analysis_results.json"
-            with open(results_path, 'w', encoding='utf-8') as f:
-                json.dump(results, f, indent=2, ensure_ascii=False)
-            print(f"✅ Analysis results saved: {results_path}")
-
-            # Generate markdown report
-            markdown_path = f"{analysis_dir}/{document_name}_enhanced_analysis.md"
-            generate_markdown_report(results, markdown_path)
-            print(f"✅ Markdown report saved: {markdown_path}")
-
-        except Exception as e:
-            print(f"❌ Failed to save results: {e}")
-
-        return results
-
-    except Exception as e:
-        print(f"❌ Failed to analyze {document_name}: {e}")
-        return None
-
-def generate_analysis_insights(results):
-    """Generate insights based on analysis results"""
-    insights = []
-
-    total_chars = sum(page.get('text_length', 0) for page in results['pages'].values())
-    table_pages = sum(1 for page in results['pages'].values() if 'table' in page and 'rows' in page['table'])
-
-    if total_chars > 0:
-        insights.append(f"Document contains {total_chars} total characters across {len(results['pages'])} analyzed pages")
-
-    if table_pages > 0:
-        insights.append(f"{table_pages} out of {len(results['pages'])} pages contain detectable tables")
-
-    # Check for layout complexity
-    avg_regions = sum(page.get('yolo_regions', 0) for page in results['pages'].values()) / len(results['pages'])
-    if avg_regions > 5:
-        insights.append(f"Complex layout detected - average {avg_regions:.1f} regions per page")
-
-    # Check for table structure complexity
-    tatr_regions = sum(page.get('tatr_regions', 0) for page in results['pages'].values())
-    if tatr_regions > 50:
-        insights.append(f"High table complexity - {tatr_regions} TATR table regions detected")
-
-    return insights
-
-def generate_markdown_report(results, output_path):
-    """Generate a detailed markdown report"""
-
-    content = f"""# Enhanced PDF Analysis Report - {results['document']}
-
-## Analysis Overview
-
-**Document:** {results['document']}
-**Total Pages:** {results['total_pages']}
-**Analyzed Pages:** {results['analyzed_pages']}
-**Analysis Date:** {results['analysis_date']}
-
----
-
-## Key Insights
-
-"""
-
-    for insight in results.get('insights', []):
-        content += f"- {insight}\n"
-
-    content += "\n---\n\n## Page-by-Page Analysis\n\n"
-
-    for page_num, page_data in results['pages'].items():
-        content += f"### Page {page_num}\n\n"
-        content += f"**Dimensions:** {page_data.get('dimensions', 'Unknown')}\n\n"
-
-        if 'text_length' in page_data:
-            content += f"**Text Content:** {page_data['text_length']} characters\n"
-        if 'text_preview' in page_data:
-            content += f"**Preview:** {page_data['text_preview'][:100]}...\n\n"
-
-        if 'table' in page_data:
-            content += f"**Table Detection:** {page_data['table']}\n"
-        if 'table_sample' in page_data and page_data['table_sample']:
-            content += f"**Sample Data:** First few rows: {page_data['table_sample'][:2]}\n\n"
-
-        if 'yolo_regions' in page_data:
-            content += f"**Layout Regions (YOLO):** {page_data['yolo_regions']}\n"
-
-        if 'tatr_regions' in page_data:
-            content += f"**Table Regions (TATR):** {page_data['tatr_regions']}\n"
-
-        content += "\n"
-
-    content += """
----
-
-## Natural PDF Extraction Recommendations
-
-Based on this analysis, here are the recommended approaches:
-
-```python
-import natural_pdf as npdf
-
-def extract_document_data(pdf_path):
-    pdf = npdf.PDF(pdf_path)
-    results = []
-
-    for page_num, page in enumerate(pdf.pages, 1):
-        # Use layout analysis for structure detection
-        page.analyze_layout('tatr', existing='append')
-
-        # Extract tables if present
-        table_data = page.extract_table()
-        if table_data:
-            results.append({
-                'page': page_num,
-                'type': 'table',
-                'data': table_data
-            })
-
-        # Extract text content
-        text_content = page.extract_text()
-        if text_content:
-            results.append({
-                'page': page_num,
-                'type': 'text',
-                'content': text_content
-            })
-
-    return results
-```
-
-"""
-
-    with open(output_path, 'w', encoding='utf-8') as f:
-        f.write(content)
-
-def main():
-    """Analyze 10 more PDF documents"""
-
-    # List of documents to analyze with specific pages if needed
-    documents_to_analyze = [
-        # Documents with specific page requests
-        ("GxpvezO_The table in Nepali on page 30 _in between the tex", "GxpvezO.pdf", [30]),
-        ("J9lKd7Y_Table in Slovenian _e.g. on page 80_.", "J9lKd7Y.pdf", [80]),
-        ("b5eVqGg_Math formulas in Russian _e.g. on page 181__", "b5eVqGg.pdf", [181]),
-        ("lbODqev_Large wide tables in Serbian _from page 63 and on_", "lbODqev.pdf", [63, 64, 65]),
-        ("obR6Dxb_Large table that spans across pages in Serbian _e.", "obR6Dxb.pdf", [1, 2, 3]),
-        ("ober4db_The graph and table on page 180 and 181", "ober4db.pdf", [180, 181]),
-        ("oberryX_The survery question table_ such as the one on pag", "oberryX.pdf", [1]),  # Need to find specific page
-        ("eqrZZbq_The categorize chart _E1_ on page 4_ The chart_tab", "eqrZZbq.pdf", [4]),
-
-        # Documents with general analysis needs
-        ("NplKG2O_Try to see if natural-pdf can process non-standard", "NplKG2O.pdf", None),
-        ("obe1Vq5_MARKED UP text -- underline and strikethu__for bon", "obe1Vq5.pdf", None),
-    ]
-
-    analysis_results = []
-
-    for folder_name, pdf_filename, target_pages in documents_to_analyze:
-        pdf_path = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/{pdf_filename}"
-
-        if os.path.exists(pdf_path):
-            result = analyze_pdf_document(pdf_path, folder_name, target_pages)
-            if result:
-                analysis_results.append(result)
-        else:
-            print(f"❌ PDF not found: {pdf_path}")
-
-    print(f"\n{'='*80}")
-    print(f"✅ Analysis complete! Processed {len(analysis_results)} documents")
-    print(f"{'='*80}")
-
-    return analysis_results
-
-if __name__ == "__main__":
-    main()