natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +44 -0
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +578 -27
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +118 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/ocr_manager.py +50 -0
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
- natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+"""
+Enhanced PDF analysis script that can target specific pages.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+import subprocess
+import re
+
+def parse_page_request(user_goal):
+    """Parse user requests for specific pages or page ranges"""
+    page_patterns = [
+        r'page (\d+)',
+        r'pages (\d+) to (\d+)',
+        r'pages (\d+)-(\d+)',
+        r'from page (\d+) to (\d+)',
+        r'spanning.*pages.*from page (\d+) to (\d+)',
+    ]
+
+    user_goal_lower = user_goal.lower()
+
+    for pattern in page_patterns:
+        match = re.search(pattern, user_goal_lower)
+        if match:
+            groups = match.groups()
+            if len(groups) == 1:
+                # Single page
+                return [int(groups[0])]
+            elif len(groups) == 2:
+                # Page range
+                start, end = int(groups[0]), int(groups[1])
+                return list(range(start, end + 1))
+
+    return None  # No specific pages found
+
+def run_pdf_analyzer_on_pages(pdf_path, pages_to_analyze, output_folder):
+    """Run PDF analyzer on specific pages"""
+    results = {}
+
+    for page_num in pages_to_analyze:
+        print(f"Analyzing page {page_num}...")
+
+        # Create page-specific output folder
+        page_output = os.path.join(output_folder, f"page_{page_num}")
+        os.makedirs(page_output, exist_ok=True)
+
+        # Run analyzer for specific page
+        cmd = [
+            "python", "-m", "natural_pdf.cli.pdf_analyzer",
+            pdf_path,
+            "1",  # Analyze 1 page starting from page_num
+            page_output,
+            "--no-timestamp",
+            f"--start-page={page_num}"
+        ]
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, cwd="/Users/soma/Development/natural-pdf")
+            if result.returncode == 0:
+                print(f"  ✅ Page {page_num} analysis completed")
+
+                # Read the analysis results
+                analysis_file = os.path.join(page_output, "analysis_summary.json")
+                if os.path.exists(analysis_file):
+                    with open(analysis_file, 'r') as f:
+                        page_analysis = json.load(f)
+                        results[page_num] = page_analysis
+                else:
+                    print(f"  ⚠️ No analysis file found for page {page_num}")
+            else:
+                print(f"  ❌ Page {page_num} analysis failed: {result.stderr}")
+
+        except Exception as e:
+            print(f"  ❌ Error analyzing page {page_num}: {e}")
+
+    return results
+
+def create_enhanced_analysis_report(submission_data, page_results, pdf_filename, folder_path):
+    """Create analysis report using results from specific pages"""
+
+    # Extract basic submission info
+    user_goal = submission_data.get('goal', 'Unknown goal')
+    pdf_description = submission_data.get('description', 'No description provided')
+    reported_issues = submission_data.get('issues', 'No issues reported')
+
+    # Parse requested pages
+    requested_pages = parse_page_request(user_goal)
+    pages_analyzed = list(page_results.keys()) if page_results else []
+
+    # Get document properties from first successful page analysis
+    doc_properties = {}
+    sample_page_data = {}
+    if page_results:
+        first_page_result = next(iter(page_results.values()))
+        if first_page_result.get('pages'):
+            sample_page_data = first_page_result['pages'][0]
+            doc_properties = {
+                'dimensions': sample_page_data.get('dimensions', {}),
+                'total_pages': first_page_result.get('total_pages', 'Unknown')
+            }
+
+    # Create the analysis report
+    report_content = f"""# PDF Analysis Report - {pdf_filename.replace('.pdf', '')}
+
+## Submission Details
+
+**PDF File:** {pdf_filename}
+**Language:** {submission_data.get('language', 'Unknown')}
+**Contains Handwriting:** {submission_data.get('handwriting', 'Unknown')}
+**Requires OCR:** {submission_data.get('ocr_required', 'Unknown')}
+
+### User's Goal
+{user_goal}
+
+### PDF Description
+{pdf_description}
+
+### Reported Issues
+{reported_issues}
+
+---
+
+## Technical Analysis
+
+### PDF Properties
+**Document Size:** {doc_properties.get('total_pages', 'Unknown')} pages
+**Page Dimensions:** {doc_properties.get('dimensions', {}).get('width', 'Unknown')} × {doc_properties.get('dimensions', {}).get('height', 'Unknown')} points
+**Pages Requested:** {requested_pages if requested_pages else 'Not specified'}
+**Pages Analyzed:** {pages_analyzed}
+
+### Analysis Results by Page
+"""
+
+    # Add results for each analyzed page
+    for page_num, page_data in page_results.items():
+        if page_data.get('pages'):
+            page_info = page_data['pages'][0]
+
+            report_content += f"""
+#### Page {page_num} Analysis
+
+**Elements Found:**
+- **Text elements:** {page_info.get('describe', '').count('text')}
+- **Table regions:** {page_info.get('analyze_layout', {}).get('count', 0)} layout regions detected
+- **Extract table:** {'✅ Success' if page_info.get('extract_table', {}).get('found') else '❌ No tables found'}
+
+**Content Preview:**
+```
+{page_info.get('extract_text', {}).get('preview', 'No text preview available')[:200]}...
+```
+
+**Visual Analysis:** Page image saved as `page_{page_num}.png`
+"""
+
+    # Add difficulty assessment based on actual page content
+    report_content += f"""
+---
+
+## Difficulty Assessment
+
+### Extraction Type
+**Primary Goal:** {determine_extraction_type(user_goal)}
+
+### Real Challenges Identified
+"""
+
+    # Analyze challenges based on actual page content
+    challenges = analyze_page_challenges(page_results, requested_pages, pages_analyzed)
+    for challenge in challenges:
+        report_content += f"\n{challenge}\n"
+
+    # Add recommendations based on actual content
+    report_content += """
+### What Natural PDF Can Do
+
+**✅ Recommended Approaches:**
+
+Based on the actual page content analyzed, here are specific Natural PDF approaches:
+
+"""
+
+    recommendations = generate_specific_recommendations(page_results, user_goal)
+    report_content += recommendations
+
+    # Add footer
+    report_content += f"""
+---
+
+## Feedback Section
+
+*Analysis based on actual page content from requested pages*
+
+### Assessment Accuracy
+- [x] Analysis examined user-requested pages
+- [ ] Difficulty assessment needs revision
+
+### Proposed Methods
+- [ ] Recommended approaches look good
+- [ ] Alternative approaches needed
+- [ ] Methods need refinement
+
+---
+
+**Analysis Generated:** Enhanced analysis targeting user-specified pages
+**Pages Analyzed:** {pages_analyzed}
+**Analysis Date:** {page_results[pages_analyzed[0]]['analysis_timestamp'] if pages_analyzed and page_results else 'Unknown'}
+"""
+
+    # Write the report
+    report_path = os.path.join(folder_path, f"{pdf_filename.replace('.pdf', '')}_analysis.md")
+    with open(report_path, 'w', encoding='utf-8') as f:
+        f.write(report_content)
+
+    print(f"✅ Enhanced analysis report created: {report_path}")
+    return report_path
+
+def determine_extraction_type(user_goal):
+    """Determine extraction type from user goal"""
+    goal_lower = user_goal.lower()
+    if 'table' in goal_lower:
+        return 'Table Extraction'
+    elif 'text' in goal_lower:
+        return 'Text Extraction'
+    elif 'form' in goal_lower:
+        return 'Form Data Extraction'
+    else:
+        return 'Data Extraction'
+
+def analyze_page_challenges(page_results, requested_pages, pages_analyzed):
+    """Analyze real challenges based on page content"""
+    challenges = []
+
+    # Check if we got the right pages
+    if requested_pages and set(requested_pages) != set(pages_analyzed):
+        missing_pages = set(requested_pages) - set(pages_analyzed)
+        challenges.append(f"""
+#### **Page Access Issues**
+**Missing pages:** {missing_pages} - Could not analyze all requested pages
+**Analyzed instead:** {pages_analyzed}
+**Impact:** Analysis may be incomplete without examining all target pages
+""")
+
+    # Analyze content complexity from actual results
+    for page_num, page_data in page_results.items():
+        if page_data.get('pages'):
+            page_info = page_data['pages'][0]
+
+            # Check for table extraction issues
+            if not page_info.get('extract_table', {}).get('found'):
+                challenges.append(f"""
+#### **Table Detection Issues (Page {page_num})**
+**Problem:** No tables detected on page {page_num}
+**Possible causes:** Complex layout, unruled tables, or non-standard table structure
+**Content type:** Based on text preview, this appears to be {analyze_content_type(page_info)}
+""")
+
+            # Check for text complexity
+            text_length = page_info.get('extract_text', {}).get('length', 0)
+            if text_length > 5000:
+                challenges.append(f"""
+#### **Dense Content (Page {page_num})**
+**Issue:** Large amount of text ({text_length} characters) may indicate complex layout
+**Challenge:** Dense content can complicate spatial navigation and element detection
+""")
+
+    return challenges
+
+def analyze_content_type(page_info):
+    """Analyze what type of content is on the page"""
+    text_preview = page_info.get('extract_text', {}).get('preview', '').lower()
+
+    if 'table' in text_preview or 'column' in text_preview:
+        return 'tabular data'
+    elif any(word in text_preview for word in ['report', 'study', 'analysis']):
+        return 'report content'
+    elif any(word in text_preview for word in ['form', 'application', 'field']):
+        return 'form data'
+    else:
+        return 'mixed content'
+
+def generate_specific_recommendations(page_results, user_goal):
+    """Generate specific recommendations based on actual page analysis"""
+    recommendations = """
+```python
+import natural_pdf as npdf
+
+def extract_from_target_pages(pdf_path, target_pages):
+    \"\"\"Extract data from user-specified pages\"\"\"
+    pdf = npdf.PDF(pdf_path)
+    results = []
+
+    for page_num in target_pages:
+        if page_num <= len(pdf.pages):
+            page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+            # Analyze layout for better structure detection
+            page.analyze_layout('tatr', existing='append')
+
+            # Try multiple extraction approaches
+            table_data = page.extract_table()
+            if table_data:
+                results.append({'page': page_num, 'type': 'table', 'data': table_data})
+            else:
+                # Fall back to text extraction with spatial awareness
+                text_elements = page.find_all('text')
+                results.append({'page': page_num, 'type': 'text', 'elements': text_elements})
+
+    return results
+
+# Usage for your specific case
+"""
+
+    # Add specific usage based on the document
+    if 'page' in user_goal.lower():
+        page_match = re.search(r'page (\d+)', user_goal.lower())
+        if page_match:
+            page_num = page_match.group(1)
+            recommendations += f"""
+# Target the specific page mentioned
+results = extract_from_target_pages('document.pdf', [{page_num}])
+```
+"""
+    elif 'pages' in user_goal.lower():
+        pages_match = re.search(r'pages (\d+) to (\d+)', user_goal.lower())
+        if pages_match:
+            start, end = pages_match.groups()
+            recommendations += f"""
+# Target the page range mentioned
+results = extract_from_target_pages('document.pdf', list(range({start}, {end} + 1)))
+```
+"""
+
+    return recommendations
+
+def main():
+    """Re-analyze specific documents with page targeting"""
+
+    # Documents that need re-analysis with specific pages
+    documents_to_reanalyze = [
+        {
+            'folder': 'ODX1DW8_The large table on page 179',
+            'file': 'ODX1DW8.pdf',
+            'pages': [178, 179, 180],  # Page 179 ± 1 for safety
+            'reason': 'User requested page 179, original analysis used page 1'
+        },
+        {
+            'folder': 'eqrZ5yq_The long table _Annex 6_ spanning across pages fro',
+            'file': 'eqrZ5yq.pdf',
+            'pages': [89, 90, 91, 92],  # Multi-page table range
+            'reason': 'User requested pages 89-92, original analysis used page 1'
+        }
+    ]
+
+    base_path = "/Users/soma/Development/natural-pdf/bad_pdf_analysis"
+
+    for doc in documents_to_reanalyze:
+        print(f"\n🔄 Re-analyzing {doc['file']} - {doc['reason']}")
+
+        folder_path = os.path.join(base_path, doc['folder'])
+        pdf_path = os.path.join(folder_path, doc['file'])
+        output_folder = os.path.join(folder_path, 'analysis', 'specific_pages')
+
+        if not os.path.exists(pdf_path):
+            print(f"❌ PDF not found: {pdf_path}")
+            continue
+
+        # Create output folder
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Run analysis on specific pages
+        page_results = run_pdf_analyzer_on_pages(pdf_path, doc['pages'], output_folder)
+
+        if page_results:
+            # Create enhanced analysis report
+            submission_data = {
+                'goal': f"Analysis targeting pages {doc['pages']}",
+                'description': f"Re-analysis of {doc['file']} focusing on user-requested pages",
+                'issues': doc['reason']
+            }
+
+            create_enhanced_analysis_report(
+                submission_data,
+                page_results,
+                doc['file'],
+                folder_path
+            )
+        else:
+            print(f"❌ No results obtained for {doc['file']}")
+
+if __name__ == "__main__":
+    main()
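For orientation, here is a minimal sketch of what the new `parse_page_request` helper in `bad_pdf_analysis/analyze_specific_pages.py` returns for a few goal strings. The goal strings below are hypothetical examples, and the import assumes the snippet is run from inside the `bad_pdf_analysis/` directory:

```python
# Hypothetical usage sketch of the page-targeting helper added above.
from analyze_specific_pages import parse_page_request

print(parse_page_request("The large table on page 179"))             # [179]
print(parse_page_request("The long table spanning pages 89 to 92"))  # [89, 90, 91, 92]
print(parse_page_request("Summarize the executive overview"))        # None (no page hint)
```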