natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
bad_pdf_analysis/analyze_specific_pages.py
@@ -0,0 +1,394 @@
+ #!/usr/bin/env python3
+ """
+ Enhanced PDF analysis script that can target specific pages.
+ """
+
+ import json
+ import os
+ import sys
+ from pathlib import Path
+ import subprocess
+ import re
+
+ def parse_page_request(user_goal):
+     """Parse user requests for specific pages or page ranges"""
+     # Range patterns are tried before the single-page pattern so that a goal
+     # like "from page 89 to 92" is not truncated to the single page 89.
+     page_patterns = [
+         r'pages (\d+) to (\d+)',
+         r'pages (\d+)-(\d+)',
+         r'from page (\d+) to (\d+)',
+         r'spanning.*pages.*from page (\d+) to (\d+)',
+         r'page (\d+)',
+     ]
+
+     user_goal_lower = user_goal.lower()
+
+     for pattern in page_patterns:
+         match = re.search(pattern, user_goal_lower)
+         if match:
+             groups = match.groups()
+             if len(groups) == 1:
+                 # Single page
+                 return [int(groups[0])]
+             elif len(groups) == 2:
+                 # Inclusive page range
+                 start, end = int(groups[0]), int(groups[1])
+                 return list(range(start, end + 1))
+
+     return None  # No specific pages found
+
+ def run_pdf_analyzer_on_pages(pdf_path, pages_to_analyze, output_folder):
+     """Run the PDF analyzer CLI on specific pages"""
+     results = {}
+
+     for page_num in pages_to_analyze:
+         print(f"Analyzing page {page_num}...")
+
+         # Create a page-specific output folder
+         page_output = os.path.join(output_folder, f"page_{page_num}")
+         os.makedirs(page_output, exist_ok=True)
+
+         # Run the analyzer for one page starting at page_num; sys.executable
+         # keeps the subprocess on the same interpreter as this script
+         cmd = [
+             sys.executable, "-m", "natural_pdf.cli.pdf_analyzer",
+             pdf_path,
+             "1",  # Analyze 1 page starting from page_num
+             page_output,
+             "--no-timestamp",
+             f"--start-page={page_num}",
+         ]
+
+         try:
+             result = subprocess.run(
+                 cmd, capture_output=True, text=True,
+                 cwd="/Users/soma/Development/natural-pdf",
+             )
+             if result.returncode == 0:
+                 print(f"  ✅ Page {page_num} analysis completed")
+
+                 # Read the analysis results
+                 analysis_file = os.path.join(page_output, "analysis_summary.json")
+                 if os.path.exists(analysis_file):
+                     with open(analysis_file, 'r') as f:
+                         page_analysis = json.load(f)
+                     results[page_num] = page_analysis
+                 else:
+                     print(f"  ⚠️ No analysis file found for page {page_num}")
+             else:
+                 print(f"  ❌ Page {page_num} analysis failed: {result.stderr}")
+
+         except Exception as e:
+             print(f"  ❌ Error analyzing page {page_num}: {e}")
+
+     return results
+
+ def create_enhanced_analysis_report(submission_data, page_results, pdf_filename, folder_path):
+     """Create an analysis report using results from specific pages"""
+
+     # Extract basic submission info
+     user_goal = submission_data.get('goal', 'Unknown goal')
+     pdf_description = submission_data.get('description', 'No description provided')
+     reported_issues = submission_data.get('issues', 'No issues reported')
+
+     # Parse requested pages
+     requested_pages = parse_page_request(user_goal)
+     pages_analyzed = list(page_results.keys()) if page_results else []
+
+     # Get document properties from the first successful page analysis
+     doc_properties = {}
+     sample_page_data = {}
+     if page_results:
+         first_page_result = next(iter(page_results.values()))
+         if first_page_result.get('pages'):
+             sample_page_data = first_page_result['pages'][0]
+             doc_properties = {
+                 'dimensions': sample_page_data.get('dimensions', {}),
+                 'total_pages': first_page_result.get('total_pages', 'Unknown'),
+             }
+
+     # Create the analysis report
+     report_content = f"""# PDF Analysis Report - {pdf_filename.replace('.pdf', '')}
+
+ ## Submission Details
+
+ **PDF File:** {pdf_filename}
+ **Language:** {submission_data.get('language', 'Unknown')}
+ **Contains Handwriting:** {submission_data.get('handwriting', 'Unknown')}
+ **Requires OCR:** {submission_data.get('ocr_required', 'Unknown')}
+
+ ### User's Goal
+ {user_goal}
+
+ ### PDF Description
+ {pdf_description}
+
+ ### Reported Issues
+ {reported_issues}
+
+ ---
+
+ ## Technical Analysis
+
+ ### PDF Properties
+ **Document Size:** {doc_properties.get('total_pages', 'Unknown')} pages
+ **Page Dimensions:** {doc_properties.get('dimensions', {}).get('width', 'Unknown')} × {doc_properties.get('dimensions', {}).get('height', 'Unknown')} points
+ **Pages Requested:** {requested_pages if requested_pages else 'Not specified'}
+ **Pages Analyzed:** {pages_analyzed}
+
+ ### Analysis Results by Page
+ """
+
+     # Add results for each analyzed page
+     for page_num, page_data in page_results.items():
+         if page_data.get('pages'):
+             page_info = page_data['pages'][0]
+
+             report_content += f"""
+ #### Page {page_num} Analysis
+
+ **Elements Found:**
+ - **Text elements:** {page_info.get('describe', '').count('text')}
+ - **Table regions:** {page_info.get('analyze_layout', {}).get('count', 0)} layout regions detected
+ - **Extract table:** {'✅ Success' if page_info.get('extract_table', {}).get('found') else '❌ No tables found'}
+
+ **Content Preview:**
+ ```
+ {page_info.get('extract_text', {}).get('preview', 'No text preview available')[:200]}...
+ ```
+
+ **Visual Analysis:** Page image saved as `page_{page_num}.png`
+ """
+
+     # Add a difficulty assessment based on actual page content
+     report_content += f"""
+ ---
+
+ ## Difficulty Assessment
+
+ ### Extraction Type
+ **Primary Goal:** {determine_extraction_type(user_goal)}
+
+ ### Real Challenges Identified
+ """
+
+     # Analyze challenges based on actual page content
+     challenges = analyze_page_challenges(page_results, requested_pages, pages_analyzed)
+     for challenge in challenges:
+         report_content += f"\n{challenge}\n"
+
+     # Add recommendations based on actual content
+     report_content += """
+ ### What Natural PDF Can Do
+
+ **✅ Recommended Approaches:**
+
+ Based on the actual page content analyzed, here are specific Natural PDF approaches:
+
+ """
+
+     recommendations = generate_specific_recommendations(page_results, user_goal)
+     report_content += recommendations
+
+     # Add the footer; .get() avoids a KeyError when a page result has no
+     # 'analysis_timestamp' field
+     report_content += f"""
+ ---
+
+ ## Feedback Section
+
+ *Analysis based on actual page content from requested pages*
+
+ ### Assessment Accuracy
+ - [x] Analysis examined user-requested pages
+ - [ ] Difficulty assessment needs revision
+
+ ### Proposed Methods
+ - [ ] Recommended approaches look good
+ - [ ] Alternative approaches needed
+ - [ ] Methods need refinement
+
+ ---
+
+ **Analysis Generated:** Enhanced analysis targeting user-specified pages
+ **Pages Analyzed:** {pages_analyzed}
+ **Analysis Date:** {page_results[pages_analyzed[0]].get('analysis_timestamp', 'Unknown') if pages_analyzed else 'Unknown'}
+ """
+
+     # Write the report
+     report_path = os.path.join(folder_path, f"{pdf_filename.replace('.pdf', '')}_analysis.md")
+     with open(report_path, 'w', encoding='utf-8') as f:
+         f.write(report_content)
+
+     print(f"✅ Enhanced analysis report created: {report_path}")
+     return report_path
+
+ def determine_extraction_type(user_goal):
+     """Determine extraction type from user goal"""
+     goal_lower = user_goal.lower()
+     if 'table' in goal_lower:
+         return 'Table Extraction'
+     elif 'text' in goal_lower:
+         return 'Text Extraction'
+     elif 'form' in goal_lower:
+         return 'Form Data Extraction'
+     else:
+         return 'Data Extraction'
+
+ def analyze_page_challenges(page_results, requested_pages, pages_analyzed):
+     """Analyze real challenges based on page content"""
+     challenges = []
+
+     # Check if we got the right pages
+     if requested_pages and set(requested_pages) != set(pages_analyzed):
+         missing_pages = set(requested_pages) - set(pages_analyzed)
+         challenges.append(f"""
+ #### **Page Access Issues**
+ **Missing pages:** {missing_pages} - Could not analyze all requested pages
+ **Analyzed instead:** {pages_analyzed}
+ **Impact:** Analysis may be incomplete without examining all target pages
+ """)
+
+     # Analyze content complexity from actual results
+     for page_num, page_data in page_results.items():
+         if page_data.get('pages'):
+             page_info = page_data['pages'][0]
+
+             # Check for table extraction issues
+             if not page_info.get('extract_table', {}).get('found'):
+                 challenges.append(f"""
+ #### **Table Detection Issues (Page {page_num})**
+ **Problem:** No tables detected on page {page_num}
+ **Possible causes:** Complex layout, unruled tables, or non-standard table structure
+ **Content type:** Based on text preview, this appears to be {analyze_content_type(page_info)}
+ """)
+
+             # Check for text complexity
+             text_length = page_info.get('extract_text', {}).get('length', 0)
+             if text_length > 5000:
+                 challenges.append(f"""
+ #### **Dense Content (Page {page_num})**
+ **Issue:** Large amount of text ({text_length} characters) may indicate complex layout
+ **Challenge:** Dense content can complicate spatial navigation and element detection
+ """)
+
+     return challenges
+
+ def analyze_content_type(page_info):
+     """Analyze what type of content is on the page"""
+     text_preview = page_info.get('extract_text', {}).get('preview', '').lower()
+
+     if 'table' in text_preview or 'column' in text_preview:
+         return 'tabular data'
+     elif any(word in text_preview for word in ['report', 'study', 'analysis']):
+         return 'report content'
+     elif any(word in text_preview for word in ['form', 'application', 'field']):
+         return 'form data'
+     else:
+         return 'mixed content'
+
+ def generate_specific_recommendations(page_results, user_goal):
+     """Generate specific recommendations based on actual page analysis"""
+     recommendations = """
+ ```python
+ import natural_pdf as npdf
+
+ def extract_from_target_pages(pdf_path, target_pages):
+     \"\"\"Extract data from user-specified pages\"\"\"
+     pdf = npdf.PDF(pdf_path)
+     results = []
+
+     for page_num in target_pages:
+         if page_num <= len(pdf.pages):
+             page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+             # Analyze layout for better structure detection
+             page.analyze_layout('tatr', existing='append')
+
+             # Try multiple extraction approaches
+             table_data = page.extract_table()
+             if table_data:
+                 results.append({'page': page_num, 'type': 'table', 'data': table_data})
+             else:
+                 # Fall back to text extraction with spatial awareness
+                 text_elements = page.find_all('text')
+                 results.append({'page': page_num, 'type': 'text', 'elements': text_elements})
+
+     return results
+
+ # Usage for your specific case
+ """
+
+     # Add specific usage based on the document. The range check runs first:
+     # goals like "pages 89 to 92" also contain the substring 'page', so a
+     # single-page check done first would shadow the range case and leave the
+     # code fence above unclosed.
+     goal_lower = user_goal.lower()
+     pages_match = re.search(r'(?:pages|from page) (\d+)(?: to |-)(\d+)', goal_lower)
+     page_match = re.search(r'page (\d+)', goal_lower)
+     if pages_match:
+         start, end = pages_match.groups()
+         recommendations += f"""
+ # Target the page range mentioned
+ results = extract_from_target_pages('document.pdf', list(range({start}, {end} + 1)))
+ ```
+ """
+     elif page_match:
+         page_num = page_match.group(1)
+         recommendations += f"""
+ # Target the specific page mentioned
+ results = extract_from_target_pages('document.pdf', [{page_num}])
+ ```
+ """
+     else:
+         recommendations += "```\n"  # Always close the opening code fence
+
+     return recommendations
+
+ def main():
+     """Re-analyze specific documents with page targeting"""
+
+     # Documents that need re-analysis with specific pages
+     documents_to_reanalyze = [
+         {
+             'folder': 'ODX1DW8_The large table on page 179',
+             'file': 'ODX1DW8.pdf',
+             'pages': [178, 179, 180],  # Page 179 ± 1 for safety
+             'reason': 'User requested page 179, original analysis used page 1',
+         },
+         {
+             'folder': 'eqrZ5yq_The long table _Annex 6_ spanning across pages fro',
+             'file': 'eqrZ5yq.pdf',
+             'pages': [89, 90, 91, 92],  # Multi-page table range
+             'reason': 'User requested pages 89-92, original analysis used page 1',
+         },
+     ]
+
+     base_path = "/Users/soma/Development/natural-pdf/bad_pdf_analysis"
+
+     for doc in documents_to_reanalyze:
+         print(f"\n🔄 Re-analyzing {doc['file']} - {doc['reason']}")
+
+         folder_path = os.path.join(base_path, doc['folder'])
+         pdf_path = os.path.join(folder_path, doc['file'])
+         output_folder = os.path.join(folder_path, 'analysis', 'specific_pages')
+
+         if not os.path.exists(pdf_path):
+             print(f"❌ PDF not found: {pdf_path}")
+             continue
+
+         # Create the output folder
+         os.makedirs(output_folder, exist_ok=True)
+
+         # Run analysis on the specific pages
+         page_results = run_pdf_analyzer_on_pages(pdf_path, doc['pages'], output_folder)
+
+         if page_results:
+             # Create the enhanced analysis report
+             submission_data = {
+                 'goal': f"Analysis targeting pages {doc['pages']}",
+                 'description': f"Re-analysis of {doc['file']} focusing on user-requested pages",
+                 'issues': doc['reason'],
+             }
+
+             create_enhanced_analysis_report(
+                 submission_data,
+                 page_results,
+                 doc['file'],
+                 folder_path,
+             )
+         else:
+             print(f"❌ No results obtained for {doc['file']}")
+
+ if __name__ == "__main__":
+     main()
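
A minimal sketch of how `parse_page_request` resolves the two goals handled in `main()`, assuming the new `analyze_specific_pages.py` shown above is importable (for example, when run from the `bad_pdf_analysis/` directory):

```python
# Minimal sketch, assuming analyze_specific_pages.py is on sys.path
# (e.g. the working directory is bad_pdf_analysis/).
from analyze_specific_pages import parse_page_request

# Single page: matches r'page (\d+)'
assert parse_page_request("The large table on page 179") == [179]

# Inclusive range: matches r'from page (\d+) to (\d+)'
assert parse_page_request(
    "The long table (Annex 6) spanning across pages from page 89 to 92"
) == [89, 90, 91, 92]

# No page hint at all: the function falls through and returns None
assert parse_page_request("Extract all tables") is None
```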
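For comparison, `run_pdf_analyzer_on_pages` shells out to `natural_pdf.cli.pdf_analyzer` and re-reads `analysis_summary.json` for each page. A hedged in-process sketch of the same per-page check, using only the natural-pdf calls that already appear in this file's embedded recommendation template (`npdf.PDF`, `pdf.pages`, `page.extract_table`); it makes no attempt to reproduce the CLI's JSON summary:

```python
# In-process variant, assuming only the natural-pdf API used elsewhere in
# this file; it reports whether extract_table() finds anything per page.
import natural_pdf as npdf

def quick_table_check(pdf_path, page_numbers):
    """Map each requested 1-based page number to True if a table is found."""
    pdf = npdf.PDF(pdf_path)
    found = {}
    for page_num in page_numbers:
        if page_num <= len(pdf.pages):
            page = pdf.pages[page_num - 1]  # convert to 0-based index
            found[page_num] = bool(page.extract_table())
    return found

# e.g. quick_table_check("ODX1DW8.pdf", [178, 179, 180])
```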