natural-pdf 0.1.28-py3-none-any.whl → 0.1.30-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
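Alongside the packaging changes, this release adds a `tables` subpackage (`natural_pdf/tables/result.py`), RTL text support (`natural_pdf/utils/bidi_mirror.py`, `tools/rtl_smoke_test.py`), and a large set of analysis and evaluation scripts. Those scripts all drive the same page-level API; as a quick orientation before the first file's diff, this is the core call pattern they share (a minimal sketch assembled only from calls that appear in this diff; the path is a placeholder):

```python
import natural_pdf as npdf

pdf = npdf.PDF("document.pdf")  # placeholder path
page = pdf.pages[0]

text = page.extract_text()                      # plain text content
table = page.extract_table()                    # rows as lists, or None
page.analyze_layout('yolo')                     # general layout regions
page.analyze_layout('tatr', existing='append')  # add table-structure regions
tables = page.find_all('region[type="table"]')  # query detected regions
image = page.to_image(resolution=144)           # render page as an image
```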
bad_pdf_analysis/analyze_10_more.py
@@ -0,0 +1,300 @@
+ #!/usr/bin/env python3
+ """
+ Analyze 10 more PDF documents from the bad PDF collection
+ """
+
+ import os
+ import sys
+ import json
+ from datetime import datetime
+
+ # Add the project root to the path before importing the local package
+ sys.path.append('/Users/soma/Development/natural-pdf')
+ import natural_pdf as npdf
+
+ def analyze_pdf_document(pdf_path, document_name, target_pages=None):
+     """Analyze a specific PDF document with enhanced reporting"""
+     print(f"\n{'='*80}")
+     print(f"🔍 Analyzing {document_name}")
+     print(f"📁 Path: {pdf_path}")
+     if target_pages:
+         print(f"📍 Target pages: {target_pages}")
+     print(f"{'='*80}")
+
+     try:
+         pdf = npdf.PDF(pdf_path)
+         total_pages = len(pdf.pages)
+         print(f"📄 Total pages in document: {total_pages}")
+
+         # Determine which pages to analyze
+         if target_pages:
+             pages_to_analyze = [p for p in target_pages if p <= total_pages]
+             if len(pages_to_analyze) != len(target_pages):
+                 print(f"⚠️ Some target pages exceed document length, analyzing: {pages_to_analyze}")
+         else:
+             # Default to first page if no specific pages requested
+             pages_to_analyze = [1] if total_pages > 0 else []
+
+         results = {
+             'document': document_name,
+             'total_pages': total_pages,
+             'analyzed_pages': pages_to_analyze,
+             'analysis_date': datetime.now().isoformat(),
+             'pages': {}
+         }
+
+         for page_num in pages_to_analyze:
+             print(f"\n📄 Analyzing page {page_num}...")
+             page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+             page_results = {
+                 'page_number': page_num,
+                 'dimensions': f"{page.width} × {page.height} points"
+             }
+
+             # Extract text
+             try:
+                 text_content = page.extract_text()
+                 page_results['text_length'] = len(text_content)
+                 page_results['text_preview'] = text_content[:200] + "..." if len(text_content) > 200 else text_content
+                 print(f"✅ Text extraction: {len(text_content)} characters")
+             except Exception as e:
+                 page_results['text_error'] = str(e)
+                 print(f"❌ Text extraction failed: {e}")
+
+             # Try table extraction
+             try:
+                 table_data = page.extract_table()
+                 if table_data and len(table_data) > 0:
+                     rows = len(table_data)
+                     cols = max(len(row) for row in table_data) if table_data else 0
+                     page_results['table'] = f"{rows} rows × {cols} columns"
+                     page_results['table_sample'] = table_data[:3] if len(table_data) >= 3 else table_data
+                     print(f"✅ Table found: {rows} rows × {cols} columns")
+                 else:
+                     page_results['table'] = "No table detected"
+                     print("ℹ️ No table detected")
+             except Exception as e:
+                 page_results['table_error'] = str(e)
+                 print(f"❌ Table extraction failed: {e}")
+
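+             # Two-pass layout analysis: YOLO first for general regions, then
+             # TATR with existing='append' so the table-structure regions are
+             # added alongside the YOLO results rather than replacing them.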
+             # Layout analysis with YOLO
+             try:
+                 page.analyze_layout('yolo')
+                 yolo_regions = page.find_all('region')
+                 page_results['yolo_regions'] = len(yolo_regions)
+                 print(f"✅ YOLO layout analysis: {len(yolo_regions)} regions")
+             except Exception as e:
+                 page_results['yolo_error'] = str(e)
+                 print(f"❌ YOLO analysis failed: {e}")
+
+             # Layout analysis with TATR (table-specific)
+             try:
+                 page.analyze_layout('tatr', existing='append')
+                 tatr_regions = page.find_all('region[type="table"]')
+                 page_results['tatr_regions'] = len(tatr_regions)
+                 print(f"✅ TATR analysis: {len(tatr_regions)} table regions")
+             except Exception as e:
+                 page_results['tatr_error'] = str(e)
+                 print(f"❌ TATR analysis failed: {e}")
+
+             # Save page image
+             try:
+                 folder_name = document_name.replace('/', '_').replace('\\', '_')
+                 analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/enhanced_analysis_10"
+                 os.makedirs(analysis_dir, exist_ok=True)
+
+                 image_path = f"{analysis_dir}/page_{page_num}.png"
+                 page_image = page.to_image(resolution=144)
+                 page_image.save(image_path)
+                 page_results['image_saved'] = image_path
+                 print(f"✅ Page image saved: page_{page_num}.png")
+             except Exception as e:
+                 page_results['image_error'] = str(e)
+                 print(f"❌ Image save failed: {e}")
+
+             results['pages'][page_num] = page_results
+
+         # Generate analysis summary
+         analysis_insights = generate_analysis_insights(results)
+         results['insights'] = analysis_insights
+
+         # Save results to JSON
+         try:
+             folder_name = document_name.replace('/', '_').replace('\\', '_')
+             analysis_dir = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/enhanced_analysis_10"
+             os.makedirs(analysis_dir, exist_ok=True)
+
+             results_path = f"{analysis_dir}/analysis_results.json"
+             with open(results_path, 'w', encoding='utf-8') as f:
+                 json.dump(results, f, indent=2, ensure_ascii=False)
+             print(f"✅ Analysis results saved: {results_path}")
+
+             # Generate markdown report (use the sanitized folder name, not the
+             # raw document name, so the path stays valid)
+             markdown_path = f"{analysis_dir}/{folder_name}_enhanced_analysis.md"
+             generate_markdown_report(results, markdown_path)
+             print(f"✅ Markdown report saved: {markdown_path}")
+
+         except Exception as e:
+             print(f"❌ Failed to save results: {e}")
+
+         return results
+
+     except Exception as e:
+         print(f"❌ Failed to analyze {document_name}: {e}")
+         return None
+
+ def generate_analysis_insights(results):
+     """Generate insights based on analysis results"""
+     insights = []
+
+     total_chars = sum(page.get('text_length', 0) for page in results['pages'].values())
+     table_pages = sum(1 for page in results['pages'].values() if 'table' in page and 'rows' in page['table'])
+
+     if total_chars > 0:
+         insights.append(f"Document contains {total_chars} total characters across {len(results['pages'])} analyzed pages")
+
+     if table_pages > 0:
+         insights.append(f"{table_pages} out of {len(results['pages'])} pages contain detectable tables")
+
+     # Check for layout complexity (guard against zero analyzed pages)
+     page_count = len(results['pages'])
+     avg_regions = sum(page.get('yolo_regions', 0) for page in results['pages'].values()) / page_count if page_count else 0
+     if avg_regions > 5:
+         insights.append(f"Complex layout detected - average {avg_regions:.1f} regions per page")
+
+     # Check for table structure complexity
+     tatr_regions = sum(page.get('tatr_regions', 0) for page in results['pages'].values())
+     if tatr_regions > 50:
+         insights.append(f"High table complexity - {tatr_regions} TATR table regions detected")
+
+     return insights
+
+ def generate_markdown_report(results, output_path):
+     """Generate a detailed markdown report"""
+
+     content = f"""# Enhanced PDF Analysis Report - {results['document']}
+
+ ## Analysis Overview
+
+ **Document:** {results['document']}
+ **Total Pages:** {results['total_pages']}
+ **Analyzed Pages:** {results['analyzed_pages']}
+ **Analysis Date:** {results['analysis_date']}
+
+ ---
+
+ ## Key Insights
+
+ """
+
+     for insight in results.get('insights', []):
+         content += f"- {insight}\n"
+
+     content += "\n---\n\n## Page-by-Page Analysis\n\n"
+
+     for page_num, page_data in results['pages'].items():
+         content += f"### Page {page_num}\n\n"
+         content += f"**Dimensions:** {page_data.get('dimensions', 'Unknown')}\n\n"
+
+         if 'text_length' in page_data:
+             content += f"**Text Content:** {page_data['text_length']} characters\n"
+             if 'text_preview' in page_data:
+                 content += f"**Preview:** {page_data['text_preview'][:100]}...\n\n"
+
+         if 'table' in page_data:
+             content += f"**Table Detection:** {page_data['table']}\n"
+             if 'table_sample' in page_data and page_data['table_sample']:
+                 content += f"**Sample Data:** First few rows: {page_data['table_sample'][:2]}\n\n"
+
+         if 'yolo_regions' in page_data:
+             content += f"**Layout Regions (YOLO):** {page_data['yolo_regions']}\n"
+
+         if 'tatr_regions' in page_data:
+             content += f"**Table Regions (TATR):** {page_data['tatr_regions']}\n"
+
+         content += "\n"
+
+     content += """
+ ---
+
+ ## Natural PDF Extraction Recommendations
+
+ Based on this analysis, here are the recommended approaches:
+
+ ```python
+ import natural_pdf as npdf
+
+ def extract_document_data(pdf_path):
+     pdf = npdf.PDF(pdf_path)
+     results = []
+
+     for page_num, page in enumerate(pdf.pages, 1):
+         # Use layout analysis for structure detection
+         page.analyze_layout('tatr', existing='append')
+
+         # Extract tables if present
+         table_data = page.extract_table()
+         if table_data:
+             results.append({
+                 'page': page_num,
+                 'type': 'table',
+                 'data': table_data
+             })
+
+         # Extract text content
+         text_content = page.extract_text()
+         if text_content:
+             results.append({
+                 'page': page_num,
+                 'type': 'text',
+                 'content': text_content
+             })
+
+     return results
+ ```
+
+ """
+
+     with open(output_path, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+ def main():
+     """Analyze 10 more PDF documents"""
+
+     # Documents to analyze: (folder name, PDF filename, target pages or None for first page)
+     documents_to_analyze = [
+         # Documents with specific page requests
+         ("GxpvezO_The table in Nepali on page 30 _in between the tex", "GxpvezO.pdf", [30]),
+         ("J9lKd7Y_Table in Slovenian _e.g. on page 80_.", "J9lKd7Y.pdf", [80]),
+         ("b5eVqGg_Math formulas in Russian _e.g. on page 181__", "b5eVqGg.pdf", [181]),
+         ("lbODqev_Large wide tables in Serbian _from page 63 and on_", "lbODqev.pdf", [63, 64, 65]),
+         ("obR6Dxb_Large table that spans across pages in Serbian _e.", "obR6Dxb.pdf", [1, 2, 3]),
+         ("ober4db_The graph and table on page 180 and 181", "ober4db.pdf", [180, 181]),
+         ("oberryX_The survery question table_ such as the one on pag", "oberryX.pdf", [1]),  # Need to find specific page
+         ("eqrZZbq_The categorize chart _E1_ on page 4_ The chart_tab", "eqrZZbq.pdf", [4]),
+
+         # Documents with general analysis needs
+         ("NplKG2O_Try to see if natural-pdf can process non-standard", "NplKG2O.pdf", None),
+         ("obe1Vq5_MARKED UP text -- underline and strikethu__for bon", "obe1Vq5.pdf", None),
+     ]
+
+     analysis_results = []
+
+     for folder_name, pdf_filename, target_pages in documents_to_analyze:
+         pdf_path = f"/Users/soma/Development/natural-pdf/bad_pdf_analysis/{folder_name}/{pdf_filename}"
+
+         if os.path.exists(pdf_path):
+             result = analyze_pdf_document(pdf_path, folder_name, target_pages)
+             if result:
+                 analysis_results.append(result)
+         else:
+             print(f"❌ PDF not found: {pdf_path}")
+
+     print(f"\n{'='*80}")
+     print(f"✅ Analysis complete! Processed {len(analysis_results)} documents")
+     print(f"{'='*80}")
+
+     return analysis_results
+
+ if __name__ == "__main__":
+     main()