natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,382 +0,0 @@
- #!/usr/bin/env python3
- """
- Direct Natural PDF analysis targeting specific pages.
- """
-
- import json
- import os
- import sys
- from pathlib import Path
- import natural_pdf as npdf
- import re
- from datetime import datetime
-
- def analyze_specific_pages_direct(pdf_path, target_pages, output_folder):
-     """Directly analyze specific pages using Natural PDF"""
-
-     print(f"🔍 Analyzing {pdf_path}")
-     print(f"📍 Target pages: {target_pages}")
-
-     pdf = npdf.PDF(pdf_path)
-     results = {}
-
-     for page_num in target_pages:
-         if page_num > len(pdf.pages):
-             print(f"❌ Page {page_num} not found - document only has {len(pdf.pages)} pages")
-             continue
-
-         print(f"\n📄 Analyzing page {page_num}...")
-         page = pdf.pages[page_num - 1]  # Convert to 0-based index
-
-         page_data = {
-             "page_number": page_num,
-             "dimensions": {
-                 "width": page.width,
-                 "height": page.height
-             }
-         }
-
-         # Get page description
-         try:
-             description = page.describe()
-             page_data["describe"] = description
-             print(f"✅ Page description: {len(description)} characters")
-         except Exception as e:
-             print(f"❌ Page description failed: {e}")
-             page_data["describe"] = f"ERROR: {e}"
-
-         # Extract text
-         try:
-             text = page.extract_text()
-             page_data["extract_text"] = {
-                 "length": len(text),
-                 "preview": text[:200] + "..." if len(text) > 200 else text,
-                 "full_text": text
-             }
-             print(f"✅ Text extraction: {len(text)} characters")
-         except Exception as e:
-             print(f"❌ Text extraction failed: {e}")
-             page_data["extract_text"] = f"ERROR: {e}"
-
-         # Try table extraction
-         try:
-             table_data = page.extract_table()
-             if table_data and len(table_data) > 0:
-                 page_data["extract_table"] = {
-                     "found": True,
-                     "rows": len(table_data),
-                     "columns": len(table_data[0]) if table_data else 0,
-                     "data": table_data[:5]  # First 5 rows only
-                 }
-                 print(f"✅ Table found: {len(table_data)} rows × {len(table_data[0]) if table_data else 0} columns")
-             else:
-                 page_data["extract_table"] = {"found": False}
-                 print("ℹ️ No table found with standard extraction")
-         except Exception as e:
-             print(f"❌ Table extraction failed: {e}")
-             page_data["extract_table"] = f"ERROR: {e}"
-
-         # Try layout analysis
-         try:
-             page.analyze_layout('yolo', existing='replace')
-             layout_regions = page.find_all('region')
-             if layout_regions and len(layout_regions) > 0:
-                 page_data["analyze_layout"] = {
-                     "found": True,
-                     "count": len(layout_regions),
-                     "regions": []
-                 }
-                 for region in layout_regions[:10]:  # First 10 regions
-                     try:
-                         page_data["analyze_layout"]["regions"].append({
-                             "type": region.type if hasattr(region, 'type') else 'unknown',
-                             "bbox": [region.x0, region.y0, region.x1, region.y1],
-                             "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
-                         })
-                     except:
-                         pass
-                 print(f"✅ Layout analysis: {len(layout_regions)} regions")
-             else:
-                 page_data["analyze_layout"] = {"found": False}
-                 print("ℹ️ No layout regions found")
-         except Exception as e:
-             print(f"❌ Layout analysis failed: {e}")
-             page_data["analyze_layout"] = f"ERROR: {e}"
-
-         # Try TATR analysis
-         try:
-             page.analyze_layout('tatr', existing='append')
-             tatr_regions = page.find_all('region')
-             tatr_count = len([r for r in tatr_regions if hasattr(r, 'type') and 'table' in str(r.type).lower()])
-             if tatr_count > 0:
-                 page_data["analyze_layout_tatr"] = {
-                     "found": True,
-                     "count": tatr_count,
-                     "regions": []
-                 }
-                 for region in tatr_regions[:25]:  # First 25 regions
-                     try:
-                         if hasattr(region, 'type') and 'table' in str(region.type).lower():
-                             page_data["analyze_layout_tatr"]["regions"].append({
-                                 "type": str(region.type),
-                                 "bbox": [region.x0, region.y0, region.x1, region.y1],
-                                 "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
-                             })
-                     except:
-                         pass
-                 print(f"✅ TATR analysis: {tatr_count} table regions")
-             else:
-                 page_data["analyze_layout_tatr"] = {"found": False}
-                 print("ℹ️ No TATR table regions found")
-         except Exception as e:
-             print(f"❌ TATR analysis failed: {e}")
-             page_data["analyze_layout_tatr"] = f"ERROR: {e}"
-
-         # Save page image
-         try:
-             page_image_path = os.path.join(output_folder, f"page_{page_num}.png")
-             page.save_image(page_image_path, resolution=144)
-             page_data["image_path"] = page_image_path
-             print(f"✅ Page image saved: {page_image_path}")
-         except Exception as e:
-             print(f"❌ Page image save failed: {e}")
-             page_data["image_path"] = f"ERROR: {e}"
-
-         results[page_num] = page_data
-
-     return results
-
- def create_enhanced_analysis_report(pdf_path, target_pages, analysis_results, output_folder):
-     """Create enhanced analysis report"""
-
-     pdf_name = Path(pdf_path).name
-
-     # Determine what the user was looking for
-     user_goal = f"Analysis of pages {target_pages}"
-     if len(target_pages) == 1:
-         user_goal = f"Analysis of page {target_pages[0]}"
-
-     report = f"""# Enhanced PDF Analysis Report - {pdf_name.replace('.pdf', '')}
-
- ## Analysis Overview
-
- **PDF File:** {pdf_name}
- **Target Pages:** {target_pages}
- **Pages Successfully Analyzed:** {list(analysis_results.keys())}
- **Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-
- ---
-
- ## Page-by-Page Analysis Results
-
- """
-
-     for page_num in sorted(analysis_results.keys()):
-         page_data = analysis_results[page_num]
-
-         report += f"""### Page {page_num}
-
- **Dimensions:** {page_data.get('dimensions', {}).get('width', 'Unknown')} × {page_data.get('dimensions', {}).get('height', 'Unknown')} points
-
- **Content Analysis:**
- """
-
-         # Text analysis
-         if isinstance(page_data.get('extract_text'), dict):
-             text_info = page_data['extract_text']
-             report += f"- **Text Content:** {text_info.get('length', 0)} characters extracted\n"
-             if text_info.get('preview'):
-                 report += f"- **Content Preview:** {text_info['preview']}\n"
-
-         # Table analysis
-         if isinstance(page_data.get('extract_table'), dict):
-             table_info = page_data['extract_table']
-             if table_info.get('found'):
-                 report += f"- **Table Found:** {table_info.get('rows', 0)} rows × {table_info.get('columns', 0)} columns\n"
-             else:
-                 report += "- **Table Status:** No standard table structure detected\n"
-
-         # Layout analysis
-         if isinstance(page_data.get('analyze_layout'), dict):
-             layout_info = page_data['analyze_layout']
-             if layout_info.get('found'):
-                 report += f"- **Layout Regions:** {layout_info.get('count', 0)} regions detected\n"
-
-                 # Show region types
-                 region_types = {}
-                 for region in layout_info.get('regions', []):
-                     region_type = region.get('type', 'unknown')
-                     region_types[region_type] = region_types.get(region_type, 0) + 1
-
-                 if region_types:
-                     report += f"- **Region Types:** {dict(region_types)}\n"
-
-         # TATR analysis
-         if isinstance(page_data.get('analyze_layout_tatr'), dict):
-             tatr_info = page_data['analyze_layout_tatr']
-             if tatr_info.get('found'):
-                 report += f"- **TATR Table Analysis:** {tatr_info.get('count', 0)} table regions detected\n"
-
-         # Image
-         if page_data.get('image_path') and not page_data['image_path'].startswith('ERROR'):
-             report += f"- **Visual:** Page image saved as `page_{page_num}.png`\n"
-
-         report += "\n"
-
-     # Analysis summary
-     report += """---
-
- ## Analysis Summary
-
- ### What We Found
- """
-
-     # Summarize findings across all pages
-     total_text_chars = 0
-     pages_with_tables = 0
-     total_layout_regions = 0
-     total_tatr_regions = 0
-
-     for page_data in analysis_results.values():
-         if isinstance(page_data.get('extract_text'), dict):
-             total_text_chars += page_data['extract_text'].get('length', 0)
-
-         if isinstance(page_data.get('extract_table'), dict) and page_data['extract_table'].get('found'):
-             pages_with_tables += 1
-
-         if isinstance(page_data.get('analyze_layout'), dict) and page_data['analyze_layout'].get('found'):
-             total_layout_regions += page_data['analyze_layout'].get('count', 0)
-
-         if isinstance(page_data.get('analyze_layout_tatr'), dict) and page_data['analyze_layout_tatr'].get('found'):
-             total_tatr_regions += page_data['analyze_layout_tatr'].get('count', 0)
-
-     report += f"""
- - **Total Text Content:** {total_text_chars:,} characters across {len(analysis_results)} pages
- - **Table Detection:** {pages_with_tables} out of {len(analysis_results)} pages have detectable tables
- - **Layout Analysis:** {total_layout_regions} total layout regions detected
- - **TATR Analysis:** {total_tatr_regions} table-specific regions detected
- """
-
-     # Add recommendations
-     report += """
- ### Natural PDF Extraction Approach
-
- Based on the actual content found on these pages:
-
- ```python
- import natural_pdf as npdf
-
- def extract_from_specific_pages(pdf_path, target_pages):
-     \"\"\"Extract data from specific pages with targeted approach\"\"\"
-     pdf = npdf.PDF(pdf_path)
-     results = []
-
-     for page_num in target_pages:
-         if page_num <= len(pdf.pages):
-             page = pdf.pages[page_num - 1]
-
-             # Use layout analysis for better structure detection
-             page.analyze_layout('tatr', existing='append')
-
-             # Try table extraction first
-             table_data = page.extract_table()
-             if table_data:
-                 results.append({
-                     'page': page_num,
-                     'type': 'table',
-                     'data': table_data
-                 })
-             else:
-                 # Use spatial navigation for complex layouts
-                 all_text = page.find_all('text')
-                 results.append({
-                     'page': page_num,
-                     'type': 'text_elements',
-                     'elements': all_text
-                 })
-
-     return results
-
- # Extract from your specific pages
- """
-
-     if len(target_pages) == 1:
-         report += f"results = extract_from_specific_pages('{pdf_name}', [{target_pages[0]}])\n"
-     else:
-         report += f"results = extract_from_specific_pages('{pdf_name}', {target_pages})\n"
-
-     report += "```\n"
-
-     # Save the report
-     report_path = os.path.join(output_folder, f"{pdf_name.replace('.pdf', '')}_enhanced_analysis.md")
-     with open(report_path, 'w', encoding='utf-8') as f:
-         f.write(report)
-
-     print(f"✅ Enhanced analysis report saved: {report_path}")
-     return report_path
-
- def main():
-     """Re-analyze specific documents with page targeting"""
-
-     # Documents that need re-analysis with specific pages
-     documents_to_reanalyze = [
-         {
-             'folder': 'ODX1DW8_The large table on page 179',
-             'file': 'ODX1DW8.pdf',
-             'pages': [178, 179, 180],  # Page 179 ± 1 for safety
-             'reason': 'User requested page 179, original analysis used page 1'
-         },
-         {
-             'folder': 'eqrZ5yq_The long table _Annex 6_ spanning across pages fro',
-             'file': 'eqrZ5yq.pdf',
-             'pages': [89, 90, 91, 92],  # Multi-page table range
-             'reason': 'User requested pages 89-92, original analysis used page 1'
-         }
-     ]
-
-     base_path = "/Users/soma/Development/natural-pdf/bad_pdf_analysis"
-
-     for doc in documents_to_reanalyze:
-         print(f"\n{'='*80}")
-         print(f"🔄 Re-analyzing {doc['file']}")
-         print(f"📋 Reason: {doc['reason']}")
-         print(f"{'='*80}")
-
-         folder_path = os.path.join(base_path, doc['folder'])
-         pdf_path = os.path.join(folder_path, doc['file'])
-         output_folder = os.path.join(folder_path, 'enhanced_analysis')
-
-         if not os.path.exists(pdf_path):
-             print(f"❌ PDF not found: {pdf_path}")
-             continue
-
-         # Create output folder
-         os.makedirs(output_folder, exist_ok=True)
-
-         # Run direct analysis on specific pages
-         try:
-             analysis_results = analyze_specific_pages_direct(pdf_path, doc['pages'], output_folder)
-
-             if analysis_results:
-                 # Save analysis results as JSON
-                 results_file = os.path.join(output_folder, "enhanced_analysis_results.json")
-                 with open(results_file, 'w') as f:
-                     json.dump({
-                         "pdf_path": pdf_path,
-                         "target_pages": doc['pages'],
-                         "analysis_timestamp": datetime.now().isoformat(),
-                         "results": analysis_results
-                     }, f, indent=2)
-
-                 # Create enhanced report
-                 create_enhanced_analysis_report(pdf_path, doc['pages'], analysis_results, output_folder)
-
-                 print(f"\n✅ Successfully analyzed {len(analysis_results)} pages from {doc['file']}")
-             else:
-                 print(f"❌ No results obtained for {doc['file']}")
-
-         except Exception as e:
-             print(f"❌ Analysis failed for {doc['file']}: {e}")
-
- if __name__ == "__main__":
-     main()
tools/rtl_smoke_test.py DELETED
@@ -1,80 +0,0 @@
- #!/usr/bin/env python3
- """RTL pipeline smoke-test for natural-pdf.
-
- Run it from the repository root:
-
-     python tools/rtl_smoke_test.py
-
- It loads *pdfs/arabic.pdf* and performs a handful of checks that cover the
- most common break-points we identified for RTL handling:
- 1. char ingestion / word grouping
- 2. selector finds on logical Arabic tokens
- 3. bracket mirroring
- 4. number directionality inside RTL lines
-
- Exit code is **0** when all checks pass, **1** otherwise.
- """
- from __future__ import annotations
-
- import sys
- from pathlib import Path
-
- from bidi.algorithm import get_display  # type: ignore
-
- from natural_pdf import PDF
- from natural_pdf.utils.bidi_mirror import mirror_brackets
-
-
- PDF_PATH = Path("pdfs/arabic.pdf")
-
- if not PDF_PATH.exists():
-     print(f"❗ PDF not found: {PDF_PATH.resolve()}")
-     sys.exit(1)
-
- # ────────────────────────────────────────────────────────────────
- # Helpers
- # ────────────────────────────────────────────────────────────────
-
- failures: list[str] = []
-
- def check(cond: bool, msg: str):
-     """Collect failures but keep running to show full report."""
-     if cond:
-         print(f"✓ {msg}")
-     else:
-         print(f"✗ {msg}")
-         failures.append(msg)
-
-
- # ────────────────────────────────────────────────────────────────
- # Load page
- # ────────────────────────────────────────────────────────────────
-
- pdf = PDF(str(PDF_PATH))
- page = pdf.pages[0]
-
- # Basic char/word counts (should be non-zero)
- check(len(page.chars) > 0, "chars were ingested")
- check(len(page.words) > 0, "words were grouped")
-
- # First line logical text
- logical_first_line = page.extract_text().split("\n")[0]
- print("First logical line:")
- print("  ", logical_first_line)
-
- # 1. Arabic keyword should be findable
- check(page.find(text="مكرر") is not None, "page.find works for Arabic token 'مكرر'")
-
- # 2. Reversed token should NOT match
- check(page.find(text="مكرر"[::-1]) is None, "reverse token does not match (logical order stored)")
-
- # 3. Extracted line should already show the bracket pair in correct orientation
- check("(مكرر)" in logical_first_line, "parentheses orientation is correct in extract_text")
-
- # 4. Western numbers must stay LTR inside RTL
- # After visual re-order, the line should end with 2022 (year on the left visually → last in logical string)
- check(logical_first_line.rstrip().endswith("2022"), "Western number '2022' kept logical placement")
-
- print("\nSummary: {} passed, {} failed".format(4 - len(failures), len(failures)))
-
- sys.exit(0 if not failures else 1)