natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -1,517 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>OCR Debug Report</title>
7
- <style>
8
- body {{
9
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
10
- line-height: 1.6;
11
- color: #333;
12
- max-width: 1200px;
13
- margin: 0 auto;
14
- padding: 20px;
15
- }}
16
- h1, h2 {{
17
- color: #2c3e50;
18
- }}
19
- .page-section {{
20
- margin-bottom: 30px;
21
- border: 1px solid #ddd;
22
- border-radius: 4px;
23
- padding: 15px;
24
- background: #f8f9fa;
25
- }}
26
- .page-title {{
27
- display: flex;
28
- justify-content: space-between;
29
- align-items: center;
30
- margin-bottom: 15px;
31
- }}
32
- .page-controls {{
33
- display: flex;
34
- gap: 10px;
35
- align-items: center;
36
- }}
37
- .controls {{
38
- margin-bottom: 20px;
39
- display: flex;
40
- justify-content: space-between;
41
- flex-wrap: wrap;
42
- gap: 10px;
43
- padding: 10px;
44
- background: #f0f0f0;
45
- border-radius: 4px;
46
- }}
47
- .filter-control {{
48
- display: flex;
49
- align-items: center;
50
- gap: 8px;
51
- }}
52
- input, select, button {{
53
- padding: 6px 12px;
54
- border: 1px solid #ccc;
55
- border-radius: 4px;
56
- font-size: 14px;
57
- }}
58
- button {{
59
- background: #4b6bfb;
60
- color: white;
61
- cursor: pointer;
62
- }}
63
- button:hover {{
64
- background: #3b5de7;
65
- }}
66
- .region-table {{
67
- width: 100%;
68
- border-collapse: collapse;
69
- }}
70
- .region-table th,
71
- .region-table td {{
72
- padding: 8px;
73
- border: 1px solid #ddd;
74
- text-align: left;
75
- vertical-align: top;
76
- }}
77
- .region-table th {{
78
- background: #f2f2f2;
79
- position: sticky;
80
- top: 0;
81
- z-index: 10;
82
- }}
83
- .region-image {{
84
- width: 35%; /* Limit image cell width */
85
- }}
86
- .image-clip {{
87
- position: relative;
88
- overflow: hidden;
89
- background-repeat: no-repeat;
90
- border-radius: 3px;
91
- box-shadow: 0 1px 3px rgba(0,0,0,0.2);
92
- margin: 0 auto;
93
- max-width: 350px; /* Maximum width */
94
- max-height: 250px; /* Maximum height */
95
- transform-origin: top left; /* For scaling */
96
- }}
97
- .confidence {{
98
- width: 90px;
99
- text-align: center;
100
- white-space: nowrap;
101
- }}
102
- .confidence[data-level="high"] {{
103
- background-color: rgba(0, 200, 0, 0.2);
104
- }}
105
- .confidence[data-level="medium"] {{
106
- background-color: rgba(255, 200, 0, 0.2);
107
- }}
108
- .confidence[data-level="low"] {{
109
- background-color: rgba(255, 0, 0, 0.2);
110
- }}
111
- .text-content {{
112
- width: 60%;
113
- }}
114
- .text-content-input {{
115
- width: 100%;
116
- font-family: monospace;
117
- padding: 8px;
118
- line-height: 1.4;
119
- white-space: pre-wrap;
120
- word-break: break-all;
121
- border: 1px solid #ddd;
122
- border-radius: 4px;
123
- resize: vertical;
124
- }}
125
- .text-content-input:focus {{
126
- border-color: #4b6bfb;
127
- outline: none;
128
- box-shadow: 0 0 0 2px rgba(75, 107, 251, 0.25);
129
- }}
130
- .modified-status {{
131
- text-align: center;
132
- width: 80px;
133
- }}
134
- .modified-checkbox {{
135
- width: 18px;
136
- height: 18px;
137
- cursor: not-allowed;
138
- }}
139
- .hidden {{
140
- display: none;
141
- }}
142
- .toggle-btn {{
143
- background: #eee;
144
- color: #333;
145
- border: 1px solid #ccc;
146
- padding: 3px 8px;
147
- border-radius: 3px;
148
- cursor: pointer;
149
- font-size: 12px;
150
- }}
151
- .toggle-btn:hover {{
152
- background: #ddd;
153
- }}
154
- .export-btn {{
155
- margin-left: auto;
156
- }}
157
- .page-image {{
158
- max-width: 100%;
159
- height: auto;
160
- margin-bottom: 15px;
161
- border: 1px solid #ddd;
162
- display: none;
163
- }}
164
- .show {{
165
- display: block;
166
- }}
167
-
168
- @media (max-width: 800px) {{
169
- .region-table, .region-table tbody, .region-table tr, .region-table td, .region-table th {{
170
- display: block;
171
- }}
172
- .region-table td {{
173
- margin-bottom: 8px;
174
- }}
175
- .region-table th {{
176
- position: static;
177
- }}
178
- }}
179
- </style>
180
- </head>
181
- <body>
182
- <h1>OCR Debug Report</h1>
183
-
184
- <div class="controls">
185
- <div class="filter-control">
186
- <label for="confidence-filter">Min Confidence:</label>
187
- <input type="range" id="confidence-filter" min="0" max="1" step="0.05" value="0">
188
- <span id="confidence-value">0</span>
189
- </div>
190
-
191
- <div class="filter-control">
192
- <label for="text-filter">Text Filter:</label>
193
- <input type="text" id="text-filter" placeholder="Filter text...">
194
- </div>
195
-
196
- <div class="filter-control">
197
- <label for="sort-by">Sort By:</label>
198
- <select id="sort-by">
199
- <option value="position">Position (default)</option>
200
- <option value="confidence-asc">Confidence (Low to High)</option>
201
- <option value="confidence-desc">Confidence (High to Low)</option>
202
- <option value="text-length">Text Length</option>
203
- </select>
204
- </div>
205
-
206
- <button id="export-json" class="export-btn">Export JSON</button>
207
- </div>
208
-
209
- <div id="pages-container">
210
- <!-- Pages will be inserted here -->
211
- </div>
212
-
213
- <script>
214
- // Main OCR data structure with pages and regions
215
- const ocrData = {pages_data};
216
-
217
- // ===== DOM Elements =====
218
- const pagesContainer = document.getElementById('pages-container');
219
- const confidenceFilter = document.getElementById('confidence-filter');
220
- const confidenceValue = document.getElementById('confidence-value');
221
- const textFilter = document.getElementById('text-filter');
222
- const sortBySelect = document.getElementById('sort-by');
223
- const exportButton = document.getElementById('export-json');
224
-
225
- // ===== Rendering Functions =====
226
-
227
- // Render a single page section with its regions
228
- function renderPage(page, pageIndex) {{
229
- const pageDiv = document.createElement('div');
230
- pageDiv.className = 'page-section';
231
- pageDiv.id = `page-${{pageIndex}}`;
232
-
233
- // Page header with controls
234
- const pageTitle = document.createElement('div');
235
- pageTitle.className = 'page-title';
236
-
237
- const pageHeading = document.createElement('h2');
238
- pageHeading.textContent = `Page ${{page.page_number}}`;
239
- pageTitle.appendChild(pageHeading);
240
-
241
- const pageControls = document.createElement('div');
242
- pageControls.className = 'page-controls';
243
-
244
- const toggleImageBtn = document.createElement('button');
245
- toggleImageBtn.className = 'toggle-btn';
246
- toggleImageBtn.textContent = 'Show Full Image';
247
- toggleImageBtn.onclick = () => toggleFullImage(pageIndex);
248
- pageControls.appendChild(toggleImageBtn);
249
-
250
- pageTitle.appendChild(pageControls);
251
- pageDiv.appendChild(pageTitle);
252
-
253
- // Full page image (hidden by default)
254
- const pageImage = document.createElement('img');
255
- pageImage.src = page.image;
256
- pageImage.className = 'page-image';
257
- pageImage.id = `page-image-${{pageIndex}}`;
258
- pageImage.alt = `Page ${{page.page_number}}`;
259
- pageDiv.appendChild(pageImage);
260
-
261
- // Table for regions
262
- const table = document.createElement('table');
263
- table.className = 'region-table';
264
-
265
- // Table header
266
- const thead = document.createElement('thead');
267
- const headerRow = document.createElement('tr');
268
-
269
- const headers = ['Confidence', 'Text Region', 'Text Content'];
270
- headers.forEach(header => {{
271
- const th = document.createElement('th');
272
- th.textContent = header;
273
- headerRow.appendChild(th);
274
- }});
275
-
276
- thead.appendChild(headerRow);
277
- table.appendChild(thead);
278
-
279
- // Table body
280
- const tbody = document.createElement('tbody');
281
- tbody.id = `regions-${{pageIndex}}`;
282
-
283
- // Render each region row
284
- page.regions.forEach((region, regionIndex) => {{
285
- const row = renderRegionRow(region, pageIndex, regionIndex, page.image);
286
- tbody.appendChild(row);
287
- }});
288
-
289
- table.appendChild(tbody);
290
- pageDiv.appendChild(table);
291
-
292
- return pageDiv;
293
- }}
294
-
295
- // Render a single region row
296
- function renderRegionRow(region, pageIndex, regionIndex, pageImage) {{
297
- const row = document.createElement('tr');
298
- row.className = 'region-row';
299
- row.dataset.confidence = region.confidence;
300
- row.dataset.text = region.ocr_text;
301
- row.dataset.modified = (region.modified || false).toString();
302
- row.dataset.regionId = `${{pageIndex}}-${{regionIndex}}`;
303
-
304
- // Confidence cell
305
- const confidenceCell = document.createElement('td');
306
- confidenceCell.className = 'confidence';
307
- confidenceCell.textContent = region.confidence.toFixed(2);
308
-
309
- // Set color level based on confidence
310
- if (region.confidence >= 0.8) {{
311
- confidenceCell.dataset.level = 'high';
312
- }} else if (region.confidence >= 0.5) {{
313
- confidenceCell.dataset.level = 'medium';
314
- }} else {{
315
- confidenceCell.dataset.level = 'low';
316
- }}
317
-
318
- row.appendChild(confidenceCell);
319
-
320
- // Image region cell
321
- const imageCell = document.createElement('td');
322
- imageCell.className = 'region-image';
323
-
324
- const imageClip = document.createElement('div');
325
- imageClip.className = 'image-clip';
326
- imageClip.style.backgroundImage = `url('${{pageImage}}')`;
327
-
328
- // Calculate dimensions (scaled by 2.0 to match the image scale)
329
- const width = (region.bbox[2] - region.bbox[0]) * 2.0;
330
- const height = (region.bbox[3] - region.bbox[1]) * 2.0;
331
-
332
- // Calculate background position (negative of the top-left corner)
333
- imageClip.style.backgroundPosition = `-${{region.bbox[0] * 2.0}}px -${{region.bbox[1] * 2.0}}px`;
334
-
335
- // If the image is very large, we'll apply CSS transform scaling instead of
336
- // changing the dimensions directly to maintain proper background position
337
- const maxWidth = 350;
338
- const maxHeight = 250;
339
- let scale = 1;
340
-
341
- if (width > maxWidth || height > maxHeight) {{
342
- const scaleX = maxWidth / width;
343
- const scaleY = maxHeight / height;
344
- scale = Math.min(scaleX, scaleY);
345
- imageClip.style.transform = `scale(${{scale}})`;
346
- }}
347
-
348
- // Set the final dimensions
349
- imageClip.style.width = `${{width}}px`;
350
- imageClip.style.height = `${{height}}px`;
351
-
352
- imageCell.appendChild(imageClip);
353
- row.appendChild(imageCell);
354
-
355
- // Combined text content cell with textarea
356
- const textCell = document.createElement('td');
357
- textCell.className = 'text-content';
358
-
359
- const textArea = document.createElement('textarea');
360
- textArea.className = 'text-content-input';
361
- textArea.value = region.ocr_text;
362
- textArea.rows = Math.max(1, Math.ceil(region.ocr_text.length / 40)); // Approximate rows based on text length
363
- textArea.dataset.pageIndex = pageIndex;
364
- textArea.dataset.regionIndex = regionIndex;
365
- textArea.dataset.originalText = region.ocr_text;
366
-
367
- // Save changes to data structure
368
- textArea.addEventListener('change', (e) => {{
369
- const pIdx = parseInt(e.target.dataset.pageIndex);
370
- const rIdx = parseInt(e.target.dataset.regionIndex);
371
- ocrData.pages[pIdx].regions[rIdx].corrected_text = e.target.value;
372
-
373
- // Update the modified status in the dataset
374
- const isModified = e.target.value !== e.target.dataset.originalText;
375
- ocrData.pages[pIdx].regions[rIdx].modified = isModified;
376
-
377
- // Visual indication of modification through textarea style
378
- if (isModified) {{
379
- e.target.style.borderColor = '#4b6bfb';
380
- e.target.style.backgroundColor = 'rgba(75, 107, 251, 0.05)';
381
- }} else {{
382
- e.target.style.borderColor = '#ddd';
383
- e.target.style.backgroundColor = '';
384
- }}
385
- }});
386
-
387
- textCell.appendChild(textArea);
388
- row.appendChild(textCell);
389
-
390
- // No Modified column needed
391
-
392
- return row;
393
- }}
394
-
395
- // ===== Interactive Functions =====
396
-
397
- // Toggle display of full page image
398
- function toggleFullImage(pageIndex) {{
399
- const image = document.getElementById(`page-image-${{pageIndex}}`);
400
- const button = image.previousElementSibling.querySelector('.toggle-btn');
401
-
402
- if (image.classList.contains('show')) {{
403
- image.classList.remove('show');
404
- button.textContent = 'Show Full Image';
405
- }} else {{
406
- image.classList.add('show');
407
- button.textContent = 'Hide Full Image';
408
- }}
409
- }}
410
-
411
- // Filter regions by confidence
412
- function filterByConfidence(minConfidence) {{
413
- document.querySelectorAll('.region-row').forEach(row => {{
414
- const confidence = parseFloat(row.dataset.confidence);
415
- if (confidence < minConfidence) {{
416
- row.classList.add('hidden');
417
- }} else {{
418
- row.classList.remove('hidden');
419
- }}
420
- }});
421
- }}
422
-
423
- // Filter regions by text content
424
- function filterByText(text) {{
425
- const searchText = text.toLowerCase();
426
- // If no search text, we don't need to do anything
427
- if (!searchText) {{
428
- document.querySelectorAll('.region-row').forEach(row => {{
429
- row.classList.remove('hidden');
430
- }});
431
- return;
432
- }}
433
-
434
- // Filter based on current textarea content (not just original text)
435
- document.querySelectorAll('.region-row').forEach(row => {{
436
- const textarea = row.querySelector('.text-content-input');
437
- const currentText = textarea ? textarea.value.toLowerCase() : row.dataset.text.toLowerCase();
438
-
439
- if (!currentText.includes(searchText)) {{
440
- row.classList.add('hidden');
441
- }} else {{
442
- row.classList.remove('hidden');
443
- }}
444
- }});
445
- }}
446
-
447
- // Sort regions by different criteria
448
- function sortRegions(sortBy) {{
449
- ocrData.pages.forEach((page, pageIndex) => {{
450
- const tbody = document.getElementById(`regions-${{pageIndex}}`);
451
- const rows = Array.from(tbody.querySelectorAll('.region-row'));
452
-
453
- // Sort based on selected criterion
454
- rows.sort((a, b) => {{
455
- switch (sortBy) {{
456
- case 'confidence-asc':
457
- return parseFloat(a.dataset.confidence) - parseFloat(b.dataset.confidence);
458
- case 'confidence-desc':
459
- return parseFloat(b.dataset.confidence) - parseFloat(a.dataset.confidence);
460
- case 'text-length':
461
- return b.dataset.text.length - a.dataset.text.length;
462
- case 'position':
463
- default:
464
- // Default sort by region ID (original position)
465
- return a.dataset.regionId.localeCompare(b.dataset.regionId);
466
- }}
467
- }});
468
-
469
- // Reinsert in sorted order
470
- rows.forEach(row => tbody.appendChild(row));
471
- }});
472
- }}
473
-
474
- // Export data as JSON
475
- function exportJSON() {{
476
- // Create a downloadable JSON with corrected text
477
- const exportData = JSON.stringify(ocrData, null, 2);
478
- const blob = new Blob([exportData], {{type: 'application/json'}});
479
- const url = URL.createObjectURL(blob);
480
-
481
- const a = document.createElement('a');
482
- a.href = url;
483
- a.download = 'ocr_debug_export.json';
484
- document.body.appendChild(a);
485
- a.click();
486
- document.body.removeChild(a);
487
- URL.revokeObjectURL(url);
488
- }}
489
-
490
- // ===== Event Listeners =====
491
-
492
- confidenceFilter.addEventListener('input', (e) => {{
493
- const value = parseFloat(e.target.value);
494
- confidenceValue.textContent = value.toFixed(2);
495
- filterByConfidence(value);
496
- }});
497
-
498
- textFilter.addEventListener('input', (e) => {{
499
- filterByText(e.target.value);
500
- }});
501
-
502
- sortBySelect.addEventListener('change', (e) => {{
503
- sortRegions(e.target.value);
504
- }});
505
-
506
- exportButton.addEventListener('click', exportJSON);
507
-
508
- // ===== Initialize =====
509
-
510
- // Render all pages
511
- ocrData.pages.forEach((page, i) => {{
512
- const pageElement = renderPage(page, i);
513
- pagesContainer.appendChild(pageElement);
514
- }});
515
- </script>
516
- </body>
517
- </html>
tests/test_loading.py DELETED
@@ -1,50 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from natural_pdf import PDF
6
-
7
- # URL for the test PDF used in the tutorial
8
- TEST_PDF_URL = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf"
9
-
10
-
11
- def test_pdf_loading_from_url():
12
- """Tests if a PDF can be loaded successfully from a URL."""
13
- try:
14
- pdf = PDF(TEST_PDF_URL)
15
- # Basic assertions after loading
16
- assert pdf is not None
17
- assert len(pdf.pages) > 0, "PDF should have at least one page"
18
- assert os.path.exists(pdf.path), "PDF file should be downloaded locally"
19
- # Check if metadata (like Title) is accessible, even if None
20
- assert "Title" in pdf.metadata or pdf.metadata.get("Title") is None
21
-
22
- except Exception as e:
23
- pytest.fail(f"PDF loading from URL failed: {e}")
24
-
25
-
26
- def test_page_text_extraction():
27
- """Tests if text can be extracted from the first page."""
28
- try:
29
- pdf = PDF(TEST_PDF_URL)
30
- assert len(pdf.pages) > 0, "PDF has no pages"
31
- page = pdf.pages[0]
32
- text = page.extract_text()
33
- assert isinstance(text, str), "Extracted text should be a string"
34
- assert len(text) > 50, "Extracted text seems too short or empty"
35
- # Add a more specific assertion if you know some expected text
36
- # assert "Expected sample text" in text
37
-
38
- except Exception as e:
39
- pytest.fail(f"Text extraction failed: {e}")
40
-
41
-
42
- # Clean up downloaded file if necessary (optional, depends on PDF class behavior)
43
- # You might want a fixture to handle setup/teardown of the downloaded file
44
- # @pytest.fixture(scope="module")
45
- # def downloaded_pdf():
46
- # pdf = PDF(TEST_PDF_URL)
47
- # yield pdf
48
- # # Cleanup code here if PDF() doesn't handle it
49
- # if os.path.exists(pdf.path):
50
- # os.remove(pdf.path)