natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,517 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>OCR Debug Report</title>
7
- <style>
8
- body {{
9
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
10
- line-height: 1.6;
11
- color: #333;
12
- max-width: 1200px;
13
- margin: 0 auto;
14
- padding: 20px;
15
- }}
16
- h1, h2 {{
17
- color: #2c3e50;
18
- }}
19
- .page-section {{
20
- margin-bottom: 30px;
21
- border: 1px solid #ddd;
22
- border-radius: 4px;
23
- padding: 15px;
24
- background: #f8f9fa;
25
- }}
26
- .page-title {{
27
- display: flex;
28
- justify-content: space-between;
29
- align-items: center;
30
- margin-bottom: 15px;
31
- }}
32
- .page-controls {{
33
- display: flex;
34
- gap: 10px;
35
- align-items: center;
36
- }}
37
- .controls {{
38
- margin-bottom: 20px;
39
- display: flex;
40
- justify-content: space-between;
41
- flex-wrap: wrap;
42
- gap: 10px;
43
- padding: 10px;
44
- background: #f0f0f0;
45
- border-radius: 4px;
46
- }}
47
- .filter-control {{
48
- display: flex;
49
- align-items: center;
50
- gap: 8px;
51
- }}
52
- input, select, button {{
53
- padding: 6px 12px;
54
- border: 1px solid #ccc;
55
- border-radius: 4px;
56
- font-size: 14px;
57
- }}
58
- button {{
59
- background: #4b6bfb;
60
- color: white;
61
- cursor: pointer;
62
- }}
63
- button:hover {{
64
- background: #3b5de7;
65
- }}
66
- .region-table {{
67
- width: 100%;
68
- border-collapse: collapse;
69
- }}
70
- .region-table th,
71
- .region-table td {{
72
- padding: 8px;
73
- border: 1px solid #ddd;
74
- text-align: left;
75
- vertical-align: top;
76
- }}
77
- .region-table th {{
78
- background: #f2f2f2;
79
- position: sticky;
80
- top: 0;
81
- z-index: 10;
82
- }}
83
- .region-image {{
84
- width: 35%; /* Limit image cell width */
85
- }}
86
- .image-clip {{
87
- position: relative;
88
- overflow: hidden;
89
- background-repeat: no-repeat;
90
- border-radius: 3px;
91
- box-shadow: 0 1px 3px rgba(0,0,0,0.2);
92
- margin: 0 auto;
93
- max-width: 350px; /* Maximum width */
94
- max-height: 250px; /* Maximum height */
95
- transform-origin: top left; /* For scaling */
96
- }}
97
- .confidence {{
98
- width: 90px;
99
- text-align: center;
100
- white-space: nowrap;
101
- }}
102
- .confidence[data-level="high"] {{
103
- background-color: rgba(0, 200, 0, 0.2);
104
- }}
105
- .confidence[data-level="medium"] {{
106
- background-color: rgba(255, 200, 0, 0.2);
107
- }}
108
- .confidence[data-level="low"] {{
109
- background-color: rgba(255, 0, 0, 0.2);
110
- }}
111
- .text-content {{
112
- width: 60%;
113
- }}
114
- .text-content-input {{
115
- width: 100%;
116
- font-family: monospace;
117
- padding: 8px;
118
- line-height: 1.4;
119
- white-space: pre-wrap;
120
- word-break: break-all;
121
- border: 1px solid #ddd;
122
- border-radius: 4px;
123
- resize: vertical;
124
- }}
125
- .text-content-input:focus {{
126
- border-color: #4b6bfb;
127
- outline: none;
128
- box-shadow: 0 0 0 2px rgba(75, 107, 251, 0.25);
129
- }}
130
- .modified-status {{
131
- text-align: center;
132
- width: 80px;
133
- }}
134
- .modified-checkbox {{
135
- width: 18px;
136
- height: 18px;
137
- cursor: not-allowed;
138
- }}
139
- .hidden {{
140
- display: none;
141
- }}
142
- .toggle-btn {{
143
- background: #eee;
144
- color: #333;
145
- border: 1px solid #ccc;
146
- padding: 3px 8px;
147
- border-radius: 3px;
148
- cursor: pointer;
149
- font-size: 12px;
150
- }}
151
- .toggle-btn:hover {{
152
- background: #ddd;
153
- }}
154
- .export-btn {{
155
- margin-left: auto;
156
- }}
157
- .page-image {{
158
- max-width: 100%;
159
- height: auto;
160
- margin-bottom: 15px;
161
- border: 1px solid #ddd;
162
- display: none;
163
- }}
164
- .show {{
165
- display: block;
166
- }}
167
-
168
- @media (max-width: 800px) {{
169
- .region-table, .region-table tbody, .region-table tr, .region-table td, .region-table th {{
170
- display: block;
171
- }}
172
- .region-table td {{
173
- margin-bottom: 8px;
174
- }}
175
- .region-table th {{
176
- position: static;
177
- }}
178
- }}
179
- </style>
180
- </head>
181
- <body>
182
- <h1>OCR Debug Report</h1>
183
-
184
- <div class="controls">
185
- <div class="filter-control">
186
- <label for="confidence-filter">Min Confidence:</label>
187
- <input type="range" id="confidence-filter" min="0" max="1" step="0.05" value="0">
188
- <span id="confidence-value">0</span>
189
- </div>
190
-
191
- <div class="filter-control">
192
- <label for="text-filter">Text Filter:</label>
193
- <input type="text" id="text-filter" placeholder="Filter text...">
194
- </div>
195
-
196
- <div class="filter-control">
197
- <label for="sort-by">Sort By:</label>
198
- <select id="sort-by">
199
- <option value="position">Position (default)</option>
200
- <option value="confidence-asc">Confidence (Low to High)</option>
201
- <option value="confidence-desc">Confidence (High to Low)</option>
202
- <option value="text-length">Text Length</option>
203
- </select>
204
- </div>
205
-
206
- <button id="export-json" class="export-btn">Export JSON</button>
207
- </div>
208
-
209
- <div id="pages-container">
210
- <!-- Pages will be inserted here -->
211
- </div>
212
-
213
- <script>
214
- // Main OCR data structure with pages and regions
215
- const ocrData = {pages_data};
216
-
217
- // ===== DOM Elements =====
218
- const pagesContainer = document.getElementById('pages-container');
219
- const confidenceFilter = document.getElementById('confidence-filter');
220
- const confidenceValue = document.getElementById('confidence-value');
221
- const textFilter = document.getElementById('text-filter');
222
- const sortBySelect = document.getElementById('sort-by');
223
- const exportButton = document.getElementById('export-json');
224
-
225
- // ===== Rendering Functions =====
226
-
227
- // Render a single page section with its regions
228
- function renderPage(page, pageIndex) {{
229
- const pageDiv = document.createElement('div');
230
- pageDiv.className = 'page-section';
231
- pageDiv.id = `page-${{pageIndex}}`;
232
-
233
- // Page header with controls
234
- const pageTitle = document.createElement('div');
235
- pageTitle.className = 'page-title';
236
-
237
- const pageHeading = document.createElement('h2');
238
- pageHeading.textContent = `Page ${{page.page_number}}`;
239
- pageTitle.appendChild(pageHeading);
240
-
241
- const pageControls = document.createElement('div');
242
- pageControls.className = 'page-controls';
243
-
244
- const toggleImageBtn = document.createElement('button');
245
- toggleImageBtn.className = 'toggle-btn';
246
- toggleImageBtn.textContent = 'Show Full Image';
247
- toggleImageBtn.onclick = () => toggleFullImage(pageIndex);
248
- pageControls.appendChild(toggleImageBtn);
249
-
250
- pageTitle.appendChild(pageControls);
251
- pageDiv.appendChild(pageTitle);
252
-
253
- // Full page image (hidden by default)
254
- const pageImage = document.createElement('img');
255
- pageImage.src = page.image;
256
- pageImage.className = 'page-image';
257
- pageImage.id = `page-image-${{pageIndex}}`;
258
- pageImage.alt = `Page ${{page.page_number}}`;
259
- pageDiv.appendChild(pageImage);
260
-
261
- // Table for regions
262
- const table = document.createElement('table');
263
- table.className = 'region-table';
264
-
265
- // Table header
266
- const thead = document.createElement('thead');
267
- const headerRow = document.createElement('tr');
268
-
269
- const headers = ['Confidence', 'Text Region', 'Text Content'];
270
- headers.forEach(header => {{
271
- const th = document.createElement('th');
272
- th.textContent = header;
273
- headerRow.appendChild(th);
274
- }});
275
-
276
- thead.appendChild(headerRow);
277
- table.appendChild(thead);
278
-
279
- // Table body
280
- const tbody = document.createElement('tbody');
281
- tbody.id = `regions-${{pageIndex}}`;
282
-
283
- // Render each region row
284
- page.regions.forEach((region, regionIndex) => {{
285
- const row = renderRegionRow(region, pageIndex, regionIndex, page.image);
286
- tbody.appendChild(row);
287
- }});
288
-
289
- table.appendChild(tbody);
290
- pageDiv.appendChild(table);
291
-
292
- return pageDiv;
293
- }}
294
-
295
- // Render a single region row
296
- function renderRegionRow(region, pageIndex, regionIndex, pageImage) {{
297
- const row = document.createElement('tr');
298
- row.className = 'region-row';
299
- row.dataset.confidence = region.confidence;
300
- row.dataset.text = region.ocr_text;
301
- row.dataset.modified = (region.modified || false).toString();
302
- row.dataset.regionId = `${{pageIndex}}-${{regionIndex}}`;
303
-
304
- // Confidence cell
305
- const confidenceCell = document.createElement('td');
306
- confidenceCell.className = 'confidence';
307
- confidenceCell.textContent = region.confidence.toFixed(2);
308
-
309
- // Set color level based on confidence
310
- if (region.confidence >= 0.8) {{
311
- confidenceCell.dataset.level = 'high';
312
- }} else if (region.confidence >= 0.5) {{
313
- confidenceCell.dataset.level = 'medium';
314
- }} else {{
315
- confidenceCell.dataset.level = 'low';
316
- }}
317
-
318
- row.appendChild(confidenceCell);
319
-
320
- // Image region cell
321
- const imageCell = document.createElement('td');
322
- imageCell.className = 'region-image';
323
-
324
- const imageClip = document.createElement('div');
325
- imageClip.className = 'image-clip';
326
- imageClip.style.backgroundImage = `url('${{pageImage}}')`;
327
-
328
- // Calculate dimensions (scaled by 2.0 to match the image scale)
329
- const width = (region.bbox[2] - region.bbox[0]) * 2.0;
330
- const height = (region.bbox[3] - region.bbox[1]) * 2.0;
331
-
332
- // Calculate background position (negative of the top-left corner)
333
- imageClip.style.backgroundPosition = `-${{region.bbox[0] * 2.0}}px -${{region.bbox[1] * 2.0}}px`;
334
-
335
- // If the image is very large, we'll apply CSS transform scaling instead of
336
- // changing the dimensions directly to maintain proper background position
337
- const maxWidth = 350;
338
- const maxHeight = 250;
339
- let scale = 1;
340
-
341
- if (width > maxWidth || height > maxHeight) {{
342
- const scaleX = maxWidth / width;
343
- const scaleY = maxHeight / height;
344
- scale = Math.min(scaleX, scaleY);
345
- imageClip.style.transform = `scale(${{scale}})`;
346
- }}
347
-
348
- // Set the final dimensions
349
- imageClip.style.width = `${{width}}px`;
350
- imageClip.style.height = `${{height}}px`;
351
-
352
- imageCell.appendChild(imageClip);
353
- row.appendChild(imageCell);
354
-
355
- // Combined text content cell with textarea
356
- const textCell = document.createElement('td');
357
- textCell.className = 'text-content';
358
-
359
- const textArea = document.createElement('textarea');
360
- textArea.className = 'text-content-input';
361
- textArea.value = region.ocr_text;
362
- textArea.rows = Math.max(1, Math.ceil(region.ocr_text.length / 40)); // Approximate rows based on text length
363
- textArea.dataset.pageIndex = pageIndex;
364
- textArea.dataset.regionIndex = regionIndex;
365
- textArea.dataset.originalText = region.ocr_text;
366
-
367
- // Save changes to data structure
368
- textArea.addEventListener('change', (e) => {{
369
- const pIdx = parseInt(e.target.dataset.pageIndex);
370
- const rIdx = parseInt(e.target.dataset.regionIndex);
371
- ocrData.pages[pIdx].regions[rIdx].corrected_text = e.target.value;
372
-
373
- // Update the modified status in the dataset
374
- const isModified = e.target.value !== e.target.dataset.originalText;
375
- ocrData.pages[pIdx].regions[rIdx].modified = isModified;
376
-
377
- // Visual indication of modification through textarea style
378
- if (isModified) {{
379
- e.target.style.borderColor = '#4b6bfb';
380
- e.target.style.backgroundColor = 'rgba(75, 107, 251, 0.05)';
381
- }} else {{
382
- e.target.style.borderColor = '#ddd';
383
- e.target.style.backgroundColor = '';
384
- }}
385
- }});
386
-
387
- textCell.appendChild(textArea);
388
- row.appendChild(textCell);
389
-
390
- // No Modified column needed
391
-
392
- return row;
393
- }}
394
-
395
- // ===== Interactive Functions =====
396
-
397
- // Toggle display of full page image
398
- function toggleFullImage(pageIndex) {{
399
- const image = document.getElementById(`page-image-${{pageIndex}}`);
400
- const button = image.previousElementSibling.querySelector('.toggle-btn');
401
-
402
- if (image.classList.contains('show')) {{
403
- image.classList.remove('show');
404
- button.textContent = 'Show Full Image';
405
- }} else {{
406
- image.classList.add('show');
407
- button.textContent = 'Hide Full Image';
408
- }}
409
- }}
410
-
411
- // Filter regions by confidence
412
- function filterByConfidence(minConfidence) {{
413
- document.querySelectorAll('.region-row').forEach(row => {{
414
- const confidence = parseFloat(row.dataset.confidence);
415
- if (confidence < minConfidence) {{
416
- row.classList.add('hidden');
417
- }} else {{
418
- row.classList.remove('hidden');
419
- }}
420
- }});
421
- }}
422
-
423
- // Filter regions by text content
424
- function filterByText(text) {{
425
- const searchText = text.toLowerCase();
426
- // If no search text, we don't need to do anything
427
- if (!searchText) {{
428
- document.querySelectorAll('.region-row').forEach(row => {{
429
- row.classList.remove('hidden');
430
- }});
431
- return;
432
- }}
433
-
434
- // Filter based on current textarea content (not just original text)
435
- document.querySelectorAll('.region-row').forEach(row => {{
436
- const textarea = row.querySelector('.text-content-input');
437
- const currentText = textarea ? textarea.value.toLowerCase() : row.dataset.text.toLowerCase();
438
-
439
- if (!currentText.includes(searchText)) {{
440
- row.classList.add('hidden');
441
- }} else {{
442
- row.classList.remove('hidden');
443
- }}
444
- }});
445
- }}
446
-
447
- // Sort regions by different criteria
448
- function sortRegions(sortBy) {{
449
- ocrData.pages.forEach((page, pageIndex) => {{
450
- const tbody = document.getElementById(`regions-${{pageIndex}}`);
451
- const rows = Array.from(tbody.querySelectorAll('.region-row'));
452
-
453
- // Sort based on selected criterion
454
- rows.sort((a, b) => {{
455
- switch (sortBy) {{
456
- case 'confidence-asc':
457
- return parseFloat(a.dataset.confidence) - parseFloat(b.dataset.confidence);
458
- case 'confidence-desc':
459
- return parseFloat(b.dataset.confidence) - parseFloat(a.dataset.confidence);
460
- case 'text-length':
461
- return b.dataset.text.length - a.dataset.text.length;
462
- case 'position':
463
- default:
464
- // Default sort by region ID (original position)
465
- return a.dataset.regionId.localeCompare(b.dataset.regionId);
466
- }}
467
- }});
468
-
469
- // Reinsert in sorted order
470
- rows.forEach(row => tbody.appendChild(row));
471
- }});
472
- }}
473
-
474
- // Export data as JSON
475
- function exportJSON() {{
476
- // Create a downloadable JSON with corrected text
477
- const exportData = JSON.stringify(ocrData, null, 2);
478
- const blob = new Blob([exportData], {{type: 'application/json'}});
479
- const url = URL.createObjectURL(blob);
480
-
481
- const a = document.createElement('a');
482
- a.href = url;
483
- a.download = 'ocr_debug_export.json';
484
- document.body.appendChild(a);
485
- a.click();
486
- document.body.removeChild(a);
487
- URL.revokeObjectURL(url);
488
- }}
489
-
490
- // ===== Event Listeners =====
491
-
492
- confidenceFilter.addEventListener('input', (e) => {{
493
- const value = parseFloat(e.target.value);
494
- confidenceValue.textContent = value.toFixed(2);
495
- filterByConfidence(value);
496
- }});
497
-
498
- textFilter.addEventListener('input', (e) => {{
499
- filterByText(e.target.value);
500
- }});
501
-
502
- sortBySelect.addEventListener('change', (e) => {{
503
- sortRegions(e.target.value);
504
- }});
505
-
506
- exportButton.addEventListener('click', exportJSON);
507
-
508
- // ===== Initialize =====
509
-
510
- // Render all pages
511
- ocrData.pages.forEach((page, i) => {{
512
- const pageElement = renderPage(page, i);
513
- pagesContainer.appendChild(pageElement);
514
- }});
515
- </script>
516
- </body>
517
- </html>
@@ -1,61 +0,0 @@
1
- natural_pdf/__init__.py,sha256=hdqbTG3SHtu8jPIL7su6TpEhEbNsL89pgktCXPMKWCI,2825
2
- natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
3
- natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
4
- natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
5
- natural_pdf/analyzers/utils.py,sha256=u5_FAUPmEG1ydPVuxpu7bVw507NB3WzisMNSUhsnukY,2146
6
- natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
7
- natural_pdf/analyzers/layout/base.py,sha256=D6KHDsbVKzZWCfW4vt0khPC3TA9JzQD3cF4VtTSyf28,6752
8
- natural_pdf/analyzers/layout/docling.py,sha256=iNeD10ZfolDVJNqayAUd0-Bs2tVr5INE7WK9c_Mll_8,11930
9
- natural_pdf/analyzers/layout/layout_analyzer.py,sha256=JJasXl7QEiP4DgAvf-zu1w7Uakdf8ypvITkpQ-OQDgA,13340
10
- natural_pdf/analyzers/layout/layout_manager.py,sha256=6Zi9SBonpa0urWyeQBJnmxIL1hOn4xAx09ugkMrEhro,9555
11
- natural_pdf/analyzers/layout/layout_options.py,sha256=EmvPEnDsVGMJkDNfn6ORLnX545gbmlo3kVcz4anVm5Q,3325
12
- natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5Yk7QvRVcw,12519
13
- natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
14
- natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
15
- natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
- natural_pdf/collections/pdf_collection.py,sha256=LLtixKaKRzPRfZNdDQQ7HY3wyWbBcefPYvf_4Ke-FLw,12123
17
- natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
18
- natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
19
- natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
20
- natural_pdf/core/page.py,sha256=7LSqJbGHhpKQliAdcy7aRQzkr8sO9jUP68bzy7uH54U,69305
21
- natural_pdf/core/pdf.py,sha256=ALCO7YB_oaMtGZpS6JHJglrIIDbUd63sSso0oNAAP9k,41140
22
- natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
23
- natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
24
- natural_pdf/elements/collections.py,sha256=G6H-6VtCWq_KW-A0y9XhyHLOIWxz-1vHByfC6dq8lmU,62387
25
- natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
26
- natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
27
- natural_pdf/elements/region.py,sha256=5dXHYbbdO1QNgkD6b6I34ezHt-SHKx_aH1ubzbfMHQs,74370
28
- natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
29
- natural_pdf/exporters/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
30
- natural_pdf/exporters/searchable_pdf.py,sha256=PPkF64hFNNhPlZPuyJRvC_scAg3WCOiIvwgIP8nlZ9E,10225
31
- natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
32
- natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
33
- natural_pdf/ocr/engine_easyocr.py,sha256=6srZhXqlH3UpNWw5iFq7u4TS5HQsMSTWYuuWo3oYZp8,8273
34
- natural_pdf/ocr/engine_paddle.py,sha256=9nnG-0zyEH9fj_QRmxjX6OaxzfyKKmyW_SCs40edTWs,9065
35
- natural_pdf/ocr/engine_surya.py,sha256=gWV_BEuLMqmJcKVlag9i45SsO2uLAtI-dayBm1xbDQ0,7719
36
- natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW8,9934
37
- natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
38
- natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
39
- natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
40
- natural_pdf/search/__init__.py,sha256=sYv7-XrSohUgE2UH8sFpGfl66SG092jZoNokZaDdxsY,4125
41
- natural_pdf/search/haystack_search_service.py,sha256=qhvqVJMxz4-KTnQF0MPO7YLQxTlYe27PCgKJgYeAels,27580
42
- natural_pdf/search/haystack_utils.py,sha256=BXU5yIEcFIWliSX44slMYLlUMfwCXEfve-ZYmVcEt3k,18773
43
- natural_pdf/search/search_options.py,sha256=PrIGkvM9A9wpqaz6tDB-9hWiSp9fqhi8mf7FQl1qoGI,3510
44
- natural_pdf/search/search_service_protocol.py,sha256=5EYzHFUoFvaYw3khnQNz1dsOHqTvBChekvk_qf2mu5w,6811
45
- natural_pdf/search/searchable_mixin.py,sha256=QPtPSJHCP5n0Twp4uHKSns8J6HuvGjyipTNbB66JFLg,24896
46
- natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
47
- natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
48
- natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
49
- natural_pdf/templates/ocr_debug.html,sha256=Zy9StzBeHFQU8ity6cjFSZLe3TY0QOabUux4c5WQUzs,19171
50
- natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
51
- natural_pdf/utils/highlighting.py,sha256=9H8vbWhwgxzjrL7MhAePXUWZZctLPboNocJzy-1TE_g,675
52
- natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6TyzmWU,7594
53
- natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_8aOZ4,8876
54
- natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
55
- natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
56
- natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
57
- natural_pdf-0.1.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
58
- natural_pdf-0.1.4.dist-info/METADATA,sha256=Qbj7uNu_w5OfHexqGGFEi1VQCELaidq670nHDArAtqE,4967
59
- natural_pdf-0.1.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
60
- natural_pdf-0.1.4.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
61
- natural_pdf-0.1.4.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- natural_pdf