natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,517 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
+
<title>OCR Debug Report</title>
|
7
|
+
<style>
|
8
|
+
body {{
|
9
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
10
|
+
line-height: 1.6;
|
11
|
+
color: #333;
|
12
|
+
max-width: 1200px;
|
13
|
+
margin: 0 auto;
|
14
|
+
padding: 20px;
|
15
|
+
}}
|
16
|
+
h1, h2 {{
|
17
|
+
color: #2c3e50;
|
18
|
+
}}
|
19
|
+
.page-section {{
|
20
|
+
margin-bottom: 30px;
|
21
|
+
border: 1px solid #ddd;
|
22
|
+
border-radius: 4px;
|
23
|
+
padding: 15px;
|
24
|
+
background: #f8f9fa;
|
25
|
+
}}
|
26
|
+
.page-title {{
|
27
|
+
display: flex;
|
28
|
+
justify-content: space-between;
|
29
|
+
align-items: center;
|
30
|
+
margin-bottom: 15px;
|
31
|
+
}}
|
32
|
+
.page-controls {{
|
33
|
+
display: flex;
|
34
|
+
gap: 10px;
|
35
|
+
align-items: center;
|
36
|
+
}}
|
37
|
+
.controls {{
|
38
|
+
margin-bottom: 20px;
|
39
|
+
display: flex;
|
40
|
+
justify-content: space-between;
|
41
|
+
flex-wrap: wrap;
|
42
|
+
gap: 10px;
|
43
|
+
padding: 10px;
|
44
|
+
background: #f0f0f0;
|
45
|
+
border-radius: 4px;
|
46
|
+
}}
|
47
|
+
.filter-control {{
|
48
|
+
display: flex;
|
49
|
+
align-items: center;
|
50
|
+
gap: 8px;
|
51
|
+
}}
|
52
|
+
input, select, button {{
|
53
|
+
padding: 6px 12px;
|
54
|
+
border: 1px solid #ccc;
|
55
|
+
border-radius: 4px;
|
56
|
+
font-size: 14px;
|
57
|
+
}}
|
58
|
+
button {{
|
59
|
+
background: #4b6bfb;
|
60
|
+
color: white;
|
61
|
+
cursor: pointer;
|
62
|
+
}}
|
63
|
+
button:hover {{
|
64
|
+
background: #3b5de7;
|
65
|
+
}}
|
66
|
+
.region-table {{
|
67
|
+
width: 100%;
|
68
|
+
border-collapse: collapse;
|
69
|
+
}}
|
70
|
+
.region-table th,
|
71
|
+
.region-table td {{
|
72
|
+
padding: 8px;
|
73
|
+
border: 1px solid #ddd;
|
74
|
+
text-align: left;
|
75
|
+
vertical-align: top;
|
76
|
+
}}
|
77
|
+
.region-table th {{
|
78
|
+
background: #f2f2f2;
|
79
|
+
position: sticky;
|
80
|
+
top: 0;
|
81
|
+
z-index: 10;
|
82
|
+
}}
|
83
|
+
.region-image {{
|
84
|
+
width: 35%; /* Limit image cell width */
|
85
|
+
}}
|
86
|
+
.image-clip {{
|
87
|
+
position: relative;
|
88
|
+
overflow: hidden;
|
89
|
+
background-repeat: no-repeat;
|
90
|
+
border-radius: 3px;
|
91
|
+
box-shadow: 0 1px 3px rgba(0,0,0,0.2);
|
92
|
+
margin: 0 auto;
|
93
|
+
max-width: 350px; /* Maximum width */
|
94
|
+
max-height: 250px; /* Maximum height */
|
95
|
+
transform-origin: top left; /* For scaling */
|
96
|
+
}}
|
97
|
+
.confidence {{
|
98
|
+
width: 90px;
|
99
|
+
text-align: center;
|
100
|
+
white-space: nowrap;
|
101
|
+
}}
|
102
|
+
.confidence[data-level="high"] {{
|
103
|
+
background-color: rgba(0, 200, 0, 0.2);
|
104
|
+
}}
|
105
|
+
.confidence[data-level="medium"] {{
|
106
|
+
background-color: rgba(255, 200, 0, 0.2);
|
107
|
+
}}
|
108
|
+
.confidence[data-level="low"] {{
|
109
|
+
background-color: rgba(255, 0, 0, 0.2);
|
110
|
+
}}
|
111
|
+
.text-content {{
|
112
|
+
width: 60%;
|
113
|
+
}}
|
114
|
+
.text-content-input {{
|
115
|
+
width: 100%;
|
116
|
+
font-family: monospace;
|
117
|
+
padding: 8px;
|
118
|
+
line-height: 1.4;
|
119
|
+
white-space: pre-wrap;
|
120
|
+
word-break: break-all;
|
121
|
+
border: 1px solid #ddd;
|
122
|
+
border-radius: 4px;
|
123
|
+
resize: vertical;
|
124
|
+
}}
|
125
|
+
.text-content-input:focus {{
|
126
|
+
border-color: #4b6bfb;
|
127
|
+
outline: none;
|
128
|
+
box-shadow: 0 0 0 2px rgba(75, 107, 251, 0.25);
|
129
|
+
}}
|
130
|
+
.modified-status {{
|
131
|
+
text-align: center;
|
132
|
+
width: 80px;
|
133
|
+
}}
|
134
|
+
.modified-checkbox {{
|
135
|
+
width: 18px;
|
136
|
+
height: 18px;
|
137
|
+
cursor: not-allowed;
|
138
|
+
}}
|
139
|
+
.hidden {{
|
140
|
+
display: none;
|
141
|
+
}}
|
142
|
+
.toggle-btn {{
|
143
|
+
background: #eee;
|
144
|
+
color: #333;
|
145
|
+
border: 1px solid #ccc;
|
146
|
+
padding: 3px 8px;
|
147
|
+
border-radius: 3px;
|
148
|
+
cursor: pointer;
|
149
|
+
font-size: 12px;
|
150
|
+
}}
|
151
|
+
.toggle-btn:hover {{
|
152
|
+
background: #ddd;
|
153
|
+
}}
|
154
|
+
.export-btn {{
|
155
|
+
margin-left: auto;
|
156
|
+
}}
|
157
|
+
.page-image {{
|
158
|
+
max-width: 100%;
|
159
|
+
height: auto;
|
160
|
+
margin-bottom: 15px;
|
161
|
+
border: 1px solid #ddd;
|
162
|
+
display: none;
|
163
|
+
}}
|
164
|
+
.show {{
|
165
|
+
display: block;
|
166
|
+
}}
|
167
|
+
|
168
|
+
@media (max-width: 800px) {{
|
169
|
+
.region-table, .region-table tbody, .region-table tr, .region-table td, .region-table th {{
|
170
|
+
display: block;
|
171
|
+
}}
|
172
|
+
.region-table td {{
|
173
|
+
margin-bottom: 8px;
|
174
|
+
}}
|
175
|
+
.region-table th {{
|
176
|
+
position: static;
|
177
|
+
}}
|
178
|
+
}}
|
179
|
+
</style>
|
180
|
+
</head>
|
181
|
+
<body>
|
182
|
+
<h1>OCR Debug Report</h1>
|
183
|
+
|
184
|
+
<div class="controls">
|
185
|
+
<div class="filter-control">
|
186
|
+
<label for="confidence-filter">Min Confidence:</label>
|
187
|
+
<input type="range" id="confidence-filter" min="0" max="1" step="0.05" value="0">
|
188
|
+
<span id="confidence-value">0</span>
|
189
|
+
</div>
|
190
|
+
|
191
|
+
<div class="filter-control">
|
192
|
+
<label for="text-filter">Text Filter:</label>
|
193
|
+
<input type="text" id="text-filter" placeholder="Filter text...">
|
194
|
+
</div>
|
195
|
+
|
196
|
+
<div class="filter-control">
|
197
|
+
<label for="sort-by">Sort By:</label>
|
198
|
+
<select id="sort-by">
|
199
|
+
<option value="position">Position (default)</option>
|
200
|
+
<option value="confidence-asc">Confidence (Low to High)</option>
|
201
|
+
<option value="confidence-desc">Confidence (High to Low)</option>
|
202
|
+
<option value="text-length">Text Length</option>
|
203
|
+
</select>
|
204
|
+
</div>
|
205
|
+
|
206
|
+
<button id="export-json" class="export-btn">Export JSON</button>
|
207
|
+
</div>
|
208
|
+
|
209
|
+
<div id="pages-container">
|
210
|
+
<!-- Pages will be inserted here -->
|
211
|
+
</div>
|
212
|
+
|
213
|
+
<script>
|
214
|
+
// OCR payload for the whole report (pages, each with its regions).
// NOTE(review): the single-braced token below looks like a placeholder that is
// substituted before the page is opened - confirm against the code that fills
// in this template.
const ocrData = {pages_data};

// ===== DOM Elements =====
// Handles to the static controls declared in the markup above.
const pagesContainer = document.querySelector('#pages-container');
const confidenceFilter = document.querySelector('#confidence-filter');
const confidenceValue = document.querySelector('#confidence-value');
const textFilter = document.querySelector('#text-filter');
const sortBySelect = document.querySelector('#sort-by');
const exportButton = document.querySelector('#export-json');
|
224
|
+
|
225
|
+
// ===== Rendering Functions =====
|
226
|
+
|
227
|
+
// Render a single page section with its regions
|
228
|
+
function renderPage(page, pageIndex) {{
|
229
|
+
const pageDiv = document.createElement('div');
|
230
|
+
pageDiv.className = 'page-section';
|
231
|
+
pageDiv.id = `page-${{pageIndex}}`;
|
232
|
+
|
233
|
+
// Page header with controls
|
234
|
+
const pageTitle = document.createElement('div');
|
235
|
+
pageTitle.className = 'page-title';
|
236
|
+
|
237
|
+
const pageHeading = document.createElement('h2');
|
238
|
+
pageHeading.textContent = `Page ${{page.page_number}}`;
|
239
|
+
pageTitle.appendChild(pageHeading);
|
240
|
+
|
241
|
+
const pageControls = document.createElement('div');
|
242
|
+
pageControls.className = 'page-controls';
|
243
|
+
|
244
|
+
const toggleImageBtn = document.createElement('button');
|
245
|
+
toggleImageBtn.className = 'toggle-btn';
|
246
|
+
toggleImageBtn.textContent = 'Show Full Image';
|
247
|
+
toggleImageBtn.onclick = () => toggleFullImage(pageIndex);
|
248
|
+
pageControls.appendChild(toggleImageBtn);
|
249
|
+
|
250
|
+
pageTitle.appendChild(pageControls);
|
251
|
+
pageDiv.appendChild(pageTitle);
|
252
|
+
|
253
|
+
// Full page image (hidden by default)
|
254
|
+
const pageImage = document.createElement('img');
|
255
|
+
pageImage.src = page.image;
|
256
|
+
pageImage.className = 'page-image';
|
257
|
+
pageImage.id = `page-image-${{pageIndex}}`;
|
258
|
+
pageImage.alt = `Page ${{page.page_number}}`;
|
259
|
+
pageDiv.appendChild(pageImage);
|
260
|
+
|
261
|
+
// Table for regions
|
262
|
+
const table = document.createElement('table');
|
263
|
+
table.className = 'region-table';
|
264
|
+
|
265
|
+
// Table header
|
266
|
+
const thead = document.createElement('thead');
|
267
|
+
const headerRow = document.createElement('tr');
|
268
|
+
|
269
|
+
const headers = ['Confidence', 'Text Region', 'Text Content'];
|
270
|
+
headers.forEach(header => {{
|
271
|
+
const th = document.createElement('th');
|
272
|
+
th.textContent = header;
|
273
|
+
headerRow.appendChild(th);
|
274
|
+
}});
|
275
|
+
|
276
|
+
thead.appendChild(headerRow);
|
277
|
+
table.appendChild(thead);
|
278
|
+
|
279
|
+
// Table body
|
280
|
+
const tbody = document.createElement('tbody');
|
281
|
+
tbody.id = `regions-${{pageIndex}}`;
|
282
|
+
|
283
|
+
// Render each region row
|
284
|
+
page.regions.forEach((region, regionIndex) => {{
|
285
|
+
const row = renderRegionRow(region, pageIndex, regionIndex, page.image);
|
286
|
+
tbody.appendChild(row);
|
287
|
+
}});
|
288
|
+
|
289
|
+
table.appendChild(tbody);
|
290
|
+
pageDiv.appendChild(table);
|
291
|
+
|
292
|
+
return pageDiv;
|
293
|
+
}}
|
294
|
+
|
295
|
+
// Render a single region row
|
296
|
+
function renderRegionRow(region, pageIndex, regionIndex, pageImage) {{
|
297
|
+
const row = document.createElement('tr');
|
298
|
+
row.className = 'region-row';
|
299
|
+
row.dataset.confidence = region.confidence;
|
300
|
+
row.dataset.text = region.ocr_text;
|
301
|
+
row.dataset.modified = (region.modified || false).toString();
|
302
|
+
row.dataset.regionId = `${{pageIndex}}-${{regionIndex}}`;
|
303
|
+
|
304
|
+
// Confidence cell
|
305
|
+
const confidenceCell = document.createElement('td');
|
306
|
+
confidenceCell.className = 'confidence';
|
307
|
+
confidenceCell.textContent = region.confidence.toFixed(2);
|
308
|
+
|
309
|
+
// Set color level based on confidence
|
310
|
+
if (region.confidence >= 0.8) {{
|
311
|
+
confidenceCell.dataset.level = 'high';
|
312
|
+
}} else if (region.confidence >= 0.5) {{
|
313
|
+
confidenceCell.dataset.level = 'medium';
|
314
|
+
}} else {{
|
315
|
+
confidenceCell.dataset.level = 'low';
|
316
|
+
}}
|
317
|
+
|
318
|
+
row.appendChild(confidenceCell);
|
319
|
+
|
320
|
+
// Image region cell
|
321
|
+
const imageCell = document.createElement('td');
|
322
|
+
imageCell.className = 'region-image';
|
323
|
+
|
324
|
+
const imageClip = document.createElement('div');
|
325
|
+
imageClip.className = 'image-clip';
|
326
|
+
imageClip.style.backgroundImage = `url('${{pageImage}}')`;
|
327
|
+
|
328
|
+
// Calculate dimensions (scaled by 2.0 to match the image scale)
|
329
|
+
const width = (region.bbox[2] - region.bbox[0]) * 2.0;
|
330
|
+
const height = (region.bbox[3] - region.bbox[1]) * 2.0;
|
331
|
+
|
332
|
+
// Calculate background position (negative of the top-left corner)
|
333
|
+
imageClip.style.backgroundPosition = `-${{region.bbox[0] * 2.0}}px -${{region.bbox[1] * 2.0}}px`;
|
334
|
+
|
335
|
+
// If the image is very large, we'll apply CSS transform scaling instead of
|
336
|
+
// changing the dimensions directly to maintain proper background position
|
337
|
+
const maxWidth = 350;
|
338
|
+
const maxHeight = 250;
|
339
|
+
let scale = 1;
|
340
|
+
|
341
|
+
if (width > maxWidth || height > maxHeight) {{
|
342
|
+
const scaleX = maxWidth / width;
|
343
|
+
const scaleY = maxHeight / height;
|
344
|
+
scale = Math.min(scaleX, scaleY);
|
345
|
+
imageClip.style.transform = `scale(${{scale}})`;
|
346
|
+
}}
|
347
|
+
|
348
|
+
// Set the final dimensions
|
349
|
+
imageClip.style.width = `${{width}}px`;
|
350
|
+
imageClip.style.height = `${{height}}px`;
|
351
|
+
|
352
|
+
imageCell.appendChild(imageClip);
|
353
|
+
row.appendChild(imageCell);
|
354
|
+
|
355
|
+
// Combined text content cell with textarea
|
356
|
+
const textCell = document.createElement('td');
|
357
|
+
textCell.className = 'text-content';
|
358
|
+
|
359
|
+
const textArea = document.createElement('textarea');
|
360
|
+
textArea.className = 'text-content-input';
|
361
|
+
textArea.value = region.ocr_text;
|
362
|
+
textArea.rows = Math.max(1, Math.ceil(region.ocr_text.length / 40)); // Approximate rows based on text length
|
363
|
+
textArea.dataset.pageIndex = pageIndex;
|
364
|
+
textArea.dataset.regionIndex = regionIndex;
|
365
|
+
textArea.dataset.originalText = region.ocr_text;
|
366
|
+
|
367
|
+
// Save changes to data structure
|
368
|
+
textArea.addEventListener('change', (e) => {{
|
369
|
+
const pIdx = parseInt(e.target.dataset.pageIndex);
|
370
|
+
const rIdx = parseInt(e.target.dataset.regionIndex);
|
371
|
+
ocrData.pages[pIdx].regions[rIdx].corrected_text = e.target.value;
|
372
|
+
|
373
|
+
// Update the modified status in the dataset
|
374
|
+
const isModified = e.target.value !== e.target.dataset.originalText;
|
375
|
+
ocrData.pages[pIdx].regions[rIdx].modified = isModified;
|
376
|
+
|
377
|
+
// Visual indication of modification through textarea style
|
378
|
+
if (isModified) {{
|
379
|
+
e.target.style.borderColor = '#4b6bfb';
|
380
|
+
e.target.style.backgroundColor = 'rgba(75, 107, 251, 0.05)';
|
381
|
+
}} else {{
|
382
|
+
e.target.style.borderColor = '#ddd';
|
383
|
+
e.target.style.backgroundColor = '';
|
384
|
+
}}
|
385
|
+
}});
|
386
|
+
|
387
|
+
textCell.appendChild(textArea);
|
388
|
+
row.appendChild(textCell);
|
389
|
+
|
390
|
+
// No Modified column needed
|
391
|
+
|
392
|
+
return row;
|
393
|
+
}}
|
394
|
+
|
395
|
+
// ===== Interactive Functions =====
|
396
|
+
|
397
|
+
// Show or hide the full-page image of the given page and flip the label of
// the button that controls it. The button lives in the title bar, which is
// the element immediately before the image.
function toggleFullImage(pageIndex) {{
    const fullImage = document.getElementById(`page-image-${{pageIndex}}`);
    const toggleButton = fullImage.previousElementSibling.querySelector('.toggle-btn');

    // toggle() reports whether the class is present after the call.
    const isNowVisible = fullImage.classList.toggle('show');
    toggleButton.textContent = isNowVisible ? 'Hide Full Image' : 'Show Full Image';
}}
|
410
|
+
|
411
|
+
// Hide every region row whose confidence is below minConfidence.
//
// The text filter is honoured at the same time: a row stays hidden when it
// does not match the current contents of the text-filter box. Previously this
// function un-hid every row at or above the threshold, silently cancelling an
// active text filter.
function filterByConfidence(minConfidence) {{
    const searchText = textFilter.value.toLowerCase();

    document.querySelectorAll('.region-row').forEach(row => {{
        const confidence = parseFloat(row.dataset.confidence);

        // Match against the current textarea content, falling back to the
        // original OCR text when the row has no textarea.
        const textarea = row.querySelector('.text-content-input');
        const currentText = (textarea ? textarea.value : row.dataset.text).toLowerCase();

        const belowThreshold = confidence < minConfidence;
        const failsTextFilter = searchText !== '' && !currentText.includes(searchText);

        row.classList.toggle('hidden', belowThreshold || failsTextFilter);
    }});
}}
|
422
|
+
|
423
|
+
// Hide every region row whose current text does not contain the search text
// (case-insensitive). An empty search text matches every row.
//
// The confidence slider is honoured at the same time: rows below the current
// threshold stay hidden. Previously typing in, or clearing, the search box
// un-hid rows that the confidence filter had hidden.
function filterByText(text) {{
    const searchText = text.toLowerCase();
    // An unparsable slider value yields NaN, which disables the confidence
    // check because every comparison with NaN is false.
    const minConfidence = parseFloat(confidenceFilter.value);

    document.querySelectorAll('.region-row').forEach(row => {{
        // Match against the current textarea content (not just the original
        // OCR text), falling back to the original when there is no textarea.
        const textarea = row.querySelector('.text-content-input');
        const currentText = (textarea ? textarea.value : row.dataset.text).toLowerCase();

        const failsTextFilter = searchText !== '' && !currentText.includes(searchText);
        const belowThreshold = parseFloat(row.dataset.confidence) < minConfidence;

        row.classList.toggle('hidden', failsTextFilter || belowThreshold);
    }});
}}
|
446
|
+
|
447
|
+
// Re-order the region rows of every page in place.
//
// sortBy is one of:
//   'confidence-asc'  - lowest confidence first
//   'confidence-desc' - highest confidence first
//   'text-length'     - longest text first, using the current textarea
//                       content so that edits are taken into account
//   'position' or anything else - original region order
//
// Original order is restored by comparing the numeric region index taken from
// the row's region id. Comparing the ids as strings would place region 10
// before region 2.
function sortRegions(sortBy) {{
    const currentText = row => {{
        const textarea = row.querySelector('.text-content-input');
        return textarea ? textarea.value : row.dataset.text;
    }};
    // Region ids have the form pageIndex-regionIndex.
    const regionIndexOf = row => Number.parseInt(row.dataset.regionId.split('-')[1], 10);
    const confidenceOf = row => parseFloat(row.dataset.confidence);

    ocrData.pages.forEach((page, pageIndex) => {{
        const tbody = document.getElementById(`regions-${{pageIndex}}`);
        if (!tbody) {{
            // Page was not rendered; nothing to sort.
            return;
        }}
        const rows = Array.from(tbody.querySelectorAll('.region-row'));

        rows.sort((a, b) => {{
            if (sortBy === 'confidence-asc') {{
                return confidenceOf(a) - confidenceOf(b);
            }}
            if (sortBy === 'confidence-desc') {{
                return confidenceOf(b) - confidenceOf(a);
            }}
            if (sortBy === 'text-length') {{
                return currentText(b).length - currentText(a).length;
            }}
            return regionIndexOf(a) - regionIndexOf(b);
        }});

        // Appending an element that is already in the tree moves it, so this
        // re-inserts the rows in sorted order.
        rows.forEach(row => tbody.appendChild(row));
    }});
}}
|
473
|
+
|
474
|
+
// Export data as JSON
|
475
|
+
function exportJSON() {{
|
476
|
+
// Create a downloadable JSON with corrected text
|
477
|
+
const exportData = JSON.stringify(ocrData, null, 2);
|
478
|
+
const blob = new Blob([exportData], {{type: 'application/json'}});
|
479
|
+
const url = URL.createObjectURL(blob);
|
480
|
+
|
481
|
+
const a = document.createElement('a');
|
482
|
+
a.href = url;
|
483
|
+
a.download = 'ocr_debug_export.json';
|
484
|
+
document.body.appendChild(a);
|
485
|
+
a.click();
|
486
|
+
document.body.removeChild(a);
|
487
|
+
URL.revokeObjectURL(url);
|
488
|
+
}}
|
489
|
+
|
490
|
+
// ===== Event Listeners =====

// Confidence slider: echo the threshold with two decimals, then re-filter.
confidenceFilter.addEventListener('input', () => {{
    const threshold = parseFloat(confidenceFilter.value);
    confidenceValue.textContent = threshold.toFixed(2);
    filterByConfidence(threshold);
}});

// Text box: re-filter on every keystroke.
textFilter.addEventListener('input', () => {{
    filterByText(textFilter.value);
}});

// Sort selector: re-order rows when a different criterion is chosen.
sortBySelect.addEventListener('change', () => {{
    sortRegions(sortBySelect.value);
}});

exportButton.addEventListener('click', exportJSON);

// ===== Initialize =====

// Render every page, in order, into the pages container.
ocrData.pages.forEach((page, pageIndex) => {{
    pagesContainer.appendChild(renderPage(page, pageIndex));
}});
|
515
|
+
</script>
|
516
|
+
</body>
|
517
|
+
</html>
|