natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,334 +0,0 @@
|
|
1
|
-
/* Styles adapted from ocr_debug.css */
|
2
|
-
body {
|
3
|
-
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
4
|
-
line-height: 1.6;
|
5
|
-
color: #333;
|
6
|
-
margin: 0;
|
7
|
-
display: flex;
|
8
|
-
flex-direction: column;
|
9
|
-
min-height: 100vh;
|
10
|
-
}
|
11
|
-
|
12
|
-
header {
|
13
|
-
background-color: #f8f9fa;
|
14
|
-
padding: 10px 20px;
|
15
|
-
border-bottom: 1px solid #ddd;
|
16
|
-
}
|
17
|
-
|
18
|
-
main {
|
19
|
-
flex-grow: 1;
|
20
|
-
max-width: 1400px; /* Wider for SPA layout */
|
21
|
-
width: 100%;
|
22
|
-
margin: 20px auto;
|
23
|
-
padding: 0 20px; /* Add horizontal padding */
|
24
|
-
box-sizing: border-box;
|
25
|
-
}
|
26
|
-
|
27
|
-
footer {
|
28
|
-
background-color: #f8f9fa;
|
29
|
-
padding: 10px 20px;
|
30
|
-
border-top: 1px solid #ddd;
|
31
|
-
text-align: center;
|
32
|
-
font-size: 0.9em;
|
33
|
-
color: #666;
|
34
|
-
margin-top: auto;
|
35
|
-
}
|
36
|
-
|
37
|
-
h1, h2 {
|
38
|
-
color: #2c3e50;
|
39
|
-
margin-top: 0;
|
40
|
-
}
|
41
|
-
|
42
|
-
.loading-message,
|
43
|
-
.initial-message {
|
44
|
-
text-align: center;
|
45
|
-
padding: 40px;
|
46
|
-
font-size: 1.2em;
|
47
|
-
color: #666;
|
48
|
-
}
|
49
|
-
|
50
|
-
.task-loader {
|
51
|
-
margin-bottom: 20px;
|
52
|
-
padding: 15px;
|
53
|
-
border: 1px dashed #ccc;
|
54
|
-
background-color: #f0f0f0;
|
55
|
-
text-align: center;
|
56
|
-
}
|
57
|
-
|
58
|
-
.page-section {
|
59
|
-
margin-bottom: 30px;
|
60
|
-
border: 1px solid #ddd;
|
61
|
-
border-radius: 4px;
|
62
|
-
padding: 15px;
|
63
|
-
background: #f8f9fa;
|
64
|
-
}
|
65
|
-
|
66
|
-
.page-title {
|
67
|
-
display: flex;
|
68
|
-
justify-content: space-between;
|
69
|
-
align-items: center;
|
70
|
-
margin-bottom: 15px;
|
71
|
-
}
|
72
|
-
|
73
|
-
.page-controls {
|
74
|
-
display: flex;
|
75
|
-
gap: 10px;
|
76
|
-
align-items: center;
|
77
|
-
}
|
78
|
-
|
79
|
-
.controls-container {
|
80
|
-
margin-bottom: 20px;
|
81
|
-
display: flex;
|
82
|
-
justify-content: space-between;
|
83
|
-
flex-wrap: wrap;
|
84
|
-
gap: 15px; /* Increased gap */
|
85
|
-
padding: 15px;
|
86
|
-
background: #eee;
|
87
|
-
border-radius: 4px;
|
88
|
-
position: sticky; /* Make controls sticky */
|
89
|
-
top: 0;
|
90
|
-
z-index: 100;
|
91
|
-
border-bottom: 1px solid #ddd;
|
92
|
-
}
|
93
|
-
|
94
|
-
.filter-control {
|
95
|
-
display: flex;
|
96
|
-
align-items: center;
|
97
|
-
gap: 8px;
|
98
|
-
}
|
99
|
-
|
100
|
-
input, select, button {
|
101
|
-
padding: 8px 12px; /* Slightly larger padding */
|
102
|
-
border: 1px solid #ccc;
|
103
|
-
border-radius: 4px;
|
104
|
-
font-size: 14px;
|
105
|
-
vertical-align: middle; /* Align form elements */
|
106
|
-
}
|
107
|
-
|
108
|
-
input[type="range"] {
|
109
|
-
padding: 0; /* Remove padding for range */
|
110
|
-
}
|
111
|
-
|
112
|
-
button {
|
113
|
-
background: #4b6bfb;
|
114
|
-
color: white;
|
115
|
-
cursor: pointer;
|
116
|
-
border: none; /* Remove border */
|
117
|
-
}
|
118
|
-
|
119
|
-
button:hover {
|
120
|
-
background: #3b5de7;
|
121
|
-
}
|
122
|
-
|
123
|
-
button.secondary {
|
124
|
-
background: #6c757d;
|
125
|
-
}
|
126
|
-
button.secondary:hover {
|
127
|
-
background: #5a6268;
|
128
|
-
}
|
129
|
-
|
130
|
-
/* --- NEW FLEXBOX BASED LAYOUT --- */
|
131
|
-
.region-list {
|
132
|
-
/* Container for all region items */
|
133
|
-
margin-top: 15px;
|
134
|
-
border: 1px solid #ddd; /* Add border around the list */
|
135
|
-
border-radius: 4px;
|
136
|
-
overflow: hidden; /* Ensures border radius clips children */
|
137
|
-
}
|
138
|
-
|
139
|
-
.region-item {
|
140
|
-
display: flex;
|
141
|
-
align-items: stretch; /* Make cells same height */
|
142
|
-
border-bottom: 1px solid #ddd; /* Separator line */
|
143
|
-
background-color: #fff;
|
144
|
-
}
|
145
|
-
.region-item:last-child {
|
146
|
-
border-bottom: none; /* Remove border for the last item */
|
147
|
-
}
|
148
|
-
|
149
|
-
.confidence-cell {
|
150
|
-
width: 80px; /* Fixed width for confidence */
|
151
|
-
padding: 10px;
|
152
|
-
text-align: center;
|
153
|
-
white-space: nowrap;
|
154
|
-
border-right: 1px solid #ddd; /* Separator line */
|
155
|
-
display: flex; /* Center content vertically */
|
156
|
-
align-items: center;
|
157
|
-
justify-content: center;
|
158
|
-
flex-shrink: 0; /* Prevent shrinking */
|
159
|
-
}
|
160
|
-
|
161
|
-
/* Confidence background colors (moved from .confidence) */
|
162
|
-
.confidence-cell[data-level="high"] {
|
163
|
-
background-color: rgba(40, 167, 69, 0.1);
|
164
|
-
}
|
165
|
-
.confidence-cell[data-level="medium"] {
|
166
|
-
background-color: rgba(255, 193, 7, 0.1);
|
167
|
-
}
|
168
|
-
.confidence-cell[data-level="low"] {
|
169
|
-
background-color: rgba(220, 53, 69, 0.1);
|
170
|
-
}
|
171
|
-
|
172
|
-
.region-content-cell {
|
173
|
-
flex-grow: 1; /* Take remaining width */
|
174
|
-
padding: 10px;
|
175
|
-
display: flex; /* Use flex to control children */
|
176
|
-
flex-direction: column; /* Stack image and text */
|
177
|
-
/* background-color: #fafafa; /* Optional: Slight background difference */
|
178
|
-
}
|
179
|
-
|
180
|
-
.image-clip {
|
181
|
-
/* Styles for the image snippet div */
|
182
|
-
position: relative;
|
183
|
-
overflow: hidden;
|
184
|
-
background-repeat: no-repeat;
|
185
|
-
border-radius: 3px;
|
186
|
-
box-shadow: 0 1px 3px rgba(0,0,0,0.2);
|
187
|
-
margin-bottom: 10px; /* Space below image */
|
188
|
-
/* Use inline style for width/height/background */
|
189
|
-
max-width: 350px; /* Max width within the cell */
|
190
|
-
max-height: 250px; /* Max height */
|
191
|
-
transform-origin: top left;
|
192
|
-
border: 1px solid #ccc;
|
193
|
-
background-color: #f8f8f8; /* Placeholder background */
|
194
|
-
align-self: flex-start; /* Align to start if cell is wider */
|
195
|
-
/* display: inline-block; /* Remove this if using flex parent */
|
196
|
-
}
|
197
|
-
|
198
|
-
.editing-content {
|
199
|
-
font-size: 18px;
|
200
|
-
text-align: center;
|
201
|
-
}
|
202
|
-
|
203
|
-
.text-content-input {
|
204
|
-
/* Styles for the textarea */
|
205
|
-
text-align: center;
|
206
|
-
width: 100%; /* Full width of the cell */
|
207
|
-
font-family: monospace;
|
208
|
-
padding: 8px;
|
209
|
-
font-size: 16px;
|
210
|
-
line-height: 1.4;
|
211
|
-
white-space: pre-wrap;
|
212
|
-
word-break: break-all;
|
213
|
-
border: 1px solid #ddd;
|
214
|
-
border-radius: 4px;
|
215
|
-
resize: vertical;
|
216
|
-
min-height: 50px; /* Slightly taller */
|
217
|
-
box-sizing: border-box;
|
218
|
-
}
|
219
|
-
|
220
|
-
.text-content-input.modified {
|
221
|
-
border-color: #4b6bfb;
|
222
|
-
background-color: rgba(75, 107, 251, 0.05);
|
223
|
-
}
|
224
|
-
|
225
|
-
.text-content-input:focus {
|
226
|
-
border-color: #4b6bfb;
|
227
|
-
outline: none;
|
228
|
-
box-shadow: 0 0 0 2px rgba(75, 107, 251, 0.25);
|
229
|
-
}
|
230
|
-
/* --- END NEW FLEXBOX BASED LAYOUT --- */
|
231
|
-
|
232
|
-
|
233
|
-
/* --- REMOVED OLD TABLE STYLES --- */
|
234
|
-
/*
|
235
|
-
.region-table { ... }
|
236
|
-
.region-table th, .region-table td { ... }
|
237
|
-
.region-table th { ... }
|
238
|
-
.region-image { ... }
|
239
|
-
.confidence { ... }
|
240
|
-
.text-content { ... }
|
241
|
-
*/
|
242
|
-
/* --- END REMOVED OLD TABLE STYLES --- */
|
243
|
-
|
244
|
-
|
245
|
-
.hidden {
|
246
|
-
display: none !important;
|
247
|
-
}
|
248
|
-
|
249
|
-
.toggle-btn {
|
250
|
-
background: #eee;
|
251
|
-
color: #333;
|
252
|
-
border: 1px solid #ccc;
|
253
|
-
padding: 3px 8px;
|
254
|
-
border-radius: 3px;
|
255
|
-
cursor: pointer;
|
256
|
-
font-size: 12px;
|
257
|
-
}
|
258
|
-
|
259
|
-
.toggle-btn:hover {
|
260
|
-
background: #ddd;
|
261
|
-
}
|
262
|
-
|
263
|
-
.export-btn {
|
264
|
-
margin-left: auto;
|
265
|
-
}
|
266
|
-
|
267
|
-
.page-image {
|
268
|
-
max-width: 100%;
|
269
|
-
height: auto;
|
270
|
-
margin-bottom: 15px;
|
271
|
-
border: 1px solid #ddd;
|
272
|
-
display: none; /* Hidden by default */
|
273
|
-
}
|
274
|
-
|
275
|
-
.page-image.show {
|
276
|
-
display: block;
|
277
|
-
}
|
278
|
-
|
279
|
-
/* Responsive adjustments */
|
280
|
-
@media (max-width: 800px) {
|
281
|
-
/* REMOVED table-specific responsive styles */
|
282
|
-
/*
|
283
|
-
.region-table thead { ... }
|
284
|
-
.region-table, .region-table tbody, .region-table tr, .region-table td { ... }
|
285
|
-
.region-table tr { ... }
|
286
|
-
.region-table td { ... }
|
287
|
-
.region-table td::before { ... }
|
288
|
-
.region-table td:last-child { ... }
|
289
|
-
.region-image, .text-content { ... }
|
290
|
-
.confidence { ... }
|
291
|
-
*/
|
292
|
-
|
293
|
-
/* NEW Flexbox Responsive Styles */
|
294
|
-
.region-item {
|
295
|
-
flex-direction: column; /* Stack columns vertically on mobile */
|
296
|
-
align-items: stretch; /* Make items full width */
|
297
|
-
}
|
298
|
-
|
299
|
-
.confidence-cell {
|
300
|
-
width: auto; /* Allow confidence to take full width */
|
301
|
-
border-right: none; /* Remove right border */
|
302
|
-
border-bottom: 1px solid #ddd; /* Add bottom border */
|
303
|
-
justify-content: flex-start; /* Align text left */
|
304
|
-
padding: 5px 10px; /* Adjust padding */
|
305
|
-
}
|
306
|
-
/* Optional: Add label back using ::before */
|
307
|
-
.confidence-cell::before {
|
308
|
-
content: "Confidence: ";
|
309
|
-
font-weight: bold;
|
310
|
-
margin-right: 5px;
|
311
|
-
}
|
312
|
-
|
313
|
-
.region-content-cell {
|
314
|
-
padding: 10px; /* Reset padding */
|
315
|
-
}
|
316
|
-
|
317
|
-
.image-clip {
|
318
|
-
max-width: 100%; /* Allow image full width */
|
319
|
-
align-self: center; /* Center image */
|
320
|
-
/* Remove fixed max-height? */
|
321
|
-
/* max-height: none; */
|
322
|
-
}
|
323
|
-
|
324
|
-
.controls-container {
|
325
|
-
flex-direction: column;
|
326
|
-
align-items: stretch;
|
327
|
-
position: static; /* Unstick controls */
|
328
|
-
}
|
329
|
-
|
330
|
-
.export-btn {
|
331
|
-
margin-left: 0;
|
332
|
-
margin-top: 10px;
|
333
|
-
}
|
334
|
-
}
|
@@ -1,31 +0,0 @@
|
|
1
|
-
<!DOCTYPE html>
|
2
|
-
<html lang="en">
|
3
|
-
<head>
|
4
|
-
<meta charset="UTF-8">
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
-
<title>OCR Correction Tool</title>
|
7
|
-
<link rel="stylesheet" href="css/style.css">
|
8
|
-
</head>
|
9
|
-
<body>
|
10
|
-
<header>
|
11
|
-
<h1>OCR Correction Tool</h1>
|
12
|
-
</header>
|
13
|
-
|
14
|
-
<main id="app">
|
15
|
-
<p>Loading application...</p>
|
16
|
-
</main>
|
17
|
-
|
18
|
-
<footer>
|
19
|
-
<p>Generated by natural-pdf</p>
|
20
|
-
</footer>
|
21
|
-
|
22
|
-
<script src="js/lib/react.development.js"></script>
|
23
|
-
<script src="js/lib/react-dom.development.js"></script>
|
24
|
-
<script src="js/lib/babel.min.js"></script>
|
25
|
-
<script src="js/lib/jszip.min.js"></script>
|
26
|
-
<script src="js/lib/FileSaver.min.js"></script>
|
27
|
-
|
28
|
-
<script type="text/babel" src="js/app.js"></script>
|
29
|
-
|
30
|
-
</body>
|
31
|
-
</html>
|