natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "2fed8e7f",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Working with Regions\n",
|
@@ -13,13 +13,13 @@
|
|
13
13
|
{
|
14
14
|
"cell_type": "code",
|
15
15
|
"execution_count": 1,
|
16
|
-
"id": "
|
16
|
+
"id": "f8632f87",
|
17
17
|
"metadata": {
|
18
18
|
"execution": {
|
19
|
-
"iopub.execute_input": "2025-04-
|
20
|
-
"iopub.status.busy": "2025-04-
|
21
|
-
"iopub.status.idle": "2025-04-
|
22
|
-
"shell.execute_reply": "2025-04-
|
19
|
+
"iopub.execute_input": "2025-04-27T16:32:58.581956Z",
|
20
|
+
"iopub.status.busy": "2025-04-27T16:32:58.581525Z",
|
21
|
+
"iopub.status.idle": "2025-04-27T16:32:58.587927Z",
|
22
|
+
"shell.execute_reply": "2025-04-27T16:32:58.587228Z"
|
23
23
|
}
|
24
24
|
},
|
25
25
|
"outputs": [],
|
@@ -30,20 +30,20 @@
|
|
30
30
|
{
|
31
31
|
"cell_type": "code",
|
32
32
|
"execution_count": 2,
|
33
|
-
"id": "
|
33
|
+
"id": "6e31d1dd",
|
34
34
|
"metadata": {
|
35
35
|
"execution": {
|
36
|
-
"iopub.execute_input": "2025-04-
|
37
|
-
"iopub.status.busy": "2025-04-
|
38
|
-
"iopub.status.idle": "2025-04-
|
39
|
-
"shell.execute_reply": "2025-04-
|
36
|
+
"iopub.execute_input": "2025-04-27T16:32:58.590761Z",
|
37
|
+
"iopub.status.busy": "2025-04-27T16:32:58.590387Z",
|
38
|
+
"iopub.status.idle": "2025-04-27T16:33:04.820259Z",
|
39
|
+
"shell.execute_reply": "2025-04-27T16:33:04.819919Z"
|
40
40
|
}
|
41
41
|
},
|
42
42
|
"outputs": [
|
43
43
|
{
|
44
44
|
"data": {
|
45
45
|
"text/plain": [
|
46
|
-
"'
|
46
|
+
"'INS-UP70N51NCL41R\\nSite: Durham’s Meatpacking Chicago, Ill.\\nDate: February 3, 1905\\nViolation Count: 7\\nSummary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\nThese people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor, their peculiar trouble was that they fell'"
|
47
47
|
]
|
48
48
|
},
|
49
49
|
"execution_count": 2,
|
@@ -75,7 +75,7 @@
|
|
75
75
|
},
|
76
76
|
{
|
77
77
|
"cell_type": "markdown",
|
78
|
-
"id": "
|
78
|
+
"id": "afb3d0f0",
|
79
79
|
"metadata": {},
|
80
80
|
"source": [
|
81
81
|
"## Creating Regions from Elements"
|
@@ -84,20 +84,20 @@
|
|
84
84
|
{
|
85
85
|
"cell_type": "code",
|
86
86
|
"execution_count": 3,
|
87
|
-
"id": "
|
87
|
+
"id": "00b25eb6",
|
88
88
|
"metadata": {
|
89
89
|
"execution": {
|
90
|
-
"iopub.execute_input": "2025-04-
|
91
|
-
"iopub.status.busy": "2025-04-
|
92
|
-
"iopub.status.idle": "2025-04-
|
93
|
-
"shell.execute_reply": "2025-04-
|
90
|
+
"iopub.execute_input": "2025-04-27T16:33:04.821760Z",
|
91
|
+
"iopub.status.busy": "2025-04-27T16:33:04.821475Z",
|
92
|
+
"iopub.status.idle": "2025-04-27T16:33:04.875614Z",
|
93
|
+
"shell.execute_reply": "2025-04-27T16:33:04.875304Z"
|
94
94
|
}
|
95
95
|
},
|
96
96
|
"outputs": [
|
97
97
|
{
|
98
98
|
"data": {
|
99
99
|
"text/plain": [
|
100
|
-
"'
|
100
|
+
"'INS-UP70N51NCL41R\\nSite: Durham’s Meatpacking Chicago, Ill.\\nDate: February 3, 1905\\nViolation Count: 7'"
|
101
101
|
]
|
102
102
|
},
|
103
103
|
"execution_count": 3,
|
@@ -125,7 +125,7 @@
|
|
125
125
|
},
|
126
126
|
{
|
127
127
|
"cell_type": "markdown",
|
128
|
-
"id": "
|
128
|
+
"id": "a87ac43e",
|
129
129
|
"metadata": {},
|
130
130
|
"source": [
|
131
131
|
"## Finding Elements Within Regions"
|
@@ -134,13 +134,13 @@
|
|
134
134
|
{
|
135
135
|
"cell_type": "code",
|
136
136
|
"execution_count": 4,
|
137
|
-
"id": "
|
137
|
+
"id": "b497d980",
|
138
138
|
"metadata": {
|
139
139
|
"execution": {
|
140
|
-
"iopub.execute_input": "2025-04-
|
141
|
-
"iopub.status.busy": "2025-04-
|
142
|
-
"iopub.status.idle": "2025-04-
|
143
|
-
"shell.execute_reply": "2025-04-
|
140
|
+
"iopub.execute_input": "2025-04-27T16:33:04.876933Z",
|
141
|
+
"iopub.status.busy": "2025-04-27T16:33:04.876835Z",
|
142
|
+
"iopub.status.idle": "2025-04-27T16:33:04.915535Z",
|
143
|
+
"shell.execute_reply": "2025-04-27T16:33:04.915248Z"
|
144
144
|
}
|
145
145
|
},
|
146
146
|
"outputs": [
|
@@ -172,7 +172,7 @@
|
|
172
172
|
},
|
173
173
|
{
|
174
174
|
"cell_type": "markdown",
|
175
|
-
"id": "
|
175
|
+
"id": "8c11a00a",
|
176
176
|
"metadata": {},
|
177
177
|
"source": [
|
178
178
|
"## Expanding and Adjusting Regions"
|
@@ -181,20 +181,20 @@
|
|
181
181
|
{
|
182
182
|
"cell_type": "code",
|
183
183
|
"execution_count": 5,
|
184
|
-
"id": "
|
184
|
+
"id": "0e16db82",
|
185
185
|
"metadata": {
|
186
186
|
"execution": {
|
187
|
-
"iopub.execute_input": "2025-04-
|
188
|
-
"iopub.status.busy": "2025-04-
|
189
|
-
"iopub.status.idle": "2025-04-
|
190
|
-
"shell.execute_reply": "2025-04-
|
187
|
+
"iopub.execute_input": "2025-04-27T16:33:04.916815Z",
|
188
|
+
"iopub.status.busy": "2025-04-27T16:33:04.916720Z",
|
189
|
+
"iopub.status.idle": "2025-04-27T16:33:04.954778Z",
|
190
|
+
"shell.execute_reply": "2025-04-27T16:33:04.954482Z"
|
191
191
|
}
|
192
192
|
},
|
193
193
|
"outputs": [
|
194
194
|
{
|
195
195
|
"data": {
|
196
196
|
"text/plain": [
|
197
|
-
"'Summary: Worst of any, however, were the fertilizer men\\nThese people could not be shown to the visitor - for the o\\nvisitor at a hundred yards, and as for the other men, who\\nsome of which there were open vats near the level of the\\ninto the vats; and when they were fished out, there was n\\nexhibiting - sometimes they would be overlooked for days\\nto the world as Durham’s Pure Leaf Lard
|
197
|
+
"'Summary: Worst of any, however, were the fertilizer men\\nThese people could not be shown to the visitor - for the o\\nvisitor at a hundred yards, and as for the other men, who\\nsome of which there were open vats near the level of the\\ninto the vats; and when they were fished out, there was n\\nexhibiting - sometimes they would be overlooked for days\\nto the world as Durham’s Pure Leaf Lard!'"
|
198
198
|
]
|
199
199
|
},
|
200
200
|
"execution_count": 5,
|
@@ -230,7 +230,7 @@
|
|
230
230
|
},
|
231
231
|
{
|
232
232
|
"cell_type": "markdown",
|
233
|
-
"id": "
|
233
|
+
"id": "7c86b3ce",
|
234
234
|
"metadata": {},
|
235
235
|
"source": [
|
236
236
|
"## Creating Bounded Regions"
|
@@ -239,20 +239,20 @@
|
|
239
239
|
{
|
240
240
|
"cell_type": "code",
|
241
241
|
"execution_count": 6,
|
242
|
-
"id": "
|
242
|
+
"id": "7bc305b5",
|
243
243
|
"metadata": {
|
244
244
|
"execution": {
|
245
|
-
"iopub.execute_input": "2025-04-
|
246
|
-
"iopub.status.busy": "2025-04-
|
247
|
-
"iopub.status.idle": "2025-04-
|
248
|
-
"shell.execute_reply": "2025-04-
|
245
|
+
"iopub.execute_input": "2025-04-27T16:33:04.955989Z",
|
246
|
+
"iopub.status.busy": "2025-04-27T16:33:04.955888Z",
|
247
|
+
"iopub.status.idle": "2025-04-27T16:33:04.979625Z",
|
248
|
+
"shell.execute_reply": "2025-04-27T16:33:04.979327Z"
|
249
249
|
}
|
250
250
|
},
|
251
251
|
"outputs": [
|
252
252
|
{
|
253
253
|
"data": {
|
254
254
|
"text/plain": [
|
255
|
-
"'
|
255
|
+
"'Jungle Health and Safety Inspection Service\\nINS-UP70N51NCL41R\\nSite: Durham’s Meatpacking Chicago, Ill.\\nDate: February 3, 1905\\nViolation Count: 7\\nSummary: Worst of any, however, were the fertilizer men...'"
|
256
256
|
]
|
257
257
|
},
|
258
258
|
"execution_count": 6,
|
@@ -277,7 +277,7 @@
|
|
277
277
|
},
|
278
278
|
{
|
279
279
|
"cell_type": "markdown",
|
280
|
-
"id": "
|
280
|
+
"id": "2fd79731",
|
281
281
|
"metadata": {},
|
282
282
|
"source": [
|
283
283
|
"## Working with Multiple Regions"
|
@@ -286,22 +286,22 @@
|
|
286
286
|
{
|
287
287
|
"cell_type": "code",
|
288
288
|
"execution_count": 7,
|
289
|
-
"id": "
|
289
|
+
"id": "65bdc1f2",
|
290
290
|
"metadata": {
|
291
291
|
"execution": {
|
292
|
-
"iopub.execute_input": "2025-04-
|
293
|
-
"iopub.status.busy": "2025-04-
|
294
|
-
"iopub.status.idle": "2025-04-
|
295
|
-
"shell.execute_reply": "2025-04-
|
292
|
+
"iopub.execute_input": "2025-04-27T16:33:04.980921Z",
|
293
|
+
"iopub.status.busy": "2025-04-27T16:33:04.980818Z",
|
294
|
+
"iopub.status.idle": "2025-04-27T16:33:05.040137Z",
|
295
|
+
"shell.execute_reply": "2025-04-27T16:33:05.039848Z"
|
296
296
|
}
|
297
297
|
},
|
298
298
|
"outputs": [
|
299
299
|
{
|
300
300
|
"data": {
|
301
301
|
"text/plain": [
|
302
|
-
"{'header': '
|
303
|
-
" 'main': 'ruary 3, 1905
|
304
|
-
" 'footer': '
|
302
|
+
"{'header': 'Jungle Health and Safety Inspection Service\\nINS-UP70N51NCL41R\\nSite: Durham’s Meatpacking Chicago, Ill.',\n",
|
303
|
+
" 'main': 'ruary 3, 1905\\nCount: 7\\nWorst of any, however, were the fertilizer men, and those who served in the c...',\n",
|
304
|
+
" 'footer': 'Jungle Health and Safety Inspection Service'}"
|
305
305
|
]
|
306
306
|
},
|
307
307
|
"execution_count": 7,
|
@@ -333,7 +333,7 @@
|
|
333
333
|
},
|
334
334
|
{
|
335
335
|
"cell_type": "markdown",
|
336
|
-
"id": "
|
336
|
+
"id": "f0b33ace",
|
337
337
|
"metadata": {},
|
338
338
|
"source": [
|
339
339
|
"## Creating an Image of a Region"
|
@@ -342,13 +342,13 @@
|
|
342
342
|
{
|
343
343
|
"cell_type": "code",
|
344
344
|
"execution_count": 8,
|
345
|
-
"id": "
|
345
|
+
"id": "92e5cfa0",
|
346
346
|
"metadata": {
|
347
347
|
"execution": {
|
348
|
-
"iopub.execute_input": "2025-04-
|
349
|
-
"iopub.status.busy": "2025-04-
|
350
|
-
"iopub.status.idle": "2025-04-
|
351
|
-
"shell.execute_reply": "2025-04-
|
348
|
+
"iopub.execute_input": "2025-04-27T16:33:05.041446Z",
|
349
|
+
"iopub.status.busy": "2025-04-27T16:33:05.041342Z",
|
350
|
+
"iopub.status.idle": "2025-04-27T16:33:05.085610Z",
|
351
|
+
"shell.execute_reply": "2025-04-27T16:33:05.085305Z"
|
352
352
|
}
|
353
353
|
},
|
354
354
|
"outputs": [
|
@@ -378,7 +378,7 @@
|
|
378
378
|
},
|
379
379
|
{
|
380
380
|
"cell_type": "markdown",
|
381
|
-
"id": "
|
381
|
+
"id": "898822db",
|
382
382
|
"metadata": {},
|
383
383
|
"source": [
|
384
384
|
"Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need. "
|
@@ -389,11 +389,12 @@
|
|
389
389
|
"jupytext": {
|
390
390
|
"cell_metadata_filter": "-all",
|
391
391
|
"main_language": "python",
|
392
|
-
"notebook_metadata_filter": "-all"
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
392
|
+
"notebook_metadata_filter": "-all"
|
393
|
+
},
|
394
|
+
"kernelspec": {
|
395
|
+
"display_name": "Python (natural-pdf)",
|
396
|
+
"language": "python",
|
397
|
+
"name": "natural-pdf"
|
397
398
|
},
|
398
399
|
"language_info": {
|
399
400
|
"codemirror_mode": {
|