natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/ocr/index.md +34 -47
- docs/tutorials/01-loading-and-extraction.ipynb +60 -46
- docs/tutorials/02-finding-elements.ipynb +42 -42
- docs/tutorials/03-extracting-blocks.ipynb +17 -17
- docs/tutorials/04-table-extraction.ipynb +12 -12
- docs/tutorials/05-excluding-content.ipynb +30 -30
- docs/tutorials/06-document-qa.ipynb +28 -28
- docs/tutorials/07-layout-analysis.ipynb +63 -35
- docs/tutorials/07-working-with-regions.ipynb +55 -51
- docs/tutorials/07-working-with-regions.md +2 -2
- docs/tutorials/08-spatial-navigation.ipynb +60 -60
- docs/tutorials/09-section-extraction.ipynb +113 -113
- docs/tutorials/10-form-field-extraction.ipynb +78 -50
- docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- docs/tutorials/12-ocr-integration.ipynb +149 -131
- docs/tutorials/12-ocr-integration.md +0 -13
- docs/tutorials/13-semantic-search.ipynb +313 -873
- natural_pdf/__init__.py +21 -23
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_manager.py +28 -1
- natural_pdf/analyzers/layout/layout_options.py +11 -0
- natural_pdf/analyzers/layout/yolo.py +6 -2
- natural_pdf/collections/pdf_collection.py +21 -0
- natural_pdf/core/element_manager.py +16 -13
- natural_pdf/core/page.py +165 -36
- natural_pdf/core/pdf.py +146 -41
- natural_pdf/elements/base.py +11 -17
- natural_pdf/elements/collections.py +100 -38
- natural_pdf/elements/region.py +77 -38
- natural_pdf/elements/text.py +5 -0
- natural_pdf/ocr/__init__.py +49 -36
- natural_pdf/ocr/engine.py +146 -51
- natural_pdf/ocr/engine_easyocr.py +141 -161
- natural_pdf/ocr/engine_paddle.py +107 -193
- natural_pdf/ocr/engine_surya.py +75 -148
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +65 -93
- natural_pdf/ocr/ocr_options.py +7 -17
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
- natural_pdf/templates/ocr_debug.html +0 -517
- tests/test_loading.py +0 -50
- tests/test_optional_deps.py +0 -298
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "9e434442",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Finding Specific Elements\n",
|
@@ -15,13 +15,13 @@
|
|
15
15
|
{
|
16
16
|
"cell_type": "code",
|
17
17
|
"execution_count": 1,
|
18
|
-
"id": "
|
18
|
+
"id": "04dee5a2",
|
19
19
|
"metadata": {
|
20
20
|
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-
|
22
|
-
"iopub.status.busy": "2025-04-
|
23
|
-
"iopub.status.idle": "2025-04-
|
24
|
-
"shell.execute_reply": "2025-04-
|
21
|
+
"iopub.execute_input": "2025-04-21T21:23:38.994146Z",
|
22
|
+
"iopub.status.busy": "2025-04-21T21:23:38.993783Z",
|
23
|
+
"iopub.status.idle": "2025-04-21T21:23:38.999468Z",
|
24
|
+
"shell.execute_reply": "2025-04-21T21:23:38.998717Z"
|
25
25
|
},
|
26
26
|
"lines_to_next_cell": 2
|
27
27
|
},
|
@@ -33,13 +33,13 @@
|
|
33
33
|
{
|
34
34
|
"cell_type": "code",
|
35
35
|
"execution_count": 2,
|
36
|
-
"id": "
|
36
|
+
"id": "bf31643c",
|
37
37
|
"metadata": {
|
38
38
|
"execution": {
|
39
|
-
"iopub.execute_input": "2025-04-
|
40
|
-
"iopub.status.busy": "2025-04-
|
41
|
-
"iopub.status.idle": "2025-04-
|
42
|
-
"shell.execute_reply": "2025-04-
|
39
|
+
"iopub.execute_input": "2025-04-21T21:23:39.002568Z",
|
40
|
+
"iopub.status.busy": "2025-04-21T21:23:39.002232Z",
|
41
|
+
"iopub.status.idle": "2025-04-21T21:23:46.097884Z",
|
42
|
+
"shell.execute_reply": "2025-04-21T21:23:46.097468Z"
|
43
43
|
}
|
44
44
|
},
|
45
45
|
"outputs": [
|
@@ -87,7 +87,7 @@
|
|
87
87
|
},
|
88
88
|
{
|
89
89
|
"cell_type": "markdown",
|
90
|
-
"id": "
|
90
|
+
"id": "b4e9d1a6",
|
91
91
|
"metadata": {},
|
92
92
|
"source": [
|
93
93
|
"## Finding Elements by Color\n",
|
@@ -98,13 +98,13 @@
|
|
98
98
|
{
|
99
99
|
"cell_type": "code",
|
100
100
|
"execution_count": 3,
|
101
|
-
"id": "
|
101
|
+
"id": "799cbbf4",
|
102
102
|
"metadata": {
|
103
103
|
"execution": {
|
104
|
-
"iopub.execute_input": "2025-04-
|
105
|
-
"iopub.status.busy": "2025-04-
|
106
|
-
"iopub.status.idle": "2025-04-
|
107
|
-
"shell.execute_reply": "2025-04-
|
104
|
+
"iopub.execute_input": "2025-04-21T21:23:46.100225Z",
|
105
|
+
"iopub.status.busy": "2025-04-21T21:23:46.099846Z",
|
106
|
+
"iopub.status.idle": "2025-04-21T21:23:46.103274Z",
|
107
|
+
"shell.execute_reply": "2025-04-21T21:23:46.102903Z"
|
108
108
|
}
|
109
109
|
},
|
110
110
|
"outputs": [
|
@@ -128,7 +128,7 @@
|
|
128
128
|
},
|
129
129
|
{
|
130
130
|
"cell_type": "markdown",
|
131
|
-
"id": "
|
131
|
+
"id": "0c914d1c",
|
132
132
|
"metadata": {},
|
133
133
|
"source": [
|
134
134
|
"## Finding Lines and Shapes\n",
|
@@ -139,13 +139,13 @@
|
|
139
139
|
{
|
140
140
|
"cell_type": "code",
|
141
141
|
"execution_count": 4,
|
142
|
-
"id": "
|
142
|
+
"id": "0ef67b74",
|
143
143
|
"metadata": {
|
144
144
|
"execution": {
|
145
|
-
"iopub.execute_input": "2025-04-
|
146
|
-
"iopub.status.busy": "2025-04-
|
147
|
-
"iopub.status.idle": "2025-04-
|
148
|
-
"shell.execute_reply": "2025-04-
|
145
|
+
"iopub.execute_input": "2025-04-21T21:23:46.104775Z",
|
146
|
+
"iopub.status.busy": "2025-04-21T21:23:46.104656Z",
|
147
|
+
"iopub.status.idle": "2025-04-21T21:23:46.285442Z",
|
148
|
+
"shell.execute_reply": "2025-04-21T21:23:46.285066Z"
|
149
149
|
}
|
150
150
|
},
|
151
151
|
"outputs": [
|
@@ -181,7 +181,7 @@
|
|
181
181
|
},
|
182
182
|
{
|
183
183
|
"cell_type": "markdown",
|
184
|
-
"id": "
|
184
|
+
"id": "1702ca48",
|
185
185
|
"metadata": {},
|
186
186
|
"source": [
|
187
187
|
"## Finding Elements by Font Properties"
|
@@ -190,13 +190,13 @@
|
|
190
190
|
{
|
191
191
|
"cell_type": "code",
|
192
192
|
"execution_count": 5,
|
193
|
-
"id": "
|
193
|
+
"id": "9a2b6947",
|
194
194
|
"metadata": {
|
195
195
|
"execution": {
|
196
|
-
"iopub.execute_input": "2025-04-
|
197
|
-
"iopub.status.busy": "2025-04-
|
198
|
-
"iopub.status.idle": "2025-04-
|
199
|
-
"shell.execute_reply": "2025-04-
|
196
|
+
"iopub.execute_input": "2025-04-21T21:23:46.287144Z",
|
197
|
+
"iopub.status.busy": "2025-04-21T21:23:46.287028Z",
|
198
|
+
"iopub.status.idle": "2025-04-21T21:23:46.289701Z",
|
199
|
+
"shell.execute_reply": "2025-04-21T21:23:46.289377Z"
|
200
200
|
}
|
201
201
|
},
|
202
202
|
"outputs": [],
|
@@ -211,7 +211,7 @@
|
|
211
211
|
},
|
212
212
|
{
|
213
213
|
"cell_type": "markdown",
|
214
|
-
"id": "
|
214
|
+
"id": "b1701bf0",
|
215
215
|
"metadata": {},
|
216
216
|
"source": [
|
217
217
|
"## Spatial Navigation\n",
|
@@ -222,13 +222,13 @@
|
|
222
222
|
{
|
223
223
|
"cell_type": "code",
|
224
224
|
"execution_count": 6,
|
225
|
-
"id": "
|
225
|
+
"id": "52992c4d",
|
226
226
|
"metadata": {
|
227
227
|
"execution": {
|
228
|
-
"iopub.execute_input": "2025-04-
|
229
|
-
"iopub.status.busy": "2025-04-
|
230
|
-
"iopub.status.idle": "2025-04-
|
231
|
-
"shell.execute_reply": "2025-04-
|
228
|
+
"iopub.execute_input": "2025-04-21T21:23:46.291344Z",
|
229
|
+
"iopub.status.busy": "2025-04-21T21:23:46.291182Z",
|
230
|
+
"iopub.status.idle": "2025-04-21T21:23:46.304254Z",
|
231
|
+
"shell.execute_reply": "2025-04-21T21:23:46.303900Z"
|
232
232
|
}
|
233
233
|
},
|
234
234
|
"outputs": [],
|
@@ -245,7 +245,7 @@
|
|
245
245
|
},
|
246
246
|
{
|
247
247
|
"cell_type": "markdown",
|
248
|
-
"id": "
|
248
|
+
"id": "d8be7257",
|
249
249
|
"metadata": {},
|
250
250
|
"source": [
|
251
251
|
"## Combining Selectors\n",
|
@@ -256,13 +256,13 @@
|
|
256
256
|
{
|
257
257
|
"cell_type": "code",
|
258
258
|
"execution_count": 7,
|
259
|
-
"id": "
|
259
|
+
"id": "d765ee51",
|
260
260
|
"metadata": {
|
261
261
|
"execution": {
|
262
|
-
"iopub.execute_input": "2025-04-
|
263
|
-
"iopub.status.busy": "2025-04-
|
264
|
-
"iopub.status.idle": "2025-04-
|
265
|
-
"shell.execute_reply": "2025-04-
|
262
|
+
"iopub.execute_input": "2025-04-21T21:23:46.305731Z",
|
263
|
+
"iopub.status.busy": "2025-04-21T21:23:46.305613Z",
|
264
|
+
"iopub.status.idle": "2025-04-21T21:23:46.308089Z",
|
265
|
+
"shell.execute_reply": "2025-04-21T21:23:46.307763Z"
|
266
266
|
}
|
267
267
|
},
|
268
268
|
"outputs": [],
|
@@ -276,7 +276,7 @@
|
|
276
276
|
},
|
277
277
|
{
|
278
278
|
"cell_type": "markdown",
|
279
|
-
"id": "
|
279
|
+
"id": "265e503c",
|
280
280
|
"metadata": {},
|
281
281
|
"source": [
|
282
282
|
"<div class=\"admonition note\">\n",
|
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "9c506b6d",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Extracting Text Blocks\n",
|
@@ -15,13 +15,13 @@
|
|
15
15
|
{
|
16
16
|
"cell_type": "code",
|
17
17
|
"execution_count": 1,
|
18
|
-
"id": "
|
18
|
+
"id": "3b4c59a8",
|
19
19
|
"metadata": {
|
20
20
|
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-
|
22
|
-
"iopub.status.busy": "2025-04-
|
23
|
-
"iopub.status.idle": "2025-04-
|
24
|
-
"shell.execute_reply": "2025-04-
|
21
|
+
"iopub.execute_input": "2025-04-21T21:23:49.516795Z",
|
22
|
+
"iopub.status.busy": "2025-04-21T21:23:49.516596Z",
|
23
|
+
"iopub.status.idle": "2025-04-21T21:23:49.520836Z",
|
24
|
+
"shell.execute_reply": "2025-04-21T21:23:49.520398Z"
|
25
25
|
},
|
26
26
|
"lines_to_next_cell": 2
|
27
27
|
},
|
@@ -33,13 +33,13 @@
|
|
33
33
|
{
|
34
34
|
"cell_type": "code",
|
35
35
|
"execution_count": 2,
|
36
|
-
"id": "
|
36
|
+
"id": "2f1392d0",
|
37
37
|
"metadata": {
|
38
38
|
"execution": {
|
39
|
-
"iopub.execute_input": "2025-04-
|
40
|
-
"iopub.status.busy": "2025-04-
|
41
|
-
"iopub.status.idle": "2025-04-
|
42
|
-
"shell.execute_reply": "2025-04-
|
39
|
+
"iopub.execute_input": "2025-04-21T21:23:49.522593Z",
|
40
|
+
"iopub.status.busy": "2025-04-21T21:23:49.522429Z",
|
41
|
+
"iopub.status.idle": "2025-04-21T21:23:56.685334Z",
|
42
|
+
"shell.execute_reply": "2025-04-21T21:23:56.684952Z"
|
43
43
|
}
|
44
44
|
},
|
45
45
|
"outputs": [
|
@@ -81,13 +81,13 @@
|
|
81
81
|
{
|
82
82
|
"cell_type": "code",
|
83
83
|
"execution_count": 3,
|
84
|
-
"id": "
|
84
|
+
"id": "47d1bdd0",
|
85
85
|
"metadata": {
|
86
86
|
"execution": {
|
87
|
-
"iopub.execute_input": "2025-04-
|
88
|
-
"iopub.status.busy": "2025-04-
|
89
|
-
"iopub.status.idle": "2025-04-
|
90
|
-
"shell.execute_reply": "2025-04-
|
87
|
+
"iopub.execute_input": "2025-04-21T21:23:56.687240Z",
|
88
|
+
"iopub.status.busy": "2025-04-21T21:23:56.686879Z",
|
89
|
+
"iopub.status.idle": "2025-04-21T21:23:56.833242Z",
|
90
|
+
"shell.execute_reply": "2025-04-21T21:23:56.832870Z"
|
91
91
|
}
|
92
92
|
},
|
93
93
|
"outputs": [
|
@@ -110,7 +110,7 @@
|
|
110
110
|
},
|
111
111
|
{
|
112
112
|
"cell_type": "markdown",
|
113
|
-
"id": "
|
113
|
+
"id": "f2a17cb4",
|
114
114
|
"metadata": {},
|
115
115
|
"source": [
|
116
116
|
"This selects the elements using `.below(until=...)` and extracts their text. The second code block displays the page image with the visualized section.\n",
|
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "24111eee",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Basic Table Extraction\n",
|
@@ -15,13 +15,13 @@
|
|
15
15
|
{
|
16
16
|
"cell_type": "code",
|
17
17
|
"execution_count": 1,
|
18
|
-
"id": "
|
18
|
+
"id": "75f17900",
|
19
19
|
"metadata": {
|
20
20
|
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-
|
22
|
-
"iopub.status.busy": "2025-04-
|
23
|
-
"iopub.status.idle": "2025-04-
|
24
|
-
"shell.execute_reply": "2025-04-
|
21
|
+
"iopub.execute_input": "2025-04-21T21:23:59.967091Z",
|
22
|
+
"iopub.status.busy": "2025-04-21T21:23:59.966933Z",
|
23
|
+
"iopub.status.idle": "2025-04-21T21:23:59.971753Z",
|
24
|
+
"shell.execute_reply": "2025-04-21T21:23:59.970980Z"
|
25
25
|
},
|
26
26
|
"lines_to_next_cell": 2
|
27
27
|
},
|
@@ -33,13 +33,13 @@
|
|
33
33
|
{
|
34
34
|
"cell_type": "code",
|
35
35
|
"execution_count": 2,
|
36
|
-
"id": "
|
36
|
+
"id": "f1b71280",
|
37
37
|
"metadata": {
|
38
38
|
"execution": {
|
39
|
-
"iopub.execute_input": "2025-04-
|
40
|
-
"iopub.status.busy": "2025-04-
|
41
|
-
"iopub.status.idle": "2025-04-
|
42
|
-
"shell.execute_reply": "2025-04-
|
39
|
+
"iopub.execute_input": "2025-04-21T21:23:59.974183Z",
|
40
|
+
"iopub.status.busy": "2025-04-21T21:23:59.973996Z",
|
41
|
+
"iopub.status.idle": "2025-04-21T21:24:06.847197Z",
|
42
|
+
"shell.execute_reply": "2025-04-21T21:24:06.846712Z"
|
43
43
|
}
|
44
44
|
},
|
45
45
|
"outputs": [],
|
@@ -70,7 +70,7 @@
|
|
70
70
|
},
|
71
71
|
{
|
72
72
|
"cell_type": "markdown",
|
73
|
-
"id": "
|
73
|
+
"id": "5c80e397",
|
74
74
|
"metadata": {},
|
75
75
|
"source": [
|
76
76
|
"This code uses `page.extract_tables()` which attempts to automatically detect tables based on visual cues like lines and whitespace. The result is a list of lists, representing the rows and cells of the table.\n",
|