natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "5bfcaf73",
5
+ "id": "289614f6",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Working with Regions\n",
@@ -13,13 +13,13 @@
13
13
  {
14
14
  "cell_type": "code",
15
15
  "execution_count": 1,
16
- "id": "a2e06c0a",
16
+ "id": "ee7b29db",
17
17
  "metadata": {
18
18
  "execution": {
19
- "iopub.execute_input": "2025-04-16T14:58:04.098688Z",
20
- "iopub.status.busy": "2025-04-16T14:58:04.098426Z",
21
- "iopub.status.idle": "2025-04-16T14:58:04.101109Z",
22
- "shell.execute_reply": "2025-04-16T14:58:04.100804Z"
19
+ "iopub.execute_input": "2025-04-21T21:32:06.104242Z",
20
+ "iopub.status.busy": "2025-04-21T21:32:06.104024Z",
21
+ "iopub.status.idle": "2025-04-21T21:32:06.108388Z",
22
+ "shell.execute_reply": "2025-04-21T21:32:06.107767Z"
23
23
  }
24
24
  },
25
25
  "outputs": [],
@@ -30,13 +30,13 @@
30
30
  {
31
31
  "cell_type": "code",
32
32
  "execution_count": 2,
33
- "id": "a2b68c02",
33
+ "id": "392a4d3b",
34
34
  "metadata": {
35
35
  "execution": {
36
- "iopub.execute_input": "2025-04-16T14:58:04.102469Z",
37
- "iopub.status.busy": "2025-04-16T14:58:04.102344Z",
38
- "iopub.status.idle": "2025-04-16T14:58:10.407739Z",
39
- "shell.execute_reply": "2025-04-16T14:58:10.407418Z"
36
+ "iopub.execute_input": "2025-04-21T21:32:06.110143Z",
37
+ "iopub.status.busy": "2025-04-21T21:32:06.109959Z",
38
+ "iopub.status.idle": "2025-04-21T21:32:14.111600Z",
39
+ "shell.execute_reply": "2025-04-21T21:32:14.111045Z"
40
40
  }
41
41
  },
42
42
  "outputs": [
@@ -75,7 +75,7 @@
75
75
  },
76
76
  {
77
77
  "cell_type": "markdown",
78
- "id": "75aede6e",
78
+ "id": "326305f6",
79
79
  "metadata": {},
80
80
  "source": [
81
81
  "## Creating Regions from Elements"
@@ -84,13 +84,13 @@
84
84
  {
85
85
  "cell_type": "code",
86
86
  "execution_count": 3,
87
- "id": "8c9314c2",
87
+ "id": "76a034f2",
88
88
  "metadata": {
89
89
  "execution": {
90
- "iopub.execute_input": "2025-04-16T14:58:10.409356Z",
91
- "iopub.status.busy": "2025-04-16T14:58:10.409042Z",
92
- "iopub.status.idle": "2025-04-16T14:58:10.466670Z",
93
- "shell.execute_reply": "2025-04-16T14:58:10.466358Z"
90
+ "iopub.execute_input": "2025-04-21T21:32:14.113859Z",
91
+ "iopub.status.busy": "2025-04-21T21:32:14.113455Z",
92
+ "iopub.status.idle": "2025-04-21T21:32:14.187851Z",
93
+ "shell.execute_reply": "2025-04-21T21:32:14.187217Z"
94
94
  }
95
95
  },
96
96
  "outputs": [
@@ -125,7 +125,7 @@
125
125
  },
126
126
  {
127
127
  "cell_type": "markdown",
128
- "id": "757d90eb",
128
+ "id": "ad3b2337",
129
129
  "metadata": {},
130
130
  "source": [
131
131
  "## Finding Elements Within Regions"
@@ -134,13 +134,13 @@
134
134
  {
135
135
  "cell_type": "code",
136
136
  "execution_count": 4,
137
- "id": "d17c7f5b",
137
+ "id": "633bdc8b",
138
138
  "metadata": {
139
139
  "execution": {
140
- "iopub.execute_input": "2025-04-16T14:58:10.468032Z",
141
- "iopub.status.busy": "2025-04-16T14:58:10.467921Z",
142
- "iopub.status.idle": "2025-04-16T14:58:10.509930Z",
143
- "shell.execute_reply": "2025-04-16T14:58:10.509642Z"
140
+ "iopub.execute_input": "2025-04-21T21:32:14.189588Z",
141
+ "iopub.status.busy": "2025-04-21T21:32:14.189354Z",
142
+ "iopub.status.idle": "2025-04-21T21:32:14.245744Z",
143
+ "shell.execute_reply": "2025-04-21T21:32:14.245287Z"
144
144
  }
145
145
  },
146
146
  "outputs": [
@@ -172,7 +172,7 @@
172
172
  },
173
173
  {
174
174
  "cell_type": "markdown",
175
- "id": "2a5e740e",
175
+ "id": "1016d6de",
176
176
  "metadata": {},
177
177
  "source": [
178
178
  "## Expanding and Adjusting Regions"
@@ -181,13 +181,13 @@
181
181
  {
182
182
  "cell_type": "code",
183
183
  "execution_count": 5,
184
- "id": "4235a902",
184
+ "id": "ef6db1c1",
185
185
  "metadata": {
186
186
  "execution": {
187
- "iopub.execute_input": "2025-04-16T14:58:10.511511Z",
188
- "iopub.status.busy": "2025-04-16T14:58:10.511386Z",
189
- "iopub.status.idle": "2025-04-16T14:58:10.550827Z",
190
- "shell.execute_reply": "2025-04-16T14:58:10.550537Z"
187
+ "iopub.execute_input": "2025-04-21T21:32:14.247620Z",
188
+ "iopub.status.busy": "2025-04-21T21:32:14.247431Z",
189
+ "iopub.status.idle": "2025-04-21T21:32:14.297797Z",
190
+ "shell.execute_reply": "2025-04-21T21:32:14.297383Z"
191
191
  }
192
192
  },
193
193
  "outputs": [
@@ -216,8 +216,8 @@
216
216
  "expanded_region = tight_region.expand(\n",
217
217
  " left=10, # Expand 10 points to the left\n",
218
218
  " right=200, # Expand 200 points to the right\n",
219
- " top_expand=5, # Expand 5 points above\n",
220
- " bottom_expand=100 # Expand 100 points below\n",
219
+ " top=5, # Expand 5 points above\n",
220
+ " bottom=100 # Expand 100 points below\n",
221
221
  ")\n",
222
222
  "\n",
223
223
  "# Visualize both regions\n",
@@ -230,7 +230,7 @@
230
230
  },
231
231
  {
232
232
  "cell_type": "markdown",
233
- "id": "d6babf51",
233
+ "id": "3a0a59e5",
234
234
  "metadata": {},
235
235
  "source": [
236
236
  "## Creating Bounded Regions"
@@ -239,13 +239,13 @@
239
239
  {
240
240
  "cell_type": "code",
241
241
  "execution_count": 6,
242
- "id": "f86d4c72",
242
+ "id": "083c200e",
243
243
  "metadata": {
244
244
  "execution": {
245
- "iopub.execute_input": "2025-04-16T14:58:10.552499Z",
246
- "iopub.status.busy": "2025-04-16T14:58:10.552371Z",
247
- "iopub.status.idle": "2025-04-16T14:58:10.576994Z",
248
- "shell.execute_reply": "2025-04-16T14:58:10.576702Z"
245
+ "iopub.execute_input": "2025-04-21T21:32:14.299580Z",
246
+ "iopub.status.busy": "2025-04-21T21:32:14.299178Z",
247
+ "iopub.status.idle": "2025-04-21T21:32:14.330220Z",
248
+ "shell.execute_reply": "2025-04-21T21:32:14.329836Z"
249
249
  }
250
250
  },
251
251
  "outputs": [
@@ -277,7 +277,7 @@
277
277
  },
278
278
  {
279
279
  "cell_type": "markdown",
280
- "id": "7d46742d",
280
+ "id": "231224fa",
281
281
  "metadata": {},
282
282
  "source": [
283
283
  "## Working with Multiple Regions"
@@ -286,13 +286,13 @@
286
286
  {
287
287
  "cell_type": "code",
288
288
  "execution_count": 7,
289
- "id": "41afc629",
289
+ "id": "d520009e",
290
290
  "metadata": {
291
291
  "execution": {
292
- "iopub.execute_input": "2025-04-16T14:58:10.578381Z",
293
- "iopub.status.busy": "2025-04-16T14:58:10.578276Z",
294
- "iopub.status.idle": "2025-04-16T14:58:10.642158Z",
295
- "shell.execute_reply": "2025-04-16T14:58:10.641880Z"
292
+ "iopub.execute_input": "2025-04-21T21:32:14.331874Z",
293
+ "iopub.status.busy": "2025-04-21T21:32:14.331693Z",
294
+ "iopub.status.idle": "2025-04-21T21:32:14.410562Z",
295
+ "shell.execute_reply": "2025-04-21T21:32:14.410194Z"
296
296
  }
297
297
  },
298
298
  "outputs": [
@@ -333,7 +333,7 @@
333
333
  },
334
334
  {
335
335
  "cell_type": "markdown",
336
- "id": "2b43c050",
336
+ "id": "3168edfa",
337
337
  "metadata": {},
338
338
  "source": [
339
339
  "## Creating an Image of a Region"
@@ -342,13 +342,13 @@
342
342
  {
343
343
  "cell_type": "code",
344
344
  "execution_count": 8,
345
- "id": "2ff14e96",
345
+ "id": "eb460e68",
346
346
  "metadata": {
347
347
  "execution": {
348
- "iopub.execute_input": "2025-04-16T14:58:10.643619Z",
349
- "iopub.status.busy": "2025-04-16T14:58:10.643522Z",
350
- "iopub.status.idle": "2025-04-16T14:58:10.688869Z",
351
- "shell.execute_reply": "2025-04-16T14:58:10.688563Z"
348
+ "iopub.execute_input": "2025-04-21T21:32:14.412800Z",
349
+ "iopub.status.busy": "2025-04-21T21:32:14.412651Z",
350
+ "iopub.status.idle": "2025-04-21T21:32:14.469471Z",
351
+ "shell.execute_reply": "2025-04-21T21:32:14.469002Z"
352
352
  }
353
353
  },
354
354
  "outputs": [
@@ -378,7 +378,7 @@
378
378
  },
379
379
  {
380
380
  "cell_type": "markdown",
381
- "id": "c34e0815",
381
+ "id": "20c711fe",
382
382
  "metadata": {},
383
383
  "source": [
384
384
  "Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need. "
@@ -389,7 +389,11 @@
389
389
  "jupytext": {
390
390
  "cell_metadata_filter": "-all",
391
391
  "main_language": "python",
392
- "notebook_metadata_filter": "-all"
392
+ "notebook_metadata_filter": "-all",
393
+ "text_representation": {
394
+ "extension": ".md",
395
+ "format_name": "markdown"
396
+ }
393
397
  },
394
398
  "language_info": {
395
399
  "codemirror_mode": {
@@ -81,8 +81,8 @@ tight_region = page.create_region(
81
81
  expanded_region = tight_region.expand(
82
82
  left=10, # Expand 10 points to the left
83
83
  right=200, # Expand 200 points to the right
84
- top_expand=5, # Expand 5 points above
85
- bottom_expand=100 # Expand 100 points below
84
+ top=5, # Expand 5 points above
85
+ bottom=100 # Expand 100 points below
86
86
  )
87
87
 
88
88
  # Visualize both regions