natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "ba201cd9",
5
+ "id": "1964ce9e",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Form Field Extraction\n",
@@ -13,13 +13,13 @@
13
13
  {
14
14
  "cell_type": "code",
15
15
  "execution_count": 1,
16
- "id": "0d4e5891",
16
+ "id": "1dcdb75d",
17
17
  "metadata": {
18
18
  "execution": {
19
- "iopub.execute_input": "2025-04-16T14:58:33.400825Z",
20
- "iopub.status.busy": "2025-04-16T14:58:33.400682Z",
21
- "iopub.status.idle": "2025-04-16T14:58:33.403473Z",
22
- "shell.execute_reply": "2025-04-16T14:58:33.403161Z"
19
+ "iopub.execute_input": "2025-04-21T21:25:24.280873Z",
20
+ "iopub.status.busy": "2025-04-21T21:25:24.280704Z",
21
+ "iopub.status.idle": "2025-04-21T21:25:24.284726Z",
22
+ "shell.execute_reply": "2025-04-21T21:25:24.284318Z"
23
23
  }
24
24
  },
25
25
  "outputs": [],
@@ -30,13 +30,13 @@
30
30
  {
31
31
  "cell_type": "code",
32
32
  "execution_count": 2,
33
- "id": "c5a44eb9",
33
+ "id": "bd457499",
34
34
  "metadata": {
35
35
  "execution": {
36
- "iopub.execute_input": "2025-04-16T14:58:33.404845Z",
37
- "iopub.status.busy": "2025-04-16T14:58:33.404712Z",
38
- "iopub.status.idle": "2025-04-16T14:58:39.532285Z",
39
- "shell.execute_reply": "2025-04-16T14:58:39.531990Z"
36
+ "iopub.execute_input": "2025-04-21T21:25:24.286583Z",
37
+ "iopub.status.busy": "2025-04-21T21:25:24.286416Z",
38
+ "iopub.status.idle": "2025-04-21T21:25:31.558427Z",
39
+ "shell.execute_reply": "2025-04-21T21:25:31.557839Z"
40
40
  }
41
41
  },
42
42
  "outputs": [
@@ -70,7 +70,7 @@
70
70
  },
71
71
  {
72
72
  "cell_type": "markdown",
73
- "id": "771809ec",
73
+ "id": "db805c75",
74
74
  "metadata": {},
75
75
  "source": [
76
76
  "## Extracting Field Values"
@@ -79,13 +79,13 @@
79
79
  {
80
80
  "cell_type": "code",
81
81
  "execution_count": 3,
82
- "id": "254d7656",
82
+ "id": "5f642496",
83
83
  "metadata": {
84
84
  "execution": {
85
- "iopub.execute_input": "2025-04-16T14:58:39.533834Z",
86
- "iopub.status.busy": "2025-04-16T14:58:39.533557Z",
87
- "iopub.status.idle": "2025-04-16T14:58:39.546101Z",
88
- "shell.execute_reply": "2025-04-16T14:58:39.545847Z"
85
+ "iopub.execute_input": "2025-04-21T21:25:31.560723Z",
86
+ "iopub.status.busy": "2025-04-21T21:25:31.560306Z",
87
+ "iopub.status.idle": "2025-04-21T21:25:31.578186Z",
88
+ "shell.execute_reply": "2025-04-21T21:25:31.577738Z"
89
89
  }
90
90
  },
91
91
  "outputs": [
@@ -124,7 +124,7 @@
124
124
  },
125
125
  {
126
126
  "cell_type": "markdown",
127
- "id": "b010e10f",
127
+ "id": "de977e00",
128
128
  "metadata": {},
129
129
  "source": [
130
130
  "## Visualizing Labels and Values"
@@ -133,13 +133,13 @@
133
133
  {
134
134
  "cell_type": "code",
135
135
  "execution_count": 4,
136
- "id": "8ff51bf2",
136
+ "id": "0bc6541e",
137
137
  "metadata": {
138
138
  "execution": {
139
- "iopub.execute_input": "2025-04-16T14:58:39.547546Z",
140
- "iopub.status.busy": "2025-04-16T14:58:39.547419Z",
141
- "iopub.status.idle": "2025-04-16T14:58:39.775972Z",
142
- "shell.execute_reply": "2025-04-16T14:58:39.775646Z"
139
+ "iopub.execute_input": "2025-04-21T21:25:31.580113Z",
140
+ "iopub.status.busy": "2025-04-21T21:25:31.579783Z",
141
+ "iopub.status.idle": "2025-04-21T21:25:31.881639Z",
142
+ "shell.execute_reply": "2025-04-21T21:25:31.881253Z"
143
143
  }
144
144
  },
145
145
  "outputs": [
@@ -173,7 +173,7 @@
173
173
  },
174
174
  {
175
175
  "cell_type": "markdown",
176
- "id": "09eebfe3",
176
+ "id": "f8d4ff06",
177
177
  "metadata": {},
178
178
  "source": [
179
179
  "## Handling Multi-line Values"
@@ -182,13 +182,13 @@
182
182
  {
183
183
  "cell_type": "code",
184
184
  "execution_count": 5,
185
- "id": "89dff886",
185
+ "id": "bca5ed7c",
186
186
  "metadata": {
187
187
  "execution": {
188
- "iopub.execute_input": "2025-04-16T14:58:39.777580Z",
189
- "iopub.status.busy": "2025-04-16T14:58:39.777452Z",
190
- "iopub.status.idle": "2025-04-16T14:58:39.794732Z",
191
- "shell.execute_reply": "2025-04-16T14:58:39.794397Z"
188
+ "iopub.execute_input": "2025-04-21T21:25:31.883647Z",
189
+ "iopub.status.busy": "2025-04-21T21:25:31.883411Z",
190
+ "iopub.status.idle": "2025-04-21T21:25:31.904835Z",
191
+ "shell.execute_reply": "2025-04-21T21:25:31.904493Z"
192
192
  }
193
193
  },
194
194
  "outputs": [
@@ -233,7 +233,7 @@
233
233
  },
234
234
  {
235
235
  "cell_type": "markdown",
236
- "id": "3e3b6258",
236
+ "id": "0619ba40",
237
237
  "metadata": {},
238
238
  "source": [
239
239
  "## Finding Pattern-Based Fields"
@@ -242,13 +242,13 @@
242
242
  {
243
243
  "cell_type": "code",
244
244
  "execution_count": 6,
245
- "id": "679b8ab5",
245
+ "id": "06ed3f0e",
246
246
  "metadata": {
247
247
  "execution": {
248
- "iopub.execute_input": "2025-04-16T14:58:39.796445Z",
249
- "iopub.status.busy": "2025-04-16T14:58:39.796296Z",
250
- "iopub.status.idle": "2025-04-16T14:58:39.819718Z",
251
- "shell.execute_reply": "2025-04-16T14:58:39.819419Z"
248
+ "iopub.execute_input": "2025-04-21T21:25:31.906548Z",
249
+ "iopub.status.busy": "2025-04-21T21:25:31.906380Z",
250
+ "iopub.status.idle": "2025-04-21T21:25:31.933546Z",
251
+ "shell.execute_reply": "2025-04-21T21:25:31.933080Z"
252
252
  }
253
253
  },
254
254
  "outputs": [
@@ -291,7 +291,7 @@
291
291
  },
292
292
  {
293
293
  "cell_type": "markdown",
294
- "id": "b55b30b1",
294
+ "id": "6328f728",
295
295
  "metadata": {},
296
296
  "source": [
297
297
  "## Working with Form Tables"
@@ -300,16 +300,44 @@
300
300
  {
301
301
  "cell_type": "code",
302
302
  "execution_count": 7,
303
- "id": "e61fede9",
303
+ "id": "8ec27457",
304
304
  "metadata": {
305
305
  "execution": {
306
- "iopub.execute_input": "2025-04-16T14:58:39.821414Z",
307
- "iopub.status.busy": "2025-04-16T14:58:39.821277Z",
308
- "iopub.status.idle": "2025-04-16T14:58:42.159117Z",
309
- "shell.execute_reply": "2025-04-16T14:58:42.158739Z"
306
+ "iopub.execute_input": "2025-04-21T21:25:31.935350Z",
307
+ "iopub.status.busy": "2025-04-21T21:25:31.935185Z",
308
+ "iopub.status.idle": "2025-04-21T21:25:34.162554Z",
309
+ "shell.execute_reply": "2025-04-21T21:25:34.162025Z"
310
310
  }
311
311
  },
312
312
  "outputs": [
313
+ {
314
+ "name": "stderr",
315
+ "output_type": "stream",
316
+ "text": [
317
+ "\u001b[2m2025-04-21T21:25:31.947156Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
318
+ ]
319
+ },
320
+ {
321
+ "name": "stderr",
322
+ "output_type": "stream",
323
+ "text": [
324
+ "[2025-04-21 17:25:31,947] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
325
+ ]
326
+ },
327
+ {
328
+ "name": "stderr",
329
+ "output_type": "stream",
330
+ "text": [
331
+ "\u001b[2m2025-04-21T21:25:31.947949Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
332
+ ]
333
+ },
334
+ {
335
+ "name": "stderr",
336
+ "output_type": "stream",
337
+ "text": [
338
+ "[2025-04-21 17:25:31,947] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
339
+ ]
340
+ },
313
341
  {
314
342
  "name": "stdout",
315
343
  "output_type": "stream",
@@ -321,14 +349,14 @@
321
349
  "name": "stdout",
322
350
  "output_type": "stream",
323
351
  "text": [
324
- "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp38mauqj6/temp_layout_image.png: 1024x800 2 titles, 3 plain texts, 2 abandons, 1 table, 1695.1ms\n"
352
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpfyjqm372/temp_layout_image.png: 1024x800 2 titles, 3 plain texts, 2 abandons, 1 table, 1806.1ms\n"
325
353
  ]
326
354
  },
327
355
  {
328
356
  "name": "stdout",
329
357
  "output_type": "stream",
330
358
  "text": [
331
- "Speed: 6.4ms preprocess, 1695.1ms inference, 0.8ms postprocess per image at shape (1, 3, 1024, 800)\n"
359
+ "Speed: 7.3ms preprocess, 1806.1ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 800)\n"
332
360
  ]
333
361
  }
334
362
  ],
@@ -388,7 +416,7 @@
388
416
  },
389
417
  {
390
418
  "cell_type": "markdown",
391
- "id": "f768fe15",
419
+ "id": "e1236765",
392
420
  "metadata": {},
393
421
  "source": [
394
422
  "## Combining Different Extraction Techniques"
@@ -397,13 +425,13 @@
397
425
  {
398
426
  "cell_type": "code",
399
427
  "execution_count": 8,
400
- "id": "0ccdfd92",
428
+ "id": "3f5fb2ad",
401
429
  "metadata": {
402
430
  "execution": {
403
- "iopub.execute_input": "2025-04-16T14:58:42.160824Z",
404
- "iopub.status.busy": "2025-04-16T14:58:42.160676Z",
405
- "iopub.status.idle": "2025-04-16T14:58:42.174755Z",
406
- "shell.execute_reply": "2025-04-16T14:58:42.174450Z"
431
+ "iopub.execute_input": "2025-04-21T21:25:34.164351Z",
432
+ "iopub.status.busy": "2025-04-21T21:25:34.164185Z",
433
+ "iopub.status.idle": "2025-04-21T21:25:34.180512Z",
434
+ "shell.execute_reply": "2025-04-21T21:25:34.180172Z"
407
435
  }
408
436
  },
409
437
  "outputs": [
@@ -453,7 +481,7 @@
453
481
  },
454
482
  {
455
483
  "cell_type": "markdown",
456
- "id": "a1ddf055",
484
+ "id": "ed4182f5",
457
485
  "metadata": {},
458
486
  "source": [
459
487
  "Form field extraction enables you to automate data entry and document processing. By combining different techniques like label detection, spatial navigation, and pattern matching, you can handle a wide variety of form layouts. "
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "c4a0e202",
5
+ "id": "7674e123",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Enhanced Table Processing\n",
@@ -15,13 +15,13 @@
15
15
  {
16
16
  "cell_type": "code",
17
17
  "execution_count": 1,
18
- "id": "5d8327c3",
18
+ "id": "08c7c5f0",
19
19
  "metadata": {
20
20
  "execution": {
21
- "iopub.execute_input": "2025-04-16T14:58:45.013785Z",
22
- "iopub.status.busy": "2025-04-16T14:58:45.013623Z",
23
- "iopub.status.idle": "2025-04-16T14:58:45.016363Z",
24
- "shell.execute_reply": "2025-04-16T14:58:45.016094Z"
21
+ "iopub.execute_input": "2025-04-21T21:25:37.324499Z",
22
+ "iopub.status.busy": "2025-04-21T21:25:37.324337Z",
23
+ "iopub.status.idle": "2025-04-21T21:25:37.328739Z",
24
+ "shell.execute_reply": "2025-04-21T21:25:37.328344Z"
25
25
  }
26
26
  },
27
27
  "outputs": [],