natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "90680188",
5
+ "id": "a0347143",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Document Question Answering (QA)\n",
@@ -17,13 +17,13 @@
17
17
  {
18
18
  "cell_type": "code",
19
19
  "execution_count": 1,
20
- "id": "ea05706b",
20
+ "id": "883ee2f6",
21
21
  "metadata": {
22
22
  "execution": {
23
- "iopub.execute_input": "2025-04-16T14:57:38.929864Z",
24
- "iopub.status.busy": "2025-04-16T14:57:38.929678Z",
25
- "iopub.status.idle": "2025-04-16T14:57:38.934815Z",
26
- "shell.execute_reply": "2025-04-16T14:57:38.934333Z"
23
+ "iopub.execute_input": "2025-04-21T21:24:21.452596Z",
24
+ "iopub.status.busy": "2025-04-21T21:24:21.452452Z",
25
+ "iopub.status.idle": "2025-04-21T21:24:21.457320Z",
26
+ "shell.execute_reply": "2025-04-21T21:24:21.456901Z"
27
27
  }
28
28
  },
29
29
  "outputs": [],
@@ -34,13 +34,13 @@
34
34
  {
35
35
  "cell_type": "code",
36
36
  "execution_count": 2,
37
- "id": "5cf3d82f",
37
+ "id": "abecda7a",
38
38
  "metadata": {
39
39
  "execution": {
40
- "iopub.execute_input": "2025-04-16T14:57:38.936889Z",
41
- "iopub.status.busy": "2025-04-16T14:57:38.936686Z",
42
- "iopub.status.idle": "2025-04-16T14:57:45.655434Z",
43
- "shell.execute_reply": "2025-04-16T14:57:45.655104Z"
40
+ "iopub.execute_input": "2025-04-21T21:24:21.459009Z",
41
+ "iopub.status.busy": "2025-04-21T21:24:21.458879Z",
42
+ "iopub.status.idle": "2025-04-21T21:24:30.401038Z",
43
+ "shell.execute_reply": "2025-04-21T21:24:30.400660Z"
44
44
  }
45
45
  },
46
46
  "outputs": [
@@ -86,13 +86,13 @@
86
86
  {
87
87
  "cell_type": "code",
88
88
  "execution_count": 3,
89
- "id": "876eae93",
89
+ "id": "d6b5a66f",
90
90
  "metadata": {
91
91
  "execution": {
92
- "iopub.execute_input": "2025-04-16T14:57:45.656994Z",
93
- "iopub.status.busy": "2025-04-16T14:57:45.656755Z",
94
- "iopub.status.idle": "2025-04-16T14:57:46.274559Z",
95
- "shell.execute_reply": "2025-04-16T14:57:46.274270Z"
92
+ "iopub.execute_input": "2025-04-21T21:24:30.403287Z",
93
+ "iopub.status.busy": "2025-04-21T21:24:30.402917Z",
94
+ "iopub.status.idle": "2025-04-21T21:24:31.285240Z",
95
+ "shell.execute_reply": "2025-04-21T21:24:31.284848Z"
96
96
  }
97
97
  },
98
98
  "outputs": [
@@ -125,13 +125,13 @@
125
125
  {
126
126
  "cell_type": "code",
127
127
  "execution_count": 4,
128
- "id": "df2a8908",
128
+ "id": "babaee28",
129
129
  "metadata": {
130
130
  "execution": {
131
- "iopub.execute_input": "2025-04-16T14:57:46.275927Z",
132
- "iopub.status.busy": "2025-04-16T14:57:46.275830Z",
133
- "iopub.status.idle": "2025-04-16T14:57:46.952263Z",
134
- "shell.execute_reply": "2025-04-16T14:57:46.951979Z"
131
+ "iopub.execute_input": "2025-04-21T21:24:31.286992Z",
132
+ "iopub.status.busy": "2025-04-21T21:24:31.286826Z",
133
+ "iopub.status.idle": "2025-04-21T21:24:32.069026Z",
134
+ "shell.execute_reply": "2025-04-21T21:24:32.068668Z"
135
135
  }
136
136
  },
137
137
  "outputs": [
@@ -163,7 +163,7 @@
163
163
  },
164
164
  {
165
165
  "cell_type": "markdown",
166
- "id": "14764b29",
166
+ "id": "cf24e07d",
167
167
  "metadata": {},
168
168
  "source": [
169
169
  "The results include the extracted `answer`, a `confidence` score (useful for filtering uncertain answers), the `page_num`, and the `source_elements`.\n",
@@ -176,13 +176,13 @@
176
176
  {
177
177
  "cell_type": "code",
178
178
  "execution_count": 5,
179
- "id": "6228f8e0",
179
+ "id": "00b777b5",
180
180
  "metadata": {
181
181
  "execution": {
182
- "iopub.execute_input": "2025-04-16T14:57:46.953625Z",
183
- "iopub.status.busy": "2025-04-16T14:57:46.953531Z",
184
- "iopub.status.idle": "2025-04-16T14:57:49.838207Z",
185
- "shell.execute_reply": "2025-04-16T14:57:49.837921Z"
182
+ "iopub.execute_input": "2025-04-21T21:24:32.070771Z",
183
+ "iopub.status.busy": "2025-04-21T21:24:32.070607Z",
184
+ "iopub.status.idle": "2025-04-21T21:24:35.309130Z",
185
+ "shell.execute_reply": "2025-04-21T21:24:35.308744Z"
186
186
  }
187
187
  },
188
188
  "outputs": [
@@ -293,7 +293,7 @@
293
293
  },
294
294
  {
295
295
  "cell_type": "markdown",
296
- "id": "75cab1bf",
296
+ "id": "381130c6",
297
297
  "metadata": {},
298
298
  "source": [
299
299
  "This shows how you can iterate through questions, collect the answer dictionaries, and then create a structured DataFrame, making it easy to review questions, answers, and their confidence levels together.\n",