natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "90680188",
5
+ "id": "a0347143",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Document Question Answering (QA)\n",
@@ -17,13 +17,13 @@
17
17
  {
18
18
  "cell_type": "code",
19
19
  "execution_count": 1,
20
- "id": "ea05706b",
20
+ "id": "883ee2f6",
21
21
  "metadata": {
22
22
  "execution": {
23
- "iopub.execute_input": "2025-04-16T14:57:38.929864Z",
24
- "iopub.status.busy": "2025-04-16T14:57:38.929678Z",
25
- "iopub.status.idle": "2025-04-16T14:57:38.934815Z",
26
- "shell.execute_reply": "2025-04-16T14:57:38.934333Z"
23
+ "iopub.execute_input": "2025-04-21T21:24:21.452596Z",
24
+ "iopub.status.busy": "2025-04-21T21:24:21.452452Z",
25
+ "iopub.status.idle": "2025-04-21T21:24:21.457320Z",
26
+ "shell.execute_reply": "2025-04-21T21:24:21.456901Z"
27
27
  }
28
28
  },
29
29
  "outputs": [],
@@ -34,13 +34,13 @@
34
34
  {
35
35
  "cell_type": "code",
36
36
  "execution_count": 2,
37
- "id": "5cf3d82f",
37
+ "id": "abecda7a",
38
38
  "metadata": {
39
39
  "execution": {
40
- "iopub.execute_input": "2025-04-16T14:57:38.936889Z",
41
- "iopub.status.busy": "2025-04-16T14:57:38.936686Z",
42
- "iopub.status.idle": "2025-04-16T14:57:45.655434Z",
43
- "shell.execute_reply": "2025-04-16T14:57:45.655104Z"
40
+ "iopub.execute_input": "2025-04-21T21:24:21.459009Z",
41
+ "iopub.status.busy": "2025-04-21T21:24:21.458879Z",
42
+ "iopub.status.idle": "2025-04-21T21:24:30.401038Z",
43
+ "shell.execute_reply": "2025-04-21T21:24:30.400660Z"
44
44
  }
45
45
  },
46
46
  "outputs": [
@@ -86,13 +86,13 @@
86
86
  {
87
87
  "cell_type": "code",
88
88
  "execution_count": 3,
89
- "id": "876eae93",
89
+ "id": "d6b5a66f",
90
90
  "metadata": {
91
91
  "execution": {
92
- "iopub.execute_input": "2025-04-16T14:57:45.656994Z",
93
- "iopub.status.busy": "2025-04-16T14:57:45.656755Z",
94
- "iopub.status.idle": "2025-04-16T14:57:46.274559Z",
95
- "shell.execute_reply": "2025-04-16T14:57:46.274270Z"
92
+ "iopub.execute_input": "2025-04-21T21:24:30.403287Z",
93
+ "iopub.status.busy": "2025-04-21T21:24:30.402917Z",
94
+ "iopub.status.idle": "2025-04-21T21:24:31.285240Z",
95
+ "shell.execute_reply": "2025-04-21T21:24:31.284848Z"
96
96
  }
97
97
  },
98
98
  "outputs": [
@@ -125,13 +125,13 @@
125
125
  {
126
126
  "cell_type": "code",
127
127
  "execution_count": 4,
128
- "id": "df2a8908",
128
+ "id": "babaee28",
129
129
  "metadata": {
130
130
  "execution": {
131
- "iopub.execute_input": "2025-04-16T14:57:46.275927Z",
132
- "iopub.status.busy": "2025-04-16T14:57:46.275830Z",
133
- "iopub.status.idle": "2025-04-16T14:57:46.952263Z",
134
- "shell.execute_reply": "2025-04-16T14:57:46.951979Z"
131
+ "iopub.execute_input": "2025-04-21T21:24:31.286992Z",
132
+ "iopub.status.busy": "2025-04-21T21:24:31.286826Z",
133
+ "iopub.status.idle": "2025-04-21T21:24:32.069026Z",
134
+ "shell.execute_reply": "2025-04-21T21:24:32.068668Z"
135
135
  }
136
136
  },
137
137
  "outputs": [
@@ -163,7 +163,7 @@
163
163
  },
164
164
  {
165
165
  "cell_type": "markdown",
166
- "id": "14764b29",
166
+ "id": "cf24e07d",
167
167
  "metadata": {},
168
168
  "source": [
169
169
  "The results include the extracted `answer`, a `confidence` score (useful for filtering uncertain answers), the `page_num`, and the `source_elements`.\n",
@@ -176,13 +176,13 @@
176
176
  {
177
177
  "cell_type": "code",
178
178
  "execution_count": 5,
179
- "id": "6228f8e0",
179
+ "id": "00b777b5",
180
180
  "metadata": {
181
181
  "execution": {
182
- "iopub.execute_input": "2025-04-16T14:57:46.953625Z",
183
- "iopub.status.busy": "2025-04-16T14:57:46.953531Z",
184
- "iopub.status.idle": "2025-04-16T14:57:49.838207Z",
185
- "shell.execute_reply": "2025-04-16T14:57:49.837921Z"
182
+ "iopub.execute_input": "2025-04-21T21:24:32.070771Z",
183
+ "iopub.status.busy": "2025-04-21T21:24:32.070607Z",
184
+ "iopub.status.idle": "2025-04-21T21:24:35.309130Z",
185
+ "shell.execute_reply": "2025-04-21T21:24:35.308744Z"
186
186
  }
187
187
  },
188
188
  "outputs": [
@@ -293,7 +293,7 @@
293
293
  },
294
294
  {
295
295
  "cell_type": "markdown",
296
- "id": "75cab1bf",
296
+ "id": "381130c6",
297
297
  "metadata": {},
298
298
  "source": [
299
299
  "This shows how you can iterate through questions, collect the answer dictionaries, and then create a structured DataFrame, making it easy to review questions, answers, and their confidence levels together.\n",