natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "a0347143",
5
+ "id": "69dddd89",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Document Question Answering (QA)\n",
@@ -17,13 +17,13 @@
17
17
  {
18
18
  "cell_type": "code",
19
19
  "execution_count": 1,
20
- "id": "883ee2f6",
20
+ "id": "83291ee0",
21
21
  "metadata": {
22
22
  "execution": {
23
- "iopub.execute_input": "2025-04-21T21:24:21.452596Z",
24
- "iopub.status.busy": "2025-04-21T21:24:21.452452Z",
25
- "iopub.status.idle": "2025-04-21T21:24:21.457320Z",
26
- "shell.execute_reply": "2025-04-21T21:24:21.456901Z"
23
+ "iopub.execute_input": "2025-04-27T16:32:33.679352Z",
24
+ "iopub.status.busy": "2025-04-27T16:32:33.678861Z",
25
+ "iopub.status.idle": "2025-04-27T16:32:33.686446Z",
26
+ "shell.execute_reply": "2025-04-27T16:32:33.685641Z"
27
27
  }
28
28
  },
29
29
  "outputs": [],
@@ -34,13 +34,13 @@
34
34
  {
35
35
  "cell_type": "code",
36
36
  "execution_count": 2,
37
- "id": "abecda7a",
37
+ "id": "ef370e2d",
38
38
  "metadata": {
39
39
  "execution": {
40
- "iopub.execute_input": "2025-04-21T21:24:21.459009Z",
41
- "iopub.status.busy": "2025-04-21T21:24:21.458879Z",
42
- "iopub.status.idle": "2025-04-21T21:24:30.401038Z",
43
- "shell.execute_reply": "2025-04-21T21:24:30.400660Z"
40
+ "iopub.execute_input": "2025-04-27T16:32:33.688877Z",
41
+ "iopub.status.busy": "2025-04-27T16:32:33.688626Z",
42
+ "iopub.status.idle": "2025-04-27T16:32:42.075986Z",
43
+ "shell.execute_reply": "2025-04-27T16:32:42.075672Z"
44
44
  }
45
45
  },
46
46
  "outputs": [
@@ -48,7 +48,7 @@
48
48
  "name": "stderr",
49
49
  "output_type": "stream",
50
50
  "text": [
51
- "Device set to use cpu\n"
51
+ "Device set to use mps:0\n"
52
52
  ]
53
53
  },
54
54
  {
@@ -86,13 +86,13 @@
86
86
  {
87
87
  "cell_type": "code",
88
88
  "execution_count": 3,
89
- "id": "d6b5a66f",
89
+ "id": "ca8ce7c2",
90
90
  "metadata": {
91
91
  "execution": {
92
- "iopub.execute_input": "2025-04-21T21:24:30.403287Z",
93
- "iopub.status.busy": "2025-04-21T21:24:30.402917Z",
94
- "iopub.status.idle": "2025-04-21T21:24:31.285240Z",
95
- "shell.execute_reply": "2025-04-21T21:24:31.284848Z"
92
+ "iopub.execute_input": "2025-04-27T16:32:42.077368Z",
93
+ "iopub.status.busy": "2025-04-27T16:32:42.077146Z",
94
+ "iopub.status.idle": "2025-04-27T16:32:42.592727Z",
95
+ "shell.execute_reply": "2025-04-27T16:32:42.592409Z"
96
96
  }
97
97
  },
98
98
  "outputs": [
@@ -125,13 +125,13 @@
125
125
  {
126
126
  "cell_type": "code",
127
127
  "execution_count": 4,
128
- "id": "babaee28",
128
+ "id": "a16027bd",
129
129
  "metadata": {
130
130
  "execution": {
131
- "iopub.execute_input": "2025-04-21T21:24:31.286992Z",
132
- "iopub.status.busy": "2025-04-21T21:24:31.286826Z",
133
- "iopub.status.idle": "2025-04-21T21:24:32.069026Z",
134
- "shell.execute_reply": "2025-04-21T21:24:32.068668Z"
131
+ "iopub.execute_input": "2025-04-27T16:32:42.593986Z",
132
+ "iopub.status.busy": "2025-04-27T16:32:42.593887Z",
133
+ "iopub.status.idle": "2025-04-27T16:32:43.110359Z",
134
+ "shell.execute_reply": "2025-04-27T16:32:43.110035Z"
135
135
  }
136
136
  },
137
137
  "outputs": [
@@ -163,7 +163,7 @@
163
163
  },
164
164
  {
165
165
  "cell_type": "markdown",
166
- "id": "cf24e07d",
166
+ "id": "ded74804",
167
167
  "metadata": {},
168
168
  "source": [
169
169
  "The results include the extracted `answer`, a `confidence` score (useful for filtering uncertain answers), the `page_num`, and the `source_elements`.\n",
@@ -176,13 +176,13 @@
176
176
  {
177
177
  "cell_type": "code",
178
178
  "execution_count": 5,
179
- "id": "00b777b5",
179
+ "id": "caabeabb",
180
180
  "metadata": {
181
181
  "execution": {
182
- "iopub.execute_input": "2025-04-21T21:24:32.070771Z",
183
- "iopub.status.busy": "2025-04-21T21:24:32.070607Z",
184
- "iopub.status.idle": "2025-04-21T21:24:35.309130Z",
185
- "shell.execute_reply": "2025-04-21T21:24:35.308744Z"
182
+ "iopub.execute_input": "2025-04-27T16:32:43.111733Z",
183
+ "iopub.status.busy": "2025-04-27T16:32:43.111617Z",
184
+ "iopub.status.idle": "2025-04-27T16:32:45.595626Z",
185
+ "shell.execute_reply": "2025-04-27T16:32:45.595275Z"
186
186
  }
187
187
  },
188
188
  "outputs": [
@@ -235,7 +235,7 @@
235
235
  " <th>3</th>\n",
236
236
  " <td>How many violations were there in total?</td>\n",
237
237
  " <td>4.12.7</td>\n",
238
- " <td>0.662557</td>\n",
238
+ " <td>0.662560</td>\n",
239
239
  " </tr>\n",
240
240
  " </tbody>\n",
241
241
  "</table>\n",
@@ -252,7 +252,7 @@
252
252
  "0 February 3, 1905 0.997994 \n",
253
253
  "1 Jungle Health and Safety Inspection Service 0.998895 \n",
254
254
  "2 Inadequate Protective Equipment. 0.999800 \n",
255
- "3 4.12.7 0.662557 "
255
+ "3 4.12.7 0.662560 "
256
256
  ]
257
257
  },
258
258
  "execution_count": 5,
@@ -293,7 +293,7 @@
293
293
  },
294
294
  {
295
295
  "cell_type": "markdown",
296
- "id": "381130c6",
296
+ "id": "dc0549b4",
297
297
  "metadata": {},
298
298
  "source": [
299
299
  "This shows how you can iterate through questions, collect the answer dictionaries, and then create a structured DataFrame, making it easy to review questions, answers, and their confidence levels together.\n",
@@ -314,6 +314,11 @@
314
314
  "main_language": "python",
315
315
  "notebook_metadata_filter": "-all"
316
316
  },
317
+ "kernelspec": {
318
+ "display_name": "Python (natural-pdf)",
319
+ "language": "python",
320
+ "name": "natural-pdf"
321
+ },
317
322
  "language_info": {
318
323
  "codemirror_mode": {
319
324
  "name": "ipython",