natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "1964ce9e",
5
+ "id": "15f336b1",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Form Field Extraction\n",
@@ -13,13 +13,13 @@
13
13
  {
14
14
  "cell_type": "code",
15
15
  "execution_count": 1,
16
- "id": "1dcdb75d",
16
+ "id": "a4d160f4",
17
17
  "metadata": {
18
18
  "execution": {
19
- "iopub.execute_input": "2025-04-21T21:25:24.280873Z",
20
- "iopub.status.busy": "2025-04-21T21:25:24.280704Z",
21
- "iopub.status.idle": "2025-04-21T21:25:24.284726Z",
22
- "shell.execute_reply": "2025-04-21T21:25:24.284318Z"
19
+ "iopub.execute_input": "2025-04-27T16:33:25.969280Z",
20
+ "iopub.status.busy": "2025-04-27T16:33:25.969137Z",
21
+ "iopub.status.idle": "2025-04-27T16:33:25.972410Z",
22
+ "shell.execute_reply": "2025-04-27T16:33:25.972018Z"
23
23
  }
24
24
  },
25
25
  "outputs": [],
@@ -30,13 +30,13 @@
30
30
  {
31
31
  "cell_type": "code",
32
32
  "execution_count": 2,
33
- "id": "bd457499",
33
+ "id": "260f0289",
34
34
  "metadata": {
35
35
  "execution": {
36
- "iopub.execute_input": "2025-04-21T21:25:24.286583Z",
37
- "iopub.status.busy": "2025-04-21T21:25:24.286416Z",
38
- "iopub.status.idle": "2025-04-21T21:25:31.558427Z",
39
- "shell.execute_reply": "2025-04-21T21:25:31.557839Z"
36
+ "iopub.execute_input": "2025-04-27T16:33:25.974195Z",
37
+ "iopub.status.busy": "2025-04-27T16:33:25.974006Z",
38
+ "iopub.status.idle": "2025-04-27T16:33:31.944270Z",
39
+ "shell.execute_reply": "2025-04-27T16:33:31.943972Z"
40
40
  }
41
41
  },
42
42
  "outputs": [
@@ -70,7 +70,7 @@
70
70
  },
71
71
  {
72
72
  "cell_type": "markdown",
73
- "id": "db805c75",
73
+ "id": "cd7ffefd",
74
74
  "metadata": {},
75
75
  "source": [
76
76
  "## Extracting Field Values"
@@ -79,23 +79,23 @@
79
79
  {
80
80
  "cell_type": "code",
81
81
  "execution_count": 3,
82
- "id": "5f642496",
82
+ "id": "11d12525",
83
83
  "metadata": {
84
84
  "execution": {
85
- "iopub.execute_input": "2025-04-21T21:25:31.560723Z",
86
- "iopub.status.busy": "2025-04-21T21:25:31.560306Z",
87
- "iopub.status.idle": "2025-04-21T21:25:31.578186Z",
88
- "shell.execute_reply": "2025-04-21T21:25:31.577738Z"
85
+ "iopub.execute_input": "2025-04-27T16:33:31.945788Z",
86
+ "iopub.status.busy": "2025-04-27T16:33:31.945477Z",
87
+ "iopub.status.idle": "2025-04-27T16:33:31.957632Z",
88
+ "shell.execute_reply": "2025-04-27T16:33:31.957318Z"
89
89
  }
90
90
  },
91
91
  "outputs": [
92
92
  {
93
93
  "data": {
94
94
  "text/plain": [
95
- "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\ntion Count: 7 \\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\n \\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nations \\n \\nute Description \\n.7 Unsanitary Working Conditions.\\n \\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n \\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n \\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Healt',\n",
96
- " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\non Count: 7 \\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\n \\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ntions \\n \\nte Description \\n7 Unsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\n \\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health',\n",
97
- " 'Violation Count': 'eatpacking Chicago, Ill. \\n \\n, 1905 \\n \\n7 \\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\n \\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard! \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nription \\nnitary Working Conditions. \\n \\nquate Protective Equipment.\\nctive Injury Prevention. \\n \\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\n \\nquate Ventilation Systems. \\nicient Employee Training for Safe Work Practi\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Safety Ins',\n",
98
- " 'Summary': 'm’s Meatpacking Chicago, Ill.\\n \\nuary 3, 1905 \\n \\nount: 7 \\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\n \\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ns \\n \\nDescription \\nUnsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\n \\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health and S'}"
95
+ "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\ntion Count: 7\\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\nations\\nute Description\\n.7 Unsanitary Working Conditions.\\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\nJungle Healt',\n",
96
+ " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\non Count: 7\\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\ntions\\nte Description\\n7 Unsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\nJungle Health',\n",
97
+ " 'Violation Count': 'eatpacking Chicago, Ill.\\n, 1905\\n7\\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard!\\nription\\nnitary Working Conditions.\\nquate Protective Equipment.\\nctive Injury Prevention.\\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\nquate Ventilation Systems.\\nicient Employee Training for Safe Work Practi\\nJungle Health and Safety Ins',\n",
98
+ " 'Summary': 'm’s Meatpacking Chicago, Ill.\\nuary 3, 1905\\nount: 7\\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\ns\\nDescription\\nUnsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\nJungle Health and S'}"
99
99
  ]
100
100
  },
101
101
  "execution_count": 3,
@@ -124,7 +124,7 @@
124
124
  },
125
125
  {
126
126
  "cell_type": "markdown",
127
- "id": "de977e00",
127
+ "id": "b41d66c4",
128
128
  "metadata": {},
129
129
  "source": [
130
130
  "## Visualizing Labels and Values"
@@ -133,13 +133,13 @@
133
133
  {
134
134
  "cell_type": "code",
135
135
  "execution_count": 4,
136
- "id": "0bc6541e",
136
+ "id": "14bd2af0",
137
137
  "metadata": {
138
138
  "execution": {
139
- "iopub.execute_input": "2025-04-21T21:25:31.580113Z",
140
- "iopub.status.busy": "2025-04-21T21:25:31.579783Z",
141
- "iopub.status.idle": "2025-04-21T21:25:31.881639Z",
142
- "shell.execute_reply": "2025-04-21T21:25:31.881253Z"
139
+ "iopub.execute_input": "2025-04-27T16:33:31.958877Z",
140
+ "iopub.status.busy": "2025-04-27T16:33:31.958776Z",
141
+ "iopub.status.idle": "2025-04-27T16:33:32.171037Z",
142
+ "shell.execute_reply": "2025-04-27T16:33:32.170715Z"
143
143
  }
144
144
  },
145
145
  "outputs": [
@@ -173,7 +173,7 @@
173
173
  },
174
174
  {
175
175
  "cell_type": "markdown",
176
- "id": "f8d4ff06",
176
+ "id": "461f082b",
177
177
  "metadata": {},
178
178
  "source": [
179
179
  "## Handling Multi-line Values"
@@ -182,23 +182,23 @@
182
182
  {
183
183
  "cell_type": "code",
184
184
  "execution_count": 5,
185
- "id": "bca5ed7c",
185
+ "id": "bc932c5f",
186
186
  "metadata": {
187
187
  "execution": {
188
- "iopub.execute_input": "2025-04-21T21:25:31.883647Z",
189
- "iopub.status.busy": "2025-04-21T21:25:31.883411Z",
190
- "iopub.status.idle": "2025-04-21T21:25:31.904835Z",
191
- "shell.execute_reply": "2025-04-21T21:25:31.904493Z"
188
+ "iopub.execute_input": "2025-04-27T16:33:32.172585Z",
189
+ "iopub.status.busy": "2025-04-27T16:33:32.172485Z",
190
+ "iopub.status.idle": "2025-04-27T16:33:32.187937Z",
191
+ "shell.execute_reply": "2025-04-27T16:33:32.187629Z"
192
192
  }
193
193
  },
194
194
  "outputs": [
195
195
  {
196
196
  "data": {
197
197
  "text/plain": [
198
- "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\ntion Count: 7 \\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\n \\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nations \\n \\nute Description \\n.7 Unsanitary Working Conditions.\\n \\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n \\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n \\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Healt Date: February 3, 1905 \\n Violation Count: 7 \\n Summary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.',\n",
199
- " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\non Count: 7 \\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\n \\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ntions \\n \\nte Description \\n7 Unsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\n \\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health Violation Count: 7 \\n Summary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\n These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary',\n",
200
- " 'Violation Count': 'eatpacking Chicago, Ill. \\n \\n, 1905 \\n \\n7 \\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\n \\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard! \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nription \\nnitary Working Conditions. \\n \\nquate Protective Equipment.\\nctive Injury Prevention. \\n \\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\n \\nquate Ventilation Systems. \\nicient Employee Training for Safe Work Practi\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Safety Ins Summary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\n These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\n visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in',\n",
201
- " 'Summary': 'm’s Meatpacking Chicago, Ill.\\n \\nuary 3, 1905 \\n \\nount: 7 \\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\n \\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ns \\n \\nDescription \\nUnsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\n \\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health and S These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\n visitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\n some of which there were open vats near the level of the floor, their peculiar trouble was that they fell'}"
198
+ "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\ntion Count: 7\\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\nations\\nute Description\\n.7 Unsanitary Working Conditions.\\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\nJungle Healt Date: February 3, 1905\\nViolation Count: 7\\nSummary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.',\n",
199
+ " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\non Count: 7\\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\ntions\\nte Description\\n7 Unsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\nJungle Health Violation Count: 7\\nSummary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\nThese people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary',\n",
200
+ " 'Violation Count': 'eatpacking Chicago, Ill.\\n, 1905\\n7\\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard!\\nription\\nnitary Working Conditions.\\nquate Protective Equipment.\\nctive Injury Prevention.\\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\nquate Ventilation Systems.\\nicient Employee Training for Safe Work Practi\\nJungle Health and Safety Ins Summary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\nThese people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in',\n",
201
+ " 'Summary': 'm’s Meatpacking Chicago, Ill.\\nuary 3, 1905\\nount: 7\\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\ns\\nDescription\\nUnsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\nJungle Health and S These people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor, their peculiar trouble was that they fell'}"
202
202
  ]
203
203
  },
204
204
  "execution_count": 5,
@@ -233,7 +233,7 @@
233
233
  },
234
234
  {
235
235
  "cell_type": "markdown",
236
- "id": "0619ba40",
236
+ "id": "1ca87dde",
237
237
  "metadata": {},
238
238
  "source": [
239
239
  "## Finding Pattern-Based Fields"
@@ -242,13 +242,13 @@
242
242
  {
243
243
  "cell_type": "code",
244
244
  "execution_count": 6,
245
- "id": "06ed3f0e",
245
+ "id": "d1d98750",
246
246
  "metadata": {
247
247
  "execution": {
248
- "iopub.execute_input": "2025-04-21T21:25:31.906548Z",
249
- "iopub.status.busy": "2025-04-21T21:25:31.906380Z",
250
- "iopub.status.idle": "2025-04-21T21:25:31.933546Z",
251
- "shell.execute_reply": "2025-04-21T21:25:31.933080Z"
248
+ "iopub.execute_input": "2025-04-27T16:33:32.189138Z",
249
+ "iopub.status.busy": "2025-04-27T16:33:32.189046Z",
250
+ "iopub.status.idle": "2025-04-27T16:33:32.209944Z",
251
+ "shell.execute_reply": "2025-04-27T16:33:32.209645Z"
252
252
  }
253
253
  },
254
254
  "outputs": [
@@ -291,7 +291,7 @@
291
291
  },
292
292
  {
293
293
  "cell_type": "markdown",
294
- "id": "6328f728",
294
+ "id": "6edf8b69",
295
295
  "metadata": {},
296
296
  "source": [
297
297
  "## Working with Form Tables"
@@ -300,13 +300,13 @@
300
300
  {
301
301
  "cell_type": "code",
302
302
  "execution_count": 7,
303
- "id": "8ec27457",
303
+ "id": "0235fd88",
304
304
  "metadata": {
305
305
  "execution": {
306
- "iopub.execute_input": "2025-04-21T21:25:31.935350Z",
307
- "iopub.status.busy": "2025-04-21T21:25:31.935185Z",
308
- "iopub.status.idle": "2025-04-21T21:25:34.162554Z",
309
- "shell.execute_reply": "2025-04-21T21:25:34.162025Z"
306
+ "iopub.execute_input": "2025-04-27T16:33:32.211143Z",
307
+ "iopub.status.busy": "2025-04-27T16:33:32.211046Z",
308
+ "iopub.status.idle": "2025-04-27T16:33:34.484372Z",
309
+ "shell.execute_reply": "2025-04-27T16:33:34.483931Z"
310
310
  }
311
311
  },
312
312
  "outputs": [
@@ -314,28 +314,28 @@
314
314
  "name": "stderr",
315
315
  "output_type": "stream",
316
316
  "text": [
317
- "\u001b[2m2025-04-21T21:25:31.947156Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
317
+ "\u001b[2m2025-04-27T16:33:32.219597Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m76\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
318
318
  ]
319
319
  },
320
320
  {
321
321
  "name": "stderr",
322
322
  "output_type": "stream",
323
323
  "text": [
324
- "[2025-04-21 17:25:31,947] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
324
+ "[2025-04-27 12:33:32,219] [ WARNING] gemini.py:76 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
325
325
  ]
326
326
  },
327
327
  {
328
328
  "name": "stderr",
329
329
  "output_type": "stream",
330
330
  "text": [
331
- "\u001b[2m2025-04-21T21:25:31.947949Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
331
+ "\u001b[2m2025-04-27T16:33:32.220225Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m76\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
332
332
  ]
333
333
  },
334
334
  {
335
335
  "name": "stderr",
336
336
  "output_type": "stream",
337
337
  "text": [
338
- "[2025-04-21 17:25:31,947] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
338
+ "[2025-04-27 12:33:32,220] [ WARNING] gemini.py:76 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
339
339
  ]
340
340
  },
341
341
  {
@@ -349,14 +349,14 @@
349
349
  "name": "stdout",
350
350
  "output_type": "stream",
351
351
  "text": [
352
- "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpfyjqm372/temp_layout_image.png: 1024x800 2 titles, 3 plain texts, 2 abandons, 1 table, 1806.1ms\n"
352
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmp8cxd7muz/temp_layout_image.png: 1024x800 2 titles, 3 plain texts, 2 abandons, 1 table, 1520.2ms\n"
353
353
  ]
354
354
  },
355
355
  {
356
356
  "name": "stdout",
357
357
  "output_type": "stream",
358
358
  "text": [
359
- "Speed: 7.3ms preprocess, 1806.1ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 800)\n"
359
+ "Speed: 4.2ms preprocess, 1520.2ms inference, 0.8ms postprocess per image at shape (1, 3, 1024, 800)\n"
360
360
  ]
361
361
  }
362
362
  ],
@@ -416,7 +416,7 @@
416
416
  },
417
417
  {
418
418
  "cell_type": "markdown",
419
- "id": "e1236765",
419
+ "id": "350d431a",
420
420
  "metadata": {},
421
421
  "source": [
422
422
  "## Combining Different Extraction Techniques"
@@ -425,23 +425,23 @@
425
425
  {
426
426
  "cell_type": "code",
427
427
  "execution_count": 8,
428
- "id": "3f5fb2ad",
428
+ "id": "28145ee8",
429
429
  "metadata": {
430
430
  "execution": {
431
- "iopub.execute_input": "2025-04-21T21:25:34.164351Z",
432
- "iopub.status.busy": "2025-04-21T21:25:34.164185Z",
433
- "iopub.status.idle": "2025-04-21T21:25:34.180512Z",
434
- "shell.execute_reply": "2025-04-21T21:25:34.180172Z"
431
+ "iopub.execute_input": "2025-04-27T16:33:34.485868Z",
432
+ "iopub.status.busy": "2025-04-27T16:33:34.485763Z",
433
+ "iopub.status.idle": "2025-04-27T16:33:34.498675Z",
434
+ "shell.execute_reply": "2025-04-27T16:33:34.498383Z"
435
435
  }
436
436
  },
437
437
  "outputs": [
438
438
  {
439
439
  "data": {
440
440
  "text/plain": [
441
- "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\ntion Count: 7 \\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\n \\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nations \\n \\nute Description \\n.7 Unsanitary Working Conditions.\\n \\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n \\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n \\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Healt',\n",
442
- " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\n \\nFebruary 3, 1905 \\n \\non Count: 7 \\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\n \\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ntions \\n \\nte Description \\n7 Unsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\n \\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health',\n",
443
- " 'Violation Count': 'eatpacking Chicago, Ill. \\n \\n, 1905 \\n \\n7 \\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\n \\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard! \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nription \\nnitary Working Conditions. \\n \\nquate Protective Equipment.\\nctive Injury Prevention. \\n \\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\n \\nquate Ventilation Systems. \\nicient Employee Training for Safe Work Practi\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Safety Ins',\n",
444
- " 'Summary': 'm’s Meatpacking Chicago, Ill.\\n \\nuary 3, 1905 \\n \\nount: 7 \\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\n \\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\ns \\n \\nDescription \\nUnsanitary Working Conditions.\\n \\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\n \\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\n \\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Jungle Health and S'}"
441
+ "{'Site': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\ntion Count: 7\\nmary: Worst of any, however, were the fertilize\\ne people could not be shown to the visitor - for\\nr at a hundred yards, and as for the other men\\nof which there were open vats near the level\\nhe vats; and when they were fished out, there\\niting - sometimes they would be overlooked fo\\nworld as Durham’s Pure Leaf Lard!\\nations\\nute Description\\n.7 Unsanitary Working Conditions.\\n3 Inadequate Protective Equipment.\\n9 Ineffective Injury Prevention.\\n5 Failure to Properly Store Hazardous M\\n2 Lack of Adequate Fire Safety Measure\\n4 Inadequate Ventilation Systems.\\n.7 Insufficient Employee Training for Safe\\nJungle Healt',\n",
442
+ " 'Date': 'Durham’s Meatpacking Chicago, Ill.\\nFebruary 3, 1905\\non Count: 7\\nary: Worst of any, however, were the fertilizer\\npeople could not be shown to the visitor - for t\\nat a hundred yards, and as for the other men,\\nof which there were open vats near the level o\\ne vats; and when they were fished out, there w\\nng - sometimes they would be overlooked for\\nworld as Durham’s Pure Leaf Lard!\\ntions\\nte Description\\n7 Unsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Ma\\nLack of Adequate Fire Safety Measures\\nInadequate Ventilation Systems.\\n7 Insufficient Employee Training for Safe W\\nJungle Health',\n",
443
+ " 'Violation Count': 'eatpacking Chicago, Ill.\\n, 1905\\n7\\nof any, however, were the fertilizer men, and\\nld not be shown to the visitor - for the odor of\\nd yards, and as for the other men, who worke\\nre were open vats near the level of the floor, t\\nwhen they were fished out, there was never e\\nmes they would be overlooked for days, till all\\nrham’s Pure Leaf Lard!\\nription\\nnitary Working Conditions.\\nquate Protective Equipment.\\nctive Injury Prevention.\\ne to Properly Store Hazardous Materials.\\nof Adequate Fire Safety Measures.\\nquate Ventilation Systems.\\nicient Employee Training for Safe Work Practi\\nJungle Health and Safety Ins',\n",
444
+ " 'Summary': 'm’s Meatpacking Chicago, Ill.\\nuary 3, 1905\\nount: 7\\nWorst of any, however, were the fertilizer men\\nple could not be shown to the visitor - for the o\\nhundred yards, and as for the other men, who\\nich there were open vats near the level of the\\ns; and when they were fished out, there was n\\nsometimes they would be overlooked for days\\nas Durham’s Pure Leaf Lard!\\ns\\nDescription\\nUnsanitary Working Conditions.\\nInadequate Protective Equipment.\\nIneffective Injury Prevention.\\nFailure to Properly Store Hazardous Material\\nLack of Adequate Fire Safety Measures.\\nInadequate Ventilation Systems.\\nInsufficient Employee Training for Safe Work\\nJungle Health and S'}"
445
445
  ]
446
446
  },
447
447
  "execution_count": 8,
@@ -481,7 +481,7 @@
481
481
  },
482
482
  {
483
483
  "cell_type": "markdown",
484
- "id": "ed4182f5",
484
+ "id": "a6ce3f37",
485
485
  "metadata": {},
486
486
  "source": [
487
487
  "Form field extraction enables you to automate data entry and document processing. By combining different techniques like label detection, spatial navigation, and pattern matching, you can handle a wide variety of form layouts. "
@@ -494,6 +494,11 @@
494
494
  "main_language": "python",
495
495
  "notebook_metadata_filter": "-all"
496
496
  },
497
+ "kernelspec": {
498
+ "display_name": "Python (natural-pdf)",
499
+ "language": "python",
500
+ "name": "natural-pdf"
501
+ },
497
502
  "language_info": {
498
503
  "codemirror_mode": {
499
504
  "name": "ipython",
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "7674e123",
5
+ "id": "517f2a42",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Enhanced Table Processing\n",
@@ -15,13 +15,13 @@
15
15
  {
16
16
  "cell_type": "code",
17
17
  "execution_count": 1,
18
- "id": "08c7c5f0",
18
+ "id": "aa46cf5a",
19
19
  "metadata": {
20
20
  "execution": {
21
- "iopub.execute_input": "2025-04-21T21:25:37.324499Z",
22
- "iopub.status.busy": "2025-04-21T21:25:37.324337Z",
23
- "iopub.status.idle": "2025-04-21T21:25:37.328739Z",
24
- "shell.execute_reply": "2025-04-21T21:25:37.328344Z"
21
+ "iopub.execute_input": "2025-04-27T16:33:37.056103Z",
22
+ "iopub.status.busy": "2025-04-27T16:33:37.055816Z",
23
+ "iopub.status.idle": "2025-04-27T16:33:37.061609Z",
24
+ "shell.execute_reply": "2025-04-27T16:33:37.060974Z"
25
25
  }
26
26
  },
27
27
  "outputs": [],
@@ -36,6 +36,11 @@
36
36
  "main_language": "python",
37
37
  "notebook_metadata_filter": "-all"
38
38
  },
39
+ "kernelspec": {
40
+ "display_name": "Python (natural-pdf)",
41
+ "language": "python",
42
+ "name": "natural-pdf"
43
+ },
39
44
  "language_info": {
40
45
  "codemirror_mode": {
41
46
  "name": "ipython",