natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,586 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "38157702",
6
+ "metadata": {},
7
+ "source": [
8
+ "# OCR Integration for Scanned Documents\n",
9
+ "\n",
10
+ "Optical Character Recognition (OCR) allows you to extract text from scanned documents where the text isn't embedded in the PDF. This tutorial shows how to run OCR on a scanned PDF, work with the recognized text elements, and save a searchable copy of the document."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "462de69c",
17
+ "metadata": {
18
+ "execution": {
19
+ "iopub.execute_input": "2025-04-16T14:58:46.724429Z",
20
+ "iopub.status.busy": "2025-04-16T14:58:46.724305Z",
21
+ "iopub.status.idle": "2025-04-16T14:58:46.727892Z",
22
+ "shell.execute_reply": "2025-04-16T14:58:46.727465Z"
23
+ }
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "#%pip install \"natural-pdf[all]\""
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "id": "1509ad46",
34
+ "metadata": {
35
+ "execution": {
36
+ "iopub.execute_input": "2025-04-16T14:58:46.731122Z",
37
+ "iopub.status.busy": "2025-04-16T14:58:46.730093Z",
38
+ "iopub.status.idle": "2025-04-16T14:58:54.166474Z",
39
+ "shell.execute_reply": "2025-04-16T14:58:54.165872Z"
40
+ }
41
+ },
42
+ "outputs": [
43
+ {
44
+ "data": {
45
+ "text/plain": [
46
+ "'Without OCR: 0 characters extracted'"
47
+ ]
48
+ },
49
+ "execution_count": 2,
50
+ "metadata": {},
51
+ "output_type": "execute_result"
52
+ }
53
+ ],
54
+ "source": [
55
+ "from natural_pdf import PDF\n",
56
+ "\n",
57
+ "# Load a PDF\n",
58
+ "pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf\")\n",
59
+ "page = pdf.pages[0]\n",
60
+ "\n",
61
+ "# Try extracting text without OCR\n",
62
+ "text_without_ocr = page.extract_text()\n",
63
+ "f\"Without OCR: {len(text_without_ocr)} characters extracted\""
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "id": "4f9a732e",
69
+ "metadata": {},
70
+ "source": [
71
+ "## Enabling OCR"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 3,
77
+ "id": "9b2cea08",
78
+ "metadata": {
79
+ "execution": {
80
+ "iopub.execute_input": "2025-04-16T14:58:54.168271Z",
81
+ "iopub.status.busy": "2025-04-16T14:58:54.167928Z",
82
+ "iopub.status.idle": "2025-04-16T14:58:54.171228Z",
83
+ "shell.execute_reply": "2025-04-16T14:58:54.170814Z"
84
+ }
85
+ },
86
+ "outputs": [
87
+ {
88
+ "data": {
89
+ "text/plain": [
90
+ "''"
91
+ ]
92
+ },
93
+ "execution_count": 3,
94
+ "metadata": {},
95
+ "output_type": "execute_result"
96
+ }
97
+ ],
98
+ "source": [
99
+ "# Enable OCR for text extraction\n",
100
+ "page.use_ocr = True\n",
101
+ "\n",
102
+ "# Extract text with OCR enabled\n",
103
+ "text_with_ocr = page.extract_text()\n",
104
+ "\n",
105
+ "# Preview the extracted text\n",
106
+ "text_with_ocr[:200] + \"...\" if len(text_with_ocr) > 200 else text_with_ocr"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "id": "75e39372",
112
+ "metadata": {},
113
+ "source": [
114
+ "## Finding Text Elements with OCR"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 4,
120
+ "id": "b253d49f",
121
+ "metadata": {
122
+ "execution": {
123
+ "iopub.execute_input": "2025-04-16T14:58:54.172736Z",
124
+ "iopub.status.busy": "2025-04-16T14:58:54.172581Z",
125
+ "iopub.status.idle": "2025-04-16T14:59:04.346553Z",
126
+ "shell.execute_reply": "2025-04-16T14:59:04.346230Z"
127
+ }
128
+ },
129
+ "outputs": [
130
+ {
131
+ "name": "stderr",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "\u001b[2m2025-04-16T14:58:54.225410Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
135
+ ]
136
+ },
137
+ {
138
+ "name": "stderr",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "[2025-04-16 17:58:54,225] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
142
+ ]
143
+ },
144
+ {
145
+ "data": {
146
+ "text/plain": [
147
+ "<ElementCollection[TextElement](count=49)>"
148
+ ]
149
+ },
150
+ "execution_count": 4,
151
+ "metadata": {},
152
+ "output_type": "execute_result"
153
+ }
154
+ ],
155
+ "source": [
156
+ "# Convert text-as-image to text elements\n",
157
+ "page.apply_ocr()\n",
158
+ "\n",
159
+ "# Select all text pieces on the page\n",
160
+ "text_elements = page.find_all('text')\n",
161
+ "f\"Found {len(text_elements)} text elements\"\n",
162
+ "\n",
163
+ "# Visualize the elements\n",
164
+ "text_elements.highlight()"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "id": "c8e12006",
170
+ "metadata": {},
171
+ "source": [
172
+ "## OCR Configuration Options"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 5,
178
+ "id": "4a77d1bf",
179
+ "metadata": {
180
+ "execution": {
181
+ "iopub.execute_input": "2025-04-16T14:59:04.348402Z",
182
+ "iopub.status.busy": "2025-04-16T14:59:04.348238Z",
183
+ "iopub.status.idle": "2025-04-16T14:59:04.352084Z",
184
+ "shell.execute_reply": "2025-04-16T14:59:04.351691Z"
185
+ }
186
+ },
187
+ "outputs": [
188
+ {
189
+ "data": {
190
+ "text/plain": [
191
+ "' \\n \\n ...'"
192
+ ]
193
+ },
194
+ "execution_count": 5,
195
+ "metadata": {},
196
+ "output_type": "execute_result"
197
+ }
198
+ ],
199
+ "source": [
200
+ "# Set OCR configuration for better results\n",
201
+ "page.ocr_config = {\n",
202
+ " 'language': 'eng', # English\n",
203
+ " 'dpi': 300, # Higher resolution\n",
204
+ "}\n",
205
+ "\n",
206
+ "# Extract text with the improved configuration\n",
207
+ "improved_text = page.extract_text()\n",
208
+ "\n",
209
+ "# Preview the text\n",
210
+ "improved_text[:200] + \"...\" if len(improved_text) > 200 else improved_text"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "markdown",
215
+ "id": "7a702637",
216
+ "metadata": {},
217
+ "source": [
218
+ "## Working with Multi-language Documents"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 6,
224
+ "id": "f42f0c39",
225
+ "metadata": {
226
+ "execution": {
227
+ "iopub.execute_input": "2025-04-16T14:59:04.353822Z",
228
+ "iopub.status.busy": "2025-04-16T14:59:04.353670Z",
229
+ "iopub.status.idle": "2025-04-16T14:59:04.356993Z",
230
+ "shell.execute_reply": "2025-04-16T14:59:04.356695Z"
231
+ }
232
+ },
233
+ "outputs": [
234
+ {
235
+ "data": {
236
+ "text/plain": [
237
+ "' \\n \\n '"
238
+ ]
239
+ },
240
+ "execution_count": 6,
241
+ "metadata": {},
242
+ "output_type": "execute_result"
243
+ }
244
+ ],
245
+ "source": [
246
+ "# Configure for multiple languages\n",
247
+ "page.ocr_config = {\n",
248
+ " 'language': 'eng+fra+deu', # English, French, German\n",
249
+ " 'dpi': 300\n",
250
+ "}\n",
251
+ "\n",
252
+ "# Extract text with multi-language support\n",
253
+ "multilang_text = page.extract_text()\n",
254
+ "multilang_text[:200]"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "id": "46d8fcbb",
260
+ "metadata": {},
261
+ "source": [
262
+ "## Extracting Tables from Scanned Documents"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 7,
268
+ "id": "e2cb5597",
269
+ "metadata": {
270
+ "execution": {
271
+ "iopub.execute_input": "2025-04-16T14:59:04.358447Z",
272
+ "iopub.status.busy": "2025-04-16T14:59:04.358302Z",
273
+ "iopub.status.idle": "2025-04-16T14:59:06.563788Z",
274
+ "shell.execute_reply": "2025-04-16T14:59:06.563483Z"
275
+ }
276
+ },
277
+ "outputs": [
278
+ {
279
+ "name": "stdout",
280
+ "output_type": "stream",
281
+ "text": [
282
+ "\n"
283
+ ]
284
+ },
285
+ {
286
+ "name": "stdout",
287
+ "output_type": "stream",
288
+ "text": [
289
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmps1z5zj11/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1703.3ms\n"
290
+ ]
291
+ },
292
+ {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "Speed: 6.6ms preprocess, 1703.3ms inference, 1.1ms postprocess per image at shape (1, 3, 1024, 800)\n"
297
+ ]
298
+ }
299
+ ],
300
+ "source": [
301
+ "# Enable OCR and analyze the document layout\n",
302
+ "page.use_ocr = True\n",
303
+ "page.analyze_layout()\n",
304
+ "\n",
305
+ "# Find table regions\n",
306
+ "table_regions = page.find_all('region[type=table]')\n",
307
+ "\n",
308
+ "# Visualize any detected tables\n",
309
+ "table_regions.highlight()\n",
310
+ "\n",
311
+ "# Extract the first table if found\n",
312
+ "if table_regions:\n",
313
+ " table_data = table_regions[0].extract_table()\n",
314
+ " table_data\n",
315
+ "else:\n",
316
+ " \"No tables found in the document\""
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "markdown",
321
+ "id": "17eee068",
322
+ "metadata": {},
323
+ "source": [
324
+ "## Finding Form Fields in Scanned Documents"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 8,
330
+ "id": "e22d5704",
331
+ "metadata": {
332
+ "execution": {
333
+ "iopub.execute_input": "2025-04-16T14:59:06.565411Z",
334
+ "iopub.status.busy": "2025-04-16T14:59:06.565245Z",
335
+ "iopub.status.idle": "2025-04-16T14:59:06.570996Z",
336
+ "shell.execute_reply": "2025-04-16T14:59:06.570628Z"
337
+ }
338
+ },
339
+ "outputs": [
340
+ {
341
+ "data": {
342
+ "text/plain": [
343
+ "{'Date: February 3, 1905': \"Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nSummary: Warst of any, however;were the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nhundred yards _ and as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\ntheywere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nInjuryPrevention \\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
344
+ " 'Violation Count': \"Date: February 3, 1905 \\n \\n \\nSummary: Warst of any, however;were the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nhundred yards _ and as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\ninto the vats; and whentheywere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nIneffectiveInjuryPrevention\\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
345
+ " 'Summary: Warst of any, however;': \"Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nwere the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nand as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\nwere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nPrevention \\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
346
+ " 'Inadequate Protective Equipment': 'Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nwere the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nand as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was thattheyfell\\nwere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nLevel \\nCritical \\nSerious \\nSerious \\n \\nFailurc to Properly Storc Hazardous Matcrials_ Critical\\nLack of Adequale Fire Safety Measures_ Serious\\nSerious \\n \\nInsuficlent Employee Trainlng for Safe Work Practices Serlous\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize'}"
347
+ ]
348
+ },
349
+ "execution_count": 8,
350
+ "metadata": {},
351
+ "output_type": "execute_result"
352
+ }
353
+ ],
354
+ "source": [
355
+ "# Look for potential form labels (containing a colon)\n",
356
+ "labels = page.find_all('text:contains(\":\")') \n",
357
+ "\n",
358
+ "# Visualize the labels\n",
359
+ "labels.highlight()\n",
360
+ "\n",
361
+ "# Extract form data by looking to the right of each label\n",
362
+ "form_data = {}\n",
363
+ "for label in labels:\n",
364
+ " # Clean the label text\n",
365
+ " field_name = label.text.strip().rstrip(':')\n",
366
+ " \n",
367
+ " # Find the value to the right\n",
368
+ " value_element = label.right(width=200)\n",
369
+ " value = value_element.extract_text().strip()\n",
370
+ " \n",
371
+ " # Add to our dictionary\n",
372
+ " form_data[field_name] = value\n",
373
+ "\n",
374
+ "# Display the extracted data\n",
375
+ "form_data"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "markdown",
380
+ "id": "4f68f512",
381
+ "metadata": {},
382
+ "source": [
383
+ "## Combining OCR with Layout Analysis"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 9,
389
+ "id": "1135fd7a",
390
+ "metadata": {
391
+ "execution": {
392
+ "iopub.execute_input": "2025-04-16T14:59:06.572437Z",
393
+ "iopub.status.busy": "2025-04-16T14:59:06.572312Z",
394
+ "iopub.status.idle": "2025-04-16T14:59:08.299792Z",
395
+ "shell.execute_reply": "2025-04-16T14:59:08.299429Z"
396
+ }
397
+ },
398
+ "outputs": [
399
+ {
400
+ "name": "stdout",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "\n"
404
+ ]
405
+ },
406
+ {
407
+ "name": "stdout",
408
+ "output_type": "stream",
409
+ "text": [
410
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpnp5bwgzc/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1646.0ms\n"
411
+ ]
412
+ },
413
+ {
414
+ "name": "stdout",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "Speed: 4.5ms preprocess, 1646.0ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)\n"
418
+ ]
419
+ },
420
+ {
421
+ "data": {
422
+ "text/plain": [
423
+ "[]"
424
+ ]
425
+ },
426
+ "execution_count": 9,
427
+ "metadata": {},
428
+ "output_type": "execute_result"
429
+ }
430
+ ],
431
+ "source": [
432
+ "# Apply OCR and analyze layout\n",
433
+ "page.use_ocr = True\n",
434
+ "page.analyze_layout()\n",
435
+ "\n",
436
+ "# Find document structure elements\n",
437
+ "headings = page.find_all('region[type=heading]')\n",
438
+ "paragraphs = page.find_all('region[type=paragraph]')\n",
439
+ "\n",
440
+ "# Visualize the structure\n",
441
+ "headings.highlight(color=\"red\", label=\"Headings\")\n",
442
+ "paragraphs.highlight(color=\"blue\", label=\"Paragraphs\")\n",
443
+ "\n",
444
+ "# Create a simple document outline\n",
445
+ "document_outline = []\n",
446
+ "for heading in headings:\n",
447
+ " heading_text = heading.extract_text()\n",
448
+ " document_outline.append(heading_text)\n",
449
+ "\n",
450
+ "document_outline"
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "markdown",
455
+ "id": "64b5a539",
456
+ "metadata": {},
457
+ "source": [
458
+ "## Working with Multiple Pages"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 10,
464
+ "id": "3b11997f",
465
+ "metadata": {
466
+ "execution": {
467
+ "iopub.execute_input": "2025-04-16T14:59:08.301431Z",
468
+ "iopub.status.busy": "2025-04-16T14:59:08.301234Z",
469
+ "iopub.status.idle": "2025-04-16T14:59:08.305191Z",
470
+ "shell.execute_reply": "2025-04-16T14:59:08.304830Z"
471
+ }
472
+ },
473
+ "outputs": [
474
+ {
475
+ "data": {
476
+ "text/plain": [
477
+ "['Page 1: \\n ...']"
478
+ ]
479
+ },
480
+ "execution_count": 10,
481
+ "metadata": {},
482
+ "output_type": "execute_result"
483
+ }
484
+ ],
485
+ "source": [
486
+ "# Process all pages in the document\n",
487
+ "all_text = []\n",
488
+ "\n",
489
+ "for i, page in enumerate(pdf.pages):\n",
490
+ " # Enable OCR for each page\n",
491
+ " page.use_ocr = True\n",
492
+ " \n",
493
+ " # Extract text\n",
494
+ " page_text = page.extract_text()\n",
495
+ " \n",
496
+ " # Add to our collection with page number\n",
497
+ " all_text.append(f\"Page {i+1}: {page_text[:100]}...\")\n",
498
+ "\n",
499
+ "# Show the text preview collected for each page\n",
500
+ "all_text"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "id": "cb0fd379",
506
+ "metadata": {},
507
+ "source": [
508
+ "## Saving PDFs with Searchable Text\n",
509
+ "\n",
510
+ "After applying OCR to a PDF, you can save a new version of the PDF where the recognized text is embedded as an invisible layer. This makes the text searchable and copyable in standard PDF viewers.\n",
511
+ "\n",
512
+ "Use the `save_searchable()` method on the `PDF` object:"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": 11,
518
+ "id": "2e330bad",
519
+ "metadata": {
520
+ "execution": {
521
+ "iopub.execute_input": "2025-04-16T14:59:08.306802Z",
522
+ "iopub.status.busy": "2025-04-16T14:59:08.306563Z",
523
+ "iopub.status.idle": "2025-04-16T14:59:20.510084Z",
524
+ "shell.execute_reply": "2025-04-16T14:59:20.509716Z"
525
+ }
526
+ },
527
+ "outputs": [
528
+ {
529
+ "name": "stderr",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "\u001b[2m2025-04-16T14:59:08.672820Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
533
+ ]
534
+ },
535
+ {
536
+ "name": "stderr",
537
+ "output_type": "stream",
538
+ "text": [
539
+ "[2025-04-16 17:59:08,672] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
540
+ ]
541
+ }
542
+ ],
543
+ "source": [
544
+ "from natural_pdf import PDF\n",
545
+ "\n",
546
+ "input_pdf_path = \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf\"\n",
547
+ "\n",
548
+ "pdf = PDF(input_pdf_path)\n",
549
+ "pdf.apply_ocr() \n",
550
+ "\n",
551
+ "pdf.save_searchable(\"needs-ocr-searchable.pdf\")"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "markdown",
556
+ "id": "4f2e3d94",
557
+ "metadata": {},
558
+ "source": [
559
+ "This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).\n",
560
+ "\n",
561
+ "OCR integration enables you to work with scanned documents, historical archives, and image-based PDFs that don't have embedded text. By combining OCR with natural-pdf's layout analysis capabilities, you can turn any document into structured, searchable data."
562
+ ]
563
+ }
564
+ ],
565
+ "metadata": {
566
+ "jupytext": {
567
+ "cell_metadata_filter": "-all",
568
+ "main_language": "python",
569
+ "notebook_metadata_filter": "-all"
570
+ },
571
+ "language_info": {
572
+ "codemirror_mode": {
573
+ "name": "ipython",
574
+ "version": 3
575
+ },
576
+ "file_extension": ".py",
577
+ "mimetype": "text/x-python",
578
+ "name": "python",
579
+ "nbconvert_exporter": "python",
580
+ "pygments_lexer": "ipython3",
581
+ "version": "3.10.13"
582
+ }
583
+ },
584
+ "nbformat": 4,
585
+ "nbformat_minor": 5
586
+ }