natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "38157702",
5
+ "id": "8b02fa9e",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# OCR Integration for Scanned Documents\n",
@@ -13,13 +13,13 @@
13
13
  {
14
14
  "cell_type": "code",
15
15
  "execution_count": 1,
16
- "id": "462de69c",
16
+ "id": "bde55ac1",
17
17
  "metadata": {
18
18
  "execution": {
19
- "iopub.execute_input": "2025-04-16T14:58:46.724429Z",
20
- "iopub.status.busy": "2025-04-16T14:58:46.724305Z",
21
- "iopub.status.idle": "2025-04-16T14:58:46.727892Z",
22
- "shell.execute_reply": "2025-04-16T14:58:46.727465Z"
19
+ "iopub.execute_input": "2025-04-21T21:32:06.104226Z",
20
+ "iopub.status.busy": "2025-04-21T21:32:06.104019Z",
21
+ "iopub.status.idle": "2025-04-21T21:32:06.108232Z",
22
+ "shell.execute_reply": "2025-04-21T21:32:06.107754Z"
23
23
  }
24
24
  },
25
25
  "outputs": [],
@@ -30,13 +30,13 @@
30
30
  {
31
31
  "cell_type": "code",
32
32
  "execution_count": 2,
33
- "id": "1509ad46",
33
+ "id": "5c624a53",
34
34
  "metadata": {
35
35
  "execution": {
36
- "iopub.execute_input": "2025-04-16T14:58:46.731122Z",
37
- "iopub.status.busy": "2025-04-16T14:58:46.730093Z",
38
- "iopub.status.idle": "2025-04-16T14:58:54.166474Z",
39
- "shell.execute_reply": "2025-04-16T14:58:54.165872Z"
36
+ "iopub.execute_input": "2025-04-21T21:32:06.110125Z",
37
+ "iopub.status.busy": "2025-04-21T21:32:06.109925Z",
38
+ "iopub.status.idle": "2025-04-21T21:32:14.008764Z",
39
+ "shell.execute_reply": "2025-04-21T21:32:14.008268Z"
40
40
  }
41
41
  },
42
42
  "outputs": [
@@ -65,50 +65,7 @@
65
65
  },
66
66
  {
67
67
  "cell_type": "markdown",
68
- "id": "4f9a732e",
69
- "metadata": {},
70
- "source": [
71
- "## Enabling OCR"
72
- ]
73
- },
74
- {
75
- "cell_type": "code",
76
- "execution_count": 3,
77
- "id": "9b2cea08",
78
- "metadata": {
79
- "execution": {
80
- "iopub.execute_input": "2025-04-16T14:58:54.168271Z",
81
- "iopub.status.busy": "2025-04-16T14:58:54.167928Z",
82
- "iopub.status.idle": "2025-04-16T14:58:54.171228Z",
83
- "shell.execute_reply": "2025-04-16T14:58:54.170814Z"
84
- }
85
- },
86
- "outputs": [
87
- {
88
- "data": {
89
- "text/plain": [
90
- "''"
91
- ]
92
- },
93
- "execution_count": 3,
94
- "metadata": {},
95
- "output_type": "execute_result"
96
- }
97
- ],
98
- "source": [
99
- "# Enable OCR for text extraction\n",
100
- "page.use_ocr = True\n",
101
- "\n",
102
- "# Extract text with OCR enabled\n",
103
- "text_with_ocr = page.extract_text()\n",
104
- "\n",
105
- "# Preview the extracted text\n",
106
- "text_with_ocr[:200] + \"...\" if len(text_with_ocr) > 200 else text_with_ocr"
107
- ]
108
- },
109
- {
110
- "cell_type": "markdown",
111
- "id": "75e39372",
68
+ "id": "461a5090",
112
69
  "metadata": {},
113
70
  "source": [
114
71
  "## Finding Text Elements with OCR"
@@ -116,14 +73,14 @@
116
73
  },
117
74
  {
118
75
  "cell_type": "code",
119
- "execution_count": 4,
120
- "id": "b253d49f",
76
+ "execution_count": 3,
77
+ "id": "895e3c2c",
121
78
  "metadata": {
122
79
  "execution": {
123
- "iopub.execute_input": "2025-04-16T14:58:54.172736Z",
124
- "iopub.status.busy": "2025-04-16T14:58:54.172581Z",
125
- "iopub.status.idle": "2025-04-16T14:59:04.346553Z",
126
- "shell.execute_reply": "2025-04-16T14:59:04.346230Z"
80
+ "iopub.execute_input": "2025-04-21T21:32:14.010745Z",
81
+ "iopub.status.busy": "2025-04-21T21:32:14.010324Z",
82
+ "iopub.status.idle": "2025-04-21T21:32:28.416856Z",
83
+ "shell.execute_reply": "2025-04-21T21:32:28.416360Z"
127
84
  }
128
85
  },
129
86
  "outputs": [
@@ -131,23 +88,23 @@
131
88
  "name": "stderr",
132
89
  "output_type": "stream",
133
90
  "text": [
134
- "\u001b[2m2025-04-16T14:58:54.225410Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
91
+ "\u001b[2m2025-04-21T21:32:14.064078Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
135
92
  ]
136
93
  },
137
94
  {
138
95
  "name": "stderr",
139
96
  "output_type": "stream",
140
97
  "text": [
141
- "[2025-04-16 17:58:54,225] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
98
+ "[2025-04-21 17:32:14,064] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
142
99
  ]
143
100
  },
144
101
  {
145
102
  "data": {
146
103
  "text/plain": [
147
- "<ElementCollection[TextElement](count=49)>"
104
+ "<ElementCollection[TextElement](count=47)>"
148
105
  ]
149
106
  },
150
- "execution_count": 4,
107
+ "execution_count": 3,
151
108
  "metadata": {},
152
109
  "output_type": "execute_result"
153
110
  }
@@ -166,7 +123,7 @@
166
123
  },
167
124
  {
168
125
  "cell_type": "markdown",
169
- "id": "c8e12006",
126
+ "id": "36051d57",
170
127
  "metadata": {},
171
128
  "source": [
172
129
  "## OCR Configuration Options"
@@ -174,14 +131,14 @@
174
131
  },
175
132
  {
176
133
  "cell_type": "code",
177
- "execution_count": 5,
178
- "id": "4a77d1bf",
134
+ "execution_count": 4,
135
+ "id": "d4461746",
179
136
  "metadata": {
180
137
  "execution": {
181
- "iopub.execute_input": "2025-04-16T14:59:04.348402Z",
182
- "iopub.status.busy": "2025-04-16T14:59:04.348238Z",
183
- "iopub.status.idle": "2025-04-16T14:59:04.352084Z",
184
- "shell.execute_reply": "2025-04-16T14:59:04.351691Z"
138
+ "iopub.execute_input": "2025-04-21T21:32:28.418763Z",
139
+ "iopub.status.busy": "2025-04-21T21:32:28.418565Z",
140
+ "iopub.status.idle": "2025-04-21T21:32:28.423024Z",
141
+ "shell.execute_reply": "2025-04-21T21:32:28.422671Z"
185
142
  }
186
143
  },
187
144
  "outputs": [
@@ -191,7 +148,7 @@
191
148
  "' \\n \\n ...'"
192
149
  ]
193
150
  },
194
- "execution_count": 5,
151
+ "execution_count": 4,
195
152
  "metadata": {},
196
153
  "output_type": "execute_result"
197
154
  }
@@ -212,7 +169,7 @@
212
169
  },
213
170
  {
214
171
  "cell_type": "markdown",
215
- "id": "7a702637",
172
+ "id": "d5a96ac7",
216
173
  "metadata": {},
217
174
  "source": [
218
175
  "## Working with Multi-language Documents"
@@ -220,14 +177,14 @@
220
177
  },
221
178
  {
222
179
  "cell_type": "code",
223
- "execution_count": 6,
224
- "id": "f42f0c39",
180
+ "execution_count": 5,
181
+ "id": "9fa156f5",
225
182
  "metadata": {
226
183
  "execution": {
227
- "iopub.execute_input": "2025-04-16T14:59:04.353822Z",
228
- "iopub.status.busy": "2025-04-16T14:59:04.353670Z",
229
- "iopub.status.idle": "2025-04-16T14:59:04.356993Z",
230
- "shell.execute_reply": "2025-04-16T14:59:04.356695Z"
184
+ "iopub.execute_input": "2025-04-21T21:32:28.424374Z",
185
+ "iopub.status.busy": "2025-04-21T21:32:28.424235Z",
186
+ "iopub.status.idle": "2025-04-21T21:32:28.428114Z",
187
+ "shell.execute_reply": "2025-04-21T21:32:28.427816Z"
231
188
  }
232
189
  },
233
190
  "outputs": [
@@ -237,7 +194,7 @@
237
194
  "' \\n \\n '"
238
195
  ]
239
196
  },
240
- "execution_count": 6,
197
+ "execution_count": 5,
241
198
  "metadata": {},
242
199
  "output_type": "execute_result"
243
200
  }
@@ -256,7 +213,7 @@
256
213
  },
257
214
  {
258
215
  "cell_type": "markdown",
259
- "id": "46d8fcbb",
216
+ "id": "d3ccf43f",
260
217
  "metadata": {},
261
218
  "source": [
262
219
  "## Extracting Tables from Scanned Documents"
@@ -264,17 +221,45 @@
264
221
  },
265
222
  {
266
223
  "cell_type": "code",
267
- "execution_count": 7,
268
- "id": "e2cb5597",
224
+ "execution_count": 6,
225
+ "id": "ee7a7e7d",
269
226
  "metadata": {
270
227
  "execution": {
271
- "iopub.execute_input": "2025-04-16T14:59:04.358447Z",
272
- "iopub.status.busy": "2025-04-16T14:59:04.358302Z",
273
- "iopub.status.idle": "2025-04-16T14:59:06.563788Z",
274
- "shell.execute_reply": "2025-04-16T14:59:06.563483Z"
228
+ "iopub.execute_input": "2025-04-21T21:32:28.429414Z",
229
+ "iopub.status.busy": "2025-04-21T21:32:28.429283Z",
230
+ "iopub.status.idle": "2025-04-21T21:32:30.754086Z",
231
+ "shell.execute_reply": "2025-04-21T21:32:30.753700Z"
275
232
  }
276
233
  },
277
234
  "outputs": [
235
+ {
236
+ "name": "stderr",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "\u001b[2m2025-04-21T21:32:28.446098Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
240
+ ]
241
+ },
242
+ {
243
+ "name": "stderr",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "[2025-04-21 17:32:28,446] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
247
+ ]
248
+ },
249
+ {
250
+ "name": "stderr",
251
+ "output_type": "stream",
252
+ "text": [
253
+ "\u001b[2m2025-04-21T21:32:28.446834Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stderr",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "[2025-04-21 17:32:28,446] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
261
+ ]
262
+ },
278
263
  {
279
264
  "name": "stdout",
280
265
  "output_type": "stream",
@@ -286,14 +271,14 @@
286
271
  "name": "stdout",
287
272
  "output_type": "stream",
288
273
  "text": [
289
- "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmps1z5zj11/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1703.3ms\n"
274
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpjbbxsx1v/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1940.4ms\n"
290
275
  ]
291
276
  },
292
277
  {
293
278
  "name": "stdout",
294
279
  "output_type": "stream",
295
280
  "text": [
296
- "Speed: 6.6ms preprocess, 1703.3ms inference, 1.1ms postprocess per image at shape (1, 3, 1024, 800)\n"
281
+ "Speed: 5.4ms preprocess, 1940.4ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)\n"
297
282
  ]
298
283
  }
299
284
  ],
@@ -318,7 +303,7 @@
318
303
  },
319
304
  {
320
305
  "cell_type": "markdown",
321
- "id": "17eee068",
306
+ "id": "6a3c701e",
322
307
  "metadata": {},
323
308
  "source": [
324
309
  "## Finding Form Fields in Scanned Documents"
@@ -326,27 +311,28 @@
326
311
  },
327
312
  {
328
313
  "cell_type": "code",
329
- "execution_count": 8,
330
- "id": "e22d5704",
314
+ "execution_count": 7,
315
+ "id": "7180badd",
331
316
  "metadata": {
332
317
  "execution": {
333
- "iopub.execute_input": "2025-04-16T14:59:06.565411Z",
334
- "iopub.status.busy": "2025-04-16T14:59:06.565245Z",
335
- "iopub.status.idle": "2025-04-16T14:59:06.570996Z",
336
- "shell.execute_reply": "2025-04-16T14:59:06.570628Z"
318
+ "iopub.execute_input": "2025-04-21T21:32:30.755960Z",
319
+ "iopub.status.busy": "2025-04-21T21:32:30.755766Z",
320
+ "iopub.status.idle": "2025-04-21T21:32:30.762760Z",
321
+ "shell.execute_reply": "2025-04-21T21:32:30.762434Z"
337
322
  }
338
323
  },
339
324
  "outputs": [
340
325
  {
341
326
  "data": {
342
327
  "text/plain": [
343
- "{'Date: February 3, 1905': \"Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nSummary: Warst of any, however;were the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nhundred yards _ and as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\ntheywere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nInjuryPrevention \\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
344
- " 'Violation Count': \"Date: February 3, 1905 \\n \\n \\nSummary: Warst of any, however;were the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nhundred yards _ and as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\ninto the vats; and whentheywere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nIneffectiveInjuryPrevention\\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
345
- " 'Summary: Warst of any, however;': \"Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nwere the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nand as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was that\\nwere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\nto the world as Durham's Pure Leaf Lardl\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nUnsanitary Working Conditions_\\nInadequate Protective Equipment:\\nPrevention \\n \\nFailurc to Properly Storc Hazardous Matcrials_\\nLack of Adequale Fire Safety Measures_\\nInadequate Ventilation Systems.\\n \\nInsuficlent Employee Trainlng for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize\",\n",
346
- " 'Inadequate Protective Equipment': 'Jungle Health and Safety Irspectlon Servlce\\n \\n \\n \\n \\n \\n \\n \\nwere the fertilizer men, and those who served in the coaking roams\\nThese people could not be shown to the visltor_for the odor of a fertllizer man would scare any ordlnary\\nand as far the ather men_who warked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; thelr pecullar trouble was thattheyfell\\nwere fished out; there was never enough of them left to be worth\\nsometimes they would be overlooked for days, till all but the bones of them had gone out\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nLevel \\nCritical \\nSerious \\nSerious \\n \\nFailurc to Properly Storc Hazardous Matcrials_ Critical\\nLack of Adequale Fire Safety Measures_ Serious\\nSerious \\n \\nInsuficlent Employee Trainlng for Safe Work Practices Serlous\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Hlealth and Safety Inspection Servize'}"
328
+ "{\"Site: Durham's Meatpacking Chicago, IIl.\": 'Jungle Health and Satety Inspection Service\\nINS-UPONSINCLAIR \\n \\n \\n \\n \\n \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nwould be overlooked for days, till all but the bones of them had gone out\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nLevel \\nUnsanitary Working Conditions Critical\\nInadequate Protective Equipment: Serious\\n \\nSerious \\nFailure to Properly Storc Hazardous Materials_ Critical\\nSafety Measures_ Serious \\nInadequate Ventilation Systems Serious\\n \\nInsufficient Employee Training for Safe Work Practices Serious\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Salety Irspection Service',\n",
329
+ " 'Date: February 3, 1905': \"Jungle Health and Satety Inspection Service\\n INS-UPONSINCLAIR \\n \\nSite: Durham's Meatpacking Chicago, IIl.\\n \\n \\n \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\ntheywould be overlooked for days, till all but the bones of them had gone out\\nto thc world as Durham's Purc Lcaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions\\nInadequate Protective Equipment:\\nIneffective Injury Prevention _\\n \\nFailure to Properly Storc Hazardous Materials_\\nLack of AdequateFireSafety Measures_\\nInadequate Ventilation Systems\\n \\nInsufficient Employee Training for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Salety Irspection Service\",\n",
330
+ " 'Violation Count': \"Site: Durham's Meatpacking Chicago, IIl.\\nDate: February 3, 1905 \\n \\n \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nsometimestheywould be overlooked for days, till all but the bones of them had gone out\\nto thc world as Durham's Purc Lcaf Lard!\\n \\n \\n \\n \\n \\n \\n \\n \\n \\nDescription \\n \\nUnsanitary Working Conditions\\nInadequate Protective Equipment:\\nIneffective Injury Prevention _\\n \\nFailure to Properly Storc Hazardous Materials_\\nLack of AdequateFireSafety Measures_\\nInadequate Ventilation Systems\\n \\nInsufficient Employee Training for Safe Work Practices\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Salety Irspection Service\",\n",
331
+ " 'Summary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms': 'Red (ZGB tuple] \\n \\nJungle Health and Satety Inspection Service\\n \\n \\n \\n \\n \\n \\n \\nordinary \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nRepeat?',\n",
332
+ " 'Inadequate Protective Equipment': 'Jungle Health and Satety Inspection Service\\nINS-UPONSINCLAIR \\n \\n \\n \\n \\n \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare anyordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nwould be overlooked for days, till all but the bones of them had gone out\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nLevel \\nCritical \\nSerious \\n \\nSerious \\nFailure to Properly Storc Hazardous Materials_ Critical\\nSafety Measures_ Serious \\nSerious \\n \\nInsufficient Employee Training for Safe Work Practices Serious\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nJungle Health and Salety Irspection Service'}"
347
333
  ]
348
334
  },
349
- "execution_count": 8,
335
+ "execution_count": 7,
350
336
  "metadata": {},
351
337
  "output_type": "execute_result"
352
338
  }
@@ -377,7 +363,7 @@
377
363
  },
378
364
  {
379
365
  "cell_type": "markdown",
380
- "id": "4f68f512",
366
+ "id": "5495e93c",
381
367
  "metadata": {},
382
368
  "source": [
383
369
  "## Combining OCR with Layout Analysis"
@@ -385,17 +371,45 @@
385
371
  },
386
372
  {
387
373
  "cell_type": "code",
388
- "execution_count": 9,
389
- "id": "1135fd7a",
374
+ "execution_count": 8,
375
+ "id": "20b489df",
390
376
  "metadata": {
391
377
  "execution": {
392
- "iopub.execute_input": "2025-04-16T14:59:06.572437Z",
393
- "iopub.status.busy": "2025-04-16T14:59:06.572312Z",
394
- "iopub.status.idle": "2025-04-16T14:59:08.299792Z",
395
- "shell.execute_reply": "2025-04-16T14:59:08.299429Z"
378
+ "iopub.execute_input": "2025-04-21T21:32:30.764203Z",
379
+ "iopub.status.busy": "2025-04-21T21:32:30.764045Z",
380
+ "iopub.status.idle": "2025-04-21T21:32:32.790129Z",
381
+ "shell.execute_reply": "2025-04-21T21:32:32.789771Z"
396
382
  }
397
383
  },
398
384
  "outputs": [
385
+ {
386
+ "name": "stderr",
387
+ "output_type": "stream",
388
+ "text": [
389
+ "\u001b[2m2025-04-21T21:32:30.782293Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
390
+ ]
391
+ },
392
+ {
393
+ "name": "stderr",
394
+ "output_type": "stream",
395
+ "text": [
396
+ "[2025-04-21 17:32:30,782] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
397
+ ]
398
+ },
399
+ {
400
+ "name": "stderr",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "\u001b[2m2025-04-21T21:32:30.783192Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
404
+ ]
405
+ },
406
+ {
407
+ "name": "stderr",
408
+ "output_type": "stream",
409
+ "text": [
410
+ "[2025-04-21 17:32:30,783] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
411
+ ]
412
+ },
399
413
  {
400
414
  "name": "stdout",
401
415
  "output_type": "stream",
@@ -407,14 +421,14 @@
407
421
  "name": "stdout",
408
422
  "output_type": "stream",
409
423
  "text": [
410
- "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpnp5bwgzc/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1646.0ms\n"
424
+ "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmprtsl29ey/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1925.6ms\n"
411
425
  ]
412
426
  },
413
427
  {
414
428
  "name": "stdout",
415
429
  "output_type": "stream",
416
430
  "text": [
417
- "Speed: 4.5ms preprocess, 1646.0ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)\n"
431
+ "Speed: 4.7ms preprocess, 1925.6ms inference, 1.2ms postprocess per image at shape (1, 3, 1024, 800)\n"
418
432
  ]
419
433
  },
420
434
  {
@@ -423,7 +437,7 @@
423
437
  "[]"
424
438
  ]
425
439
  },
426
- "execution_count": 9,
440
+ "execution_count": 8,
427
441
  "metadata": {},
428
442
  "output_type": "execute_result"
429
443
  }
@@ -452,7 +466,7 @@
452
466
  },
453
467
  {
454
468
  "cell_type": "markdown",
455
- "id": "64b5a539",
469
+ "id": "320bdfc4",
456
470
  "metadata": {},
457
471
  "source": [
458
472
  "## Working with Multiple Pages"
@@ -460,14 +474,14 @@
460
474
  },
461
475
  {
462
476
  "cell_type": "code",
463
- "execution_count": 10,
464
- "id": "3b11997f",
477
+ "execution_count": 9,
478
+ "id": "9421a04d",
465
479
  "metadata": {
466
480
  "execution": {
467
- "iopub.execute_input": "2025-04-16T14:59:08.301431Z",
468
- "iopub.status.busy": "2025-04-16T14:59:08.301234Z",
469
- "iopub.status.idle": "2025-04-16T14:59:08.305191Z",
470
- "shell.execute_reply": "2025-04-16T14:59:08.304830Z"
481
+ "iopub.execute_input": "2025-04-21T21:32:32.791525Z",
482
+ "iopub.status.busy": "2025-04-21T21:32:32.791398Z",
483
+ "iopub.status.idle": "2025-04-21T21:32:32.796295Z",
484
+ "shell.execute_reply": "2025-04-21T21:32:32.795973Z"
471
485
  }
472
486
  },
473
487
  "outputs": [
@@ -477,7 +491,7 @@
477
491
  "['Page 1: \\n ...']"
478
492
  ]
479
493
  },
480
- "execution_count": 10,
494
+ "execution_count": 9,
481
495
  "metadata": {},
482
496
  "output_type": "execute_result"
483
497
  }
@@ -502,7 +516,7 @@
502
516
  },
503
517
  {
504
518
  "cell_type": "markdown",
505
- "id": "cb0fd379",
519
+ "id": "d69c14d1",
506
520
  "metadata": {},
507
521
  "source": [
508
522
  "## Saving PDFs with Searchable Text\n",
@@ -514,14 +528,14 @@
514
528
  },
515
529
  {
516
530
  "cell_type": "code",
517
- "execution_count": 11,
518
- "id": "2e330bad",
531
+ "execution_count": 10,
532
+ "id": "e84f8946",
519
533
  "metadata": {
520
534
  "execution": {
521
- "iopub.execute_input": "2025-04-16T14:59:08.306802Z",
522
- "iopub.status.busy": "2025-04-16T14:59:08.306563Z",
523
- "iopub.status.idle": "2025-04-16T14:59:20.510084Z",
524
- "shell.execute_reply": "2025-04-16T14:59:20.509716Z"
535
+ "iopub.execute_input": "2025-04-21T21:32:32.797789Z",
536
+ "iopub.status.busy": "2025-04-21T21:32:32.797610Z",
537
+ "iopub.status.idle": "2025-04-21T21:32:49.165749Z",
538
+ "shell.execute_reply": "2025-04-21T21:32:49.165293Z"
525
539
  }
526
540
  },
527
541
  "outputs": [
@@ -529,14 +543,14 @@
529
543
  "name": "stderr",
530
544
  "output_type": "stream",
531
545
  "text": [
532
- "\u001b[2m2025-04-16T14:59:08.672820Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
546
+ "\u001b[2m2025-04-21T21:32:32.910436Z\u001b[0m [\u001b[33m\u001b[1mwarning \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
533
547
  ]
534
548
  },
535
549
  {
536
550
  "name": "stderr",
537
551
  "output_type": "stream",
538
552
  "text": [
539
- "[2025-04-16 17:59:08,672] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
553
+ "[2025-04-21 17:32:32,910] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
540
554
  ]
541
555
  }
542
556
  ],
@@ -553,7 +567,7 @@
553
567
  },
554
568
  {
555
569
  "cell_type": "markdown",
556
- "id": "4f2e3d94",
570
+ "id": "cd0b43ed",
557
571
  "metadata": {},
558
572
  "source": [
559
573
  "This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).\n",
@@ -566,7 +580,11 @@
566
580
  "jupytext": {
567
581
  "cell_metadata_filter": "-all",
568
582
  "main_language": "python",
569
- "notebook_metadata_filter": "-all"
583
+ "notebook_metadata_filter": "-all",
584
+ "text_representation": {
585
+ "extension": ".md",
586
+ "format_name": "markdown"
587
+ }
570
588
  },
571
589
  "language_info": {
572
590
  "codemirror_mode": {
@@ -18,19 +18,6 @@ text_without_ocr = page.extract_text()
18
18
  f"Without OCR: {len(text_without_ocr)} characters extracted"
19
19
  ```
20
20
 
21
- ## Enabling OCR
22
-
23
- ```python
24
- # Enable OCR for text extraction
25
- page.use_ocr = True
26
-
27
- # Extract text with OCR enabled
28
- text_with_ocr = page.extract_text()
29
-
30
- # Preview the extracted text
31
- text_with_ocr[:200] + "..." if len(text_with_ocr) > 200 else text_with_ocr
32
- ```
33
-
34
21
  ## Finding Text Elements with OCR
35
22
 
36
23
  ```python