natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the differences between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,66 +0,0 @@
1
- # Layout Analysis
2
-
3
- Beyond simple text and lines, `natural-pdf` can use layout analysis models (like YOLO or DETR) to identify semantic regions within a page, such as paragraphs, tables, figures, and headers. This provides a higher-level understanding of the document structure.
4
-
5
- Let's analyze the layout of our `01-practice.pdf`.
6
-
7
- ```python
8
- #%pip install "natural-pdf[all]"
9
- ```
10
-
11
- ```python
12
- from natural_pdf import PDF
13
-
14
- # Load the PDF and get the page
15
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
16
- page = pdf.pages[0]
17
-
18
- # Analyze the layout using the default model
19
- # This adds 'detected' Region objects to the page
20
- # It returns an ElementCollection of the detected regions
21
- page.analyze_layout()
22
- detected_regions = page.find_all('region[source="detected"]')
23
- ```
24
-
25
- ```python
26
- # Visualize all detected regions, using default colors based on type
27
- page.clear_highlights() # Clear previous highlights
28
- detected_regions.highlight(group_by='type', include_attrs=['confidence'])
29
-
30
- # Show the image with region overlays
31
- page.to_image(width=900)
32
- ```
33
-
34
- ```python
35
- # Find and visualize only the detected table region(s)
36
- tables = page.find_all('region[type=table]')
37
- tables.show(color='lightgreen', label='Detected Table')
38
- ```
39
-
40
- ```python
41
- # Extract text specifically from the detected table region
42
- table_region = tables.first # Assuming only one table was detected
43
- # Extract text preserving layout
44
- table_text_layout = table_region.extract_text(layout=True)
45
- table_text_layout
46
- ```
47
-
48
- ```python
49
- # Layout-detected regions can also be used for table extraction
50
- # This can be more robust than the basic page.extract_tables()
51
- # especially for tables without clear lines.
52
- table_data = table_region.extract_table()
53
- table_data
54
- ```
55
-
56
- Layout analysis provides structured `Region` objects. You can filter these regions by their predicted `type` and then act on them, for example by visualizing them or extracting text or tables specifically from those regions.
57
-
58
- <div class="admonition note">
59
- <p class="admonition-title">Layout Models and Configuration</p>
60
-
61
- * Layout analysis requires external models. Ensure these are installed.
62
- * You can specify different models (`engine='yolo'`, `engine='detr'`, `engine='paddle'`) or configurations (confidence thresholds, specific classes) via arguments to `page.analyze_layout()`. Different models may perform better on different document types.
63
- * The detected regions are added to the page and can be found using selectors like `page.find_all('region[type=paragraph]')`.
64
- </div>
65
-
66
- ```
@@ -1,413 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "289614f6",
6
- "metadata": {},
7
- "source": [
8
- "# Working with Regions\n",
9
- "\n",
10
- "Regions are rectangular areas on a page that let you focus on specific parts of a document. They're perfect for extracting text from defined areas, finding elements within certain boundaries, and working with document sections."
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 1,
16
- "id": "ee7b29db",
17
- "metadata": {
18
- "execution": {
19
- "iopub.execute_input": "2025-04-21T21:32:06.104242Z",
20
- "iopub.status.busy": "2025-04-21T21:32:06.104024Z",
21
- "iopub.status.idle": "2025-04-21T21:32:06.108388Z",
22
- "shell.execute_reply": "2025-04-21T21:32:06.107767Z"
23
- }
24
- },
25
- "outputs": [],
26
- "source": [
27
- "#%pip install \"natural-pdf[all]\""
28
- ]
29
- },
30
- {
31
- "cell_type": "code",
32
- "execution_count": 2,
33
- "id": "392a4d3b",
34
- "metadata": {
35
- "execution": {
36
- "iopub.execute_input": "2025-04-21T21:32:06.110143Z",
37
- "iopub.status.busy": "2025-04-21T21:32:06.109959Z",
38
- "iopub.status.idle": "2025-04-21T21:32:14.111600Z",
39
- "shell.execute_reply": "2025-04-21T21:32:14.111045Z"
40
- }
41
- },
42
- "outputs": [
43
- {
44
- "data": {
45
- "text/plain": [
46
- "' INS-UP70N51NCL41R \\nSite: Durham’s Meatpacking Chicago, Ill. \\nDate: February 3, 1905 \\nViolation Count: 7 \\nSummary: Worst of any, however, were the fertilizer men, and those who served in the cooking rooms.\\nThese people could not be shown to the visitor - for the odor of a fertilizer man would scare any ordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor, their peculiar trouble was that they fell\\n \\n \\n '"
47
- ]
48
- },
49
- "execution_count": 2,
50
- "metadata": {},
51
- "output_type": "execute_result"
52
- }
53
- ],
54
- "source": [
55
- "from natural_pdf import PDF\n",
56
- "\n",
57
- "# Load a PDF\n",
58
- "pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\")\n",
59
- "page = pdf.pages[0]\n",
60
- "\n",
61
- "# Create a region in the top portion of the page\n",
62
- "top_region = page.create_region(\n",
63
- " 50, # x0 (left)\n",
64
- " 50, # y0 (top)\n",
65
- " page.width - 50, # x1 (right)\n",
66
- " 200 # y1 (bottom)\n",
67
- ")\n",
68
- "\n",
69
- "# Visualize the region\n",
70
- "top_region.show(color=\"blue\", label=\"Top Region\")\n",
71
- "\n",
72
- "# Extract text from this region\n",
73
- "top_region.extract_text()"
74
- ]
75
- },
76
- {
77
- "cell_type": "markdown",
78
- "id": "326305f6",
79
- "metadata": {},
80
- "source": [
81
- "## Creating Regions from Elements"
82
- ]
83
- },
84
- {
85
- "cell_type": "code",
86
- "execution_count": 3,
87
- "id": "76a034f2",
88
- "metadata": {
89
- "execution": {
90
- "iopub.execute_input": "2025-04-21T21:32:14.113859Z",
91
- "iopub.status.busy": "2025-04-21T21:32:14.113455Z",
92
- "iopub.status.idle": "2025-04-21T21:32:14.187851Z",
93
- "shell.execute_reply": "2025-04-21T21:32:14.187217Z"
94
- }
95
- },
96
- "outputs": [
97
- {
98
- "data": {
99
- "text/plain": [
100
- "' INS-UP70N51NCL41R \\n Site: Durham’s Meatpacking Chicago, Ill. \\n Date: February 3, 1905 \\n Violation Count: 7 \\n \\n \\n '"
101
- ]
102
- },
103
- "execution_count": 3,
104
- "metadata": {},
105
- "output_type": "execute_result"
106
- }
107
- ],
108
- "source": [
109
- "# Find an element to create regions around\n",
110
- "title = page.find('text:contains(\"Jungle Health\")')\n",
111
- "\n",
112
- "# Create regions relative to this element\n",
113
- "below_title = title.below(height=100)\n",
114
- "right_of_title = title.right(width=200) \n",
115
- "above_title = title.above(height=50)\n",
116
- "\n",
117
- "# Visualize these regions\n",
118
- "below_title.show(color=\"green\", label=\"Below\")\n",
119
- "right_of_title.show(color=\"red\", label=\"Right\")\n",
120
- "above_title.show(color=\"orange\", label=\"Above\")\n",
121
- "\n",
122
- "# Extract text from the region below the title\n",
123
- "below_title.extract_text()"
124
- ]
125
- },
126
- {
127
- "cell_type": "markdown",
128
- "id": "ad3b2337",
129
- "metadata": {},
130
- "source": [
131
- "## Finding Elements Within Regions"
132
- ]
133
- },
134
- {
135
- "cell_type": "code",
136
- "execution_count": 4,
137
- "id": "633bdc8b",
138
- "metadata": {
139
- "execution": {
140
- "iopub.execute_input": "2025-04-21T21:32:14.189588Z",
141
- "iopub.status.busy": "2025-04-21T21:32:14.189354Z",
142
- "iopub.status.idle": "2025-04-21T21:32:14.245744Z",
143
- "shell.execute_reply": "2025-04-21T21:32:14.245287Z"
144
- }
145
- },
146
- "outputs": [
147
- {
148
- "data": {
149
- "text/plain": [
150
- "3"
151
- ]
152
- },
153
- "execution_count": 4,
154
- "metadata": {},
155
- "output_type": "execute_result"
156
- }
157
- ],
158
- "source": [
159
- "# Create a region for a specific document section\n",
160
- "form_region = page.create_region(50, 100, page.width - 50, 300)\n",
161
- "\n",
162
- "# Find elements only within this region\n",
163
- "labels = form_region.find_all('text:contains(\":\")') \n",
164
- "\n",
165
- "# Visualize the region and the elements found\n",
166
- "form_region.show(color=(0, 0, 1, 0.2), label=\"Form Region\")\n",
167
- "labels.show(color=\"purple\", label=\"Labels\")\n",
168
- "\n",
169
- "# Count the elements found\n",
170
- "len(labels)"
171
- ]
172
- },
173
- {
174
- "cell_type": "markdown",
175
- "id": "1016d6de",
176
- "metadata": {},
177
- "source": [
178
- "## Expanding and Adjusting Regions"
179
- ]
180
- },
181
- {
182
- "cell_type": "code",
183
- "execution_count": 5,
184
- "id": "ef6db1c1",
185
- "metadata": {
186
- "execution": {
187
- "iopub.execute_input": "2025-04-21T21:32:14.247620Z",
188
- "iopub.status.busy": "2025-04-21T21:32:14.247431Z",
189
- "iopub.status.idle": "2025-04-21T21:32:14.297797Z",
190
- "shell.execute_reply": "2025-04-21T21:32:14.297383Z"
191
- }
192
- },
193
- "outputs": [
194
- {
195
- "data": {
196
- "text/plain": [
197
- "'Summary: Worst of any, however, were the fertilizer men\\nThese people could not be shown to the visitor - for the o\\nvisitor at a hundred yards, and as for the other men, who\\nsome of which there were open vats near the level of the\\ninto the vats; and when they were fished out, there was n\\nexhibiting - sometimes they would be overlooked for days\\nto the world as Durham’s Pure Leaf Lard!\\n '"
198
- ]
199
- },
200
- "execution_count": 5,
201
- "metadata": {},
202
- "output_type": "execute_result"
203
- }
204
- ],
205
- "source": [
206
- "# Find an element to work with\n",
207
- "element = page.find('text:contains(\"Summary:\")')\n",
208
- "\n",
209
- "# Create a tight region around the element\n",
210
- "tight_region = page.create_region(\n",
211
- " element.x0, element.top, \n",
212
- " element.x1, element.bottom\n",
213
- ")\n",
214
- "\n",
215
- "# Expand it to include surrounding content\n",
216
- "expanded_region = tight_region.expand(\n",
217
- " left=10, # Expand 10 points to the left\n",
218
- " right=200, # Expand 200 points to the right\n",
219
- " top=5, # Expand 5 points above\n",
220
- " bottom=100 # Expand 100 points below\n",
221
- ")\n",
222
- "\n",
223
- "# Visualize both regions\n",
224
- "tight_region.show(color=\"red\", label=\"Original\")\n",
225
- "expanded_region.show(color=\"blue\", label=\"Expanded\")\n",
226
- "\n",
227
- "# Extract the content from the expanded region\n",
228
- "expanded_region.extract_text()"
229
- ]
230
- },
231
- {
232
- "cell_type": "markdown",
233
- "id": "3a0a59e5",
234
- "metadata": {},
235
- "source": [
236
- "## Creating Bounded Regions"
237
- ]
238
- },
239
- {
240
- "cell_type": "code",
241
- "execution_count": 6,
242
- "id": "083c200e",
243
- "metadata": {
244
- "execution": {
245
- "iopub.execute_input": "2025-04-21T21:32:14.299580Z",
246
- "iopub.status.busy": "2025-04-21T21:32:14.299178Z",
247
- "iopub.status.idle": "2025-04-21T21:32:14.330220Z",
248
- "shell.execute_reply": "2025-04-21T21:32:14.329836Z"
249
- }
250
- },
251
- "outputs": [
252
- {
253
- "data": {
254
- "text/plain": [
255
- "' Jungle Health and Safety Inspection Service\\n INS-UP70N51NCL41R \\nSite: Durham’s Meatpacking Chicago, Ill. ...'"
256
- ]
257
- },
258
- "execution_count": 6,
259
- "metadata": {},
260
- "output_type": "execute_result"
261
- }
262
- ],
263
- "source": [
264
- "# Find two elements to serve as boundaries\n",
265
- "start_elem = page.find('text:contains(\"Summary:\")')\n",
266
- "end_elem = page.find('text:contains(\"Statute\")')\n",
267
- "\n",
268
- "# Create a region from start to end element\n",
269
- "bounded_region = start_elem.until(end_elem)\n",
270
- "\n",
271
- "# Visualize the bounded region\n",
272
- "bounded_region.show(color=\"green\", label=\"Bounded Region\")\n",
273
- "\n",
274
- "# Extract text from this bounded region\n",
275
- "bounded_region.extract_text()[:200] + \"...\" if len(bounded_region.extract_text()) > 200 else bounded_region.extract_text()"
276
- ]
277
- },
278
- {
279
- "cell_type": "markdown",
280
- "id": "231224fa",
281
- "metadata": {},
282
- "source": [
283
- "## Working with Multiple Regions"
284
- ]
285
- },
286
- {
287
- "cell_type": "code",
288
- "execution_count": 7,
289
- "id": "d520009e",
290
- "metadata": {
291
- "execution": {
292
- "iopub.execute_input": "2025-04-21T21:32:14.331874Z",
293
- "iopub.status.busy": "2025-04-21T21:32:14.331693Z",
294
- "iopub.status.idle": "2025-04-21T21:32:14.410562Z",
295
- "shell.execute_reply": "2025-04-21T21:32:14.410194Z"
296
- }
297
- },
298
- "outputs": [
299
- {
300
- "data": {
301
- "text/plain": [
302
- "{'header': ' \\n \\n \\n Jungle Health and Safety Inspection Service\\n INS-UP70N51NCL41R \\n \\n Site: Durham’s Meatpacking Chicago, Ill. ',\n",
303
- " 'main': 'ruary 3, 1905 \\nCount: 7 ...',\n",
304
- " 'footer': ' Jungle Health and Safety Inspection Service \\n \\n '}"
305
- ]
306
- },
307
- "execution_count": 7,
308
- "metadata": {},
309
- "output_type": "execute_result"
310
- }
311
- ],
312
- "source": [
313
- "# Define multiple regions to extract different parts of the document\n",
314
- "header_region = page.create_region(0, 0, page.width, 100)\n",
315
- "main_region = page.create_region(100, 100, page.width - 100, page.height - 150)\n",
316
- "footer_region = page.create_region(0, page.height - 50, page.width, page.height)\n",
317
- "\n",
318
- "# Visualize all regions\n",
319
- "header_region.show(color=\"blue\", label=\"Header\")\n",
320
- "main_region.show(color=\"green\", label=\"Main Content\")\n",
321
- "footer_region.show(color=\"red\", label=\"Footer\")\n",
322
- "\n",
323
- "# Extract content from each region\n",
324
- "document_parts = {\n",
325
- " \"header\": header_region.extract_text(),\n",
326
- " \"main\": main_region.extract_text()[:100] + \"...\",\n",
327
- " \"footer\": footer_region.extract_text()\n",
328
- "}\n",
329
- "\n",
330
- "# Show what we extracted\n",
331
- "document_parts"
332
- ]
333
- },
334
- {
335
- "cell_type": "markdown",
336
- "id": "3168edfa",
337
- "metadata": {},
338
- "source": [
339
- "## Creating an Image of a Region"
340
- ]
341
- },
342
- {
343
- "cell_type": "code",
344
- "execution_count": 8,
345
- "id": "eb460e68",
346
- "metadata": {
347
- "execution": {
348
- "iopub.execute_input": "2025-04-21T21:32:14.412800Z",
349
- "iopub.status.busy": "2025-04-21T21:32:14.412651Z",
350
- "iopub.status.idle": "2025-04-21T21:32:14.469471Z",
351
- "shell.execute_reply": "2025-04-21T21:32:14.469002Z"
352
- }
353
- },
354
- "outputs": [
355
- {
356
- "data": {
357
- "image/png": "",
358
- "text/plain": [
359
- "<PIL.Image.Image image mode=RGBA size=1275x206>"
360
- ]
361
- },
362
- "execution_count": 8,
363
- "metadata": {},
364
- "output_type": "execute_result"
365
- }
366
- ],
367
- "source": [
368
- "# Find a region of interest\n",
369
- "table_header = page.find('text:contains(\"Statute\")')\n",
370
- "table_region = table_header.below(height=100)\n",
371
- "\n",
372
- "# Visualize the region\n",
373
- "table_region.show(color=\"purple\", label=\"Table Region\")\n",
374
- "\n",
375
- "# Create an image of just this region\n",
376
- "table_region.to_image(resolution=150)"
377
- ]
378
- },
379
- {
380
- "cell_type": "markdown",
381
- "id": "20c711fe",
382
- "metadata": {},
383
- "source": [
384
- "Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need. "
385
- ]
386
- }
387
- ],
388
- "metadata": {
389
- "jupytext": {
390
- "cell_metadata_filter": "-all",
391
- "main_language": "python",
392
- "notebook_metadata_filter": "-all",
393
- "text_representation": {
394
- "extension": ".md",
395
- "format_name": "markdown"
396
- }
397
- },
398
- "language_info": {
399
- "codemirror_mode": {
400
- "name": "ipython",
401
- "version": 3
402
- },
403
- "file_extension": ".py",
404
- "mimetype": "text/x-python",
405
- "name": "python",
406
- "nbconvert_exporter": "python",
407
- "pygments_lexer": "ipython3",
408
- "version": "3.10.13"
409
- }
410
- },
411
- "nbformat": 4,
412
- "nbformat_minor": 5
413
- }