natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/regions/index.md DELETED
@@ -1,294 +0,0 @@
1
- # Working with Regions
2
-
3
- Regions are rectangular areas on a page that define boundaries for operations like text extraction, element finding, or visualization. They're one of Natural PDF's most powerful features for working with specific parts of a document.
4
-
5
- ## Setup
6
-
7
- Let's set up a PDF to experiment with regions.
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load the PDF
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
-
15
- # Get the first page
16
- page = pdf.pages[0]
17
-
18
- # Display the page
19
- page.show(width=700)
20
- ```
21
-
22
- ## Creating Regions
23
-
24
- There are several ways to create regions in Natural PDF.
25
-
26
- ### Using `create_region()` with Coordinates
27
-
28
- This is the most direct method - provide the coordinates directly.
29
-
30
- ```python
31
- # Create a region by specifying (x0, top, x1, bottom) coordinates
32
- # Let's create a region in the middle of the page
33
- mid_region = page.create_region(
34
- x0=100, # Left edge
35
- top=200, # Top edge
36
- x1=500, # Right edge
37
- bottom=400 # Bottom edge
38
- )
39
-
40
- # Highlight the region to see it
41
- mid_region.highlight(color="blue").show()
42
- ```
43
-
44
- ### Using Element Methods: `above()`, `below()`, `left()`, `right()`
45
-
46
- You can create regions relative to existing elements.
47
-
48
- ```python
49
- # Find a heading-like element
50
- heading = page.find('text[size>=12]:bold')
51
-
52
- # Create a region below this heading element
53
- if heading:
54
- region_below = heading.below()
55
-
56
- # Highlight the heading and the region below it
57
- heading.highlight(color="red")
58
- region_below.highlight(color="blue")
59
- page.show()
60
- ```
61
-
62
- ```python
63
- # Create a region with height limit
64
- if heading:
65
- # Only include 100 points below the heading
66
- small_region_below = heading.below(height=100)
67
-
68
- page.clear_highlights()
69
- heading.highlight(color="red")
70
- small_region_below.highlight(color="green")
71
- page.show()
72
- ```
73
-
74
- ```python
75
- # Find a line or other element to create a region above
76
- line = page.find('line')
77
- if line:
78
- # Create a region above the line
79
- region_above = line.above()
80
-
81
- page.clear_highlights()
82
- line.highlight(color="black")
83
- region_above.highlight(color="purple")
84
- page.show()
85
- ```
86
-
87
- ### Creating a Region Between Elements with the `until` Parameter
88
-
89
- ```python
90
- # Find two elements to use as boundaries
91
- first_heading = page.find('text[size>=11]:bold')
92
- next_heading = first_heading.next('text[size>=11]:bold') if first_heading else None
93
-
94
- if first_heading and next_heading:
95
- # Create a region from the first heading until the next heading
96
- section = first_heading.below(until=next_heading, include_endpoint=False)
97
-
98
- # Highlight both elements and the region between them
99
- page.clear_highlights()
100
- first_heading.highlight(color="red")
101
- next_heading.highlight(color="red")
102
- section.highlight(color="yellow")
103
- page.show()
104
- ```
105
-
106
- ## Using Regions
107
-
108
- Once you have a region, here's what you can do with it.
109
-
110
- ### Extract Text from a Region
111
-
112
- ```python
113
- # Find a region to work with (e.g., from a title to the next horizontal line)
114
- title = page.find('text:contains("Site")') # Adjust if needed
115
- if title:
116
- # Create a region from the title down to the next horizontal line
117
- content_region = title.below(until='line:horizontal', include_endpoint=False)
118
-
119
- # Extract text from just this region
120
- region_text = content_region.extract_text()
121
-
122
- # Show the region and the extracted text
123
- page.clear_highlights()
124
- content_region.highlight(color="green")
125
- page.show()
126
-
127
- # Displaying the text (first 300 chars if long)
128
- print(region_text[:300] + "..." if len(region_text) > 300 else region_text)
129
- ```
130
-
131
- ### Find Elements Within a Region
132
-
133
- You can use a region as a "filter" to only find elements within its boundaries.
134
-
135
- ```python
136
- # Create a region in an interesting part of the page
137
- test_region = page.create_region(
138
- x0=page.width * 0.1,
139
- top=page.height * 0.25,
140
- x1=page.width * 0.9,
141
- bottom=page.height * 0.75
142
- )
143
-
144
- # Find all text elements ONLY within this region
145
- text_in_region = test_region.find_all('text')
146
-
147
- # Display result
148
- page.clear_highlights()
149
- test_region.highlight(color="blue")
150
- text_in_region.highlight(color="red")
151
- page.show()
152
-
153
- len(text_in_region) # Number of text elements found in region
154
- ```
155
-
156
- ### Generate an Image of a Region
157
-
158
- ```python
159
- # Find a specific region to capture
160
- # (Could be a table, figure, or any significant area)
161
- region_for_image = page.create_region(
162
- x0=100,
163
- top=150,
164
- x1=page.width - 100,
165
- bottom=300
166
- )
167
-
168
- # Generate an image of just this region
169
- region_for_image.to_image(crop_only=True) # Shows just the region
170
- ```
171
-
172
- ### Adjust and Expand Regions
173
-
174
- ```python
175
- # Take an existing region and expand it
176
- region_a = page.create_region(200, 200, 400, 400)
177
-
178
- # Expand by a certain number of points in each direction
179
- expanded = region_a.expand(left=20, right=20, top=20, bottom=20)
180
-
181
- # Visualize original and expanded regions
182
- page.clear_highlights()
183
- region_a.highlight(color="blue", label="Original")
184
- expanded.highlight(color="red", label="Expanded")
185
- page.to_image()
186
- ```
187
-
188
- ## Using Exclusion Zones with Regions
189
-
190
- Exclusion zones are regions that you want to ignore during operations like text extraction.
191
-
192
- ```python
193
- # Create a region for the whole page
194
- full_page_region = page.create_region(0, 0, page.width, page.height)
195
-
196
- # Extract text without exclusions as baseline
197
- full_text = full_page_region.extract_text()
198
- print(f"Full page text length: {len(full_text)} characters")
199
- ```
200
-
201
- ```python
202
- # Define an area we want to exclude (like a header)
203
- # Let's exclude the top 10% of the page
204
- header_zone = page.create_region(0, 0, page.width, page.height * 0.1)
205
-
206
- # Add this as an exclusion for the page
207
- page.add_exclusion(header_zone)
208
-
209
- # Visualize the exclusion
210
- page.clear_highlights()
211
- header_zone.highlight(color="red", label="Excluded")
212
- page.show()
213
- ```
214
-
215
- ```python
216
- # Now extract text again - the header should be excluded
217
- text_with_exclusion = full_page_region.extract_text() # Uses apply_exclusions=True by default
218
-
219
- # Compare text lengths
220
- print(f"Original text: {len(full_text)} chars\nText with exclusion: {len(text_with_exclusion)} chars")
221
- print(f"Difference: {len(full_text) - len(text_with_exclusion)} chars excluded")
222
- ```
223
-
224
- ```python
225
- # When done with this page, clear exclusions
226
- page.clear_exclusions()
227
- ```
228
-
229
- ## Document-Level Exclusions
230
-
231
- PDF-level exclusions apply to all pages and use functions to adapt to each page.
232
-
233
- ```python
234
- # Define a PDF-level exclusion for headers
235
- # This will exclude the top 30% of every page
236
- pdf.add_exclusion(
237
- lambda p: p.create_region(0, 0, p.width, p.height * 0.3),
238
- label="Header zone"
239
- )
240
-
241
- # Define a PDF-level exclusion for footers
242
- # This will exclude the bottom 20% of every page
243
- pdf.add_exclusion(
244
- lambda p: p.create_region(0, p.height * 0.8, p.width, p.height),
245
- label="Footer zone"
246
- )
247
-
248
- # PDF-level exclusions are used whenever you extract text
249
- # Let's try on the first three pages
250
- for page in pdf.pages[:3]:
251
- text = page.extract_text()
252
- text_original = page.extract_text(use_exclusions=False)
253
- print(f"Page {page.number} – Before: {len(text_original)} After: {len(text)}")
254
- ```
255
-
256
- ```python
257
- # Clear PDF-level exclusions when done
258
- pdf.clear_exclusions()
259
- print("Cleared all PDF-level exclusions")
260
- ```
261
-
262
- ## Working with Layout Analysis Regions
263
-
264
- When you run layout analysis, the detected regions (tables, titles, etc.) are also Region objects.
265
-
266
- ```python
267
- # First, run layout analysis to detect regions
268
- page.analyze_layout() # Uses 'yolo' engine by default
269
-
270
- # Find all detected regions
271
- detected_regions = page.find_all('region')
272
- print(f"Found {len(detected_regions)} layout regions")
273
- ```
274
-
275
- ```python
276
- # Highlight all detected regions by type
277
- detected_regions.highlight(group_by='region_type').show()
278
- ```
279
-
280
- ```python
281
- # Extract text from a specific region type (e.g., title)
282
- title_regions = page.find_all('region[type=title]')
283
- if title_regions:
284
- titles_text = title_regions.extract_text()
285
- print(f"Title text: {titles_text}")
286
- ```
287
-
288
- ## Next Steps
289
-
290
- Now that you understand regions, you can:
291
-
292
- - [Extract tables](../tables/index.ipynb) from table regions
293
- - [Ask questions](../document-qa/index.ipynb) about specific regions
294
- - [Exclude content](../text-extraction/index.md#filtering-out-headers-and-footers) from extraction