natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,292 @@
1
+ # Text Extraction Guide
2
+
3
+ This guide demonstrates various ways to extract text from PDFs using Natural PDF, from simple page dumps to targeted extraction based on elements, regions, and styles.
4
+
5
+ ## Setup
6
+
7
+ First, let's import the necessary libraries and load a sample PDF. We'll use `01-practice.pdf`, loaded directly from the Natural PDF GitHub repository. *Swap in a local path or a different URL if your setup differs.*
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ # Load the PDF
13
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
+
15
+ # Select the first page for initial examples
16
+ page = pdf.pages[0]
17
+
18
+ # Display the first page
19
+ page.show(width=700)
20
+ ```
21
+
22
+ ## Basic Text Extraction
23
+
24
+ Get all text from a page or the entire document.
25
+
26
+ ```python
27
+ # Extract all text from the first page
28
+ # Displaying first 500 characters
29
+ print(page.extract_text()[:500])
30
+ ```
31
+
32
+ You can also preserve layout with `layout=True`.
33
+
34
+ ```python
35
+ # Extract text from the first page, preserving its visual layout
36
+ # Displaying first 2000 characters
37
+ print(page.extract_text(layout=True)[:2000])
38
+ ```
39
+
40
+ ## Extracting Text from Specific Elements
41
+
42
+ Use selectors with `find()` or `find_all()` to target specific elements. *Selectors like `:contains("Summary")` are examples; adapt them to your PDF.*
43
+
44
+ ```python
45
+ # Find a single element, e.g., the text containing "Site"
46
+ # Adjust selector as needed
47
+ date_element = page.find('text:contains("Site")')
48
+ date_element # Display the found element object
49
+ ```
50
+
51
+ ```python
52
+ date_element.show()
53
+ ```
54
+
55
+ ```python
56
+ date_element.text
57
+ ```
58
+
59
+ ```python
60
+ # Find multiple elements, e.g., bold headings (size >= 8)
61
+ heading_elements = page.find_all('text[size>=8]:bold')
62
+ heading_elements
63
+ ```
64
+
65
+ ```python
66
+ page.find_all('text[size>=8]:bold').show()
67
+ ```
68
+
69
+ ```python
70
+ # Pull out all of their text (why? I don't know!)
71
+ print(heading_elements.extract_text())
72
+ ```
73
+
74
+ ## Advanced Text Searches
75
+
76
+ ```python
77
+ # Exact phrase (case-sensitive)
78
+ page.find('text:contains("Hazardous Materials")').text
79
+ ```
80
+
81
+ ```python
82
+ # Same phrase, matched case-insensitively
83
+ page.find('text:contains("HAZARDOUS MATERIALS")', case=False).text
84
+ ```
85
+
86
+ ```python
87
+ # Regular expression (e.g., a day and year such as "3, 2019")
88
+ regex = "\d+, \d{4}"
89
+ page.find(f'text:contains("{regex}")', regex=True)
90
+ ```
91
+
92
+ ```python
93
+ # Combine attribute filters (e.g., 10pt Helvetica text)
94
+ page.find_all('text[fontname="Helvetica"][size=10]')
95
+ ```
96
+
97
+ ## Regions
98
+
99
+ ```python
100
+ # Region below an element (e.g., below "Summary")
101
+ # Adjust selector as needed
102
+ page.find('text:contains("Summary")').below(include_element=True).show()
103
+ ```
104
+
105
+ ```python
106
+ (
107
+ page
108
+ .find('text:contains("Summary")')
109
+ .below(include_element=True)
110
+ .extract_text()
111
+ [:500]
112
+ )
113
+ ```
114
+
115
+ ```python
116
+ (
117
+ page
118
+ .find('text:contains("Summary")')
119
+ .below(include_element=True, until='line:horizontal')
120
+ .show()
121
+ )
122
+ ```
123
+
124
+ ```python
125
+ # Manually defined region via coordinates (x0, top, x1, bottom)
126
+ manual_region = page.create_region(30, 60, 600, 300)
127
+ manual_region.show()
128
+ ```
129
+
130
+ ```python
131
+ # Extract text from the manual region
132
+ manual_region.extract_text()[:500]
133
+ ```
134
+
135
+ ## Filtering Out Headers and Footers
136
+
137
+ Use Exclusion Zones to remove unwanted content before extraction. *Adjust selectors for typical header/footer content.*
138
+
139
+ ```python
140
+ header_content = page.find('rect')
141
+ footer_content = page.find_all('line')[-1].below()
142
+
143
+ header_content.highlight()
144
+ footer_content.highlight()
145
+ page.to_image()
146
+ ```
147
+
148
+ ```python
149
+ page.extract_text()[:500]
150
+ ```
151
+
152
+ ```python
153
+ page.add_exclusion(header_content)
154
+ page.add_exclusion(footer_content)
155
+ ```
156
+
157
+ ```python
158
+ page.extract_text()[:500]
159
+ ```
160
+
161
+ ```python
162
+ full_text_no_exclusions = page.extract_text(use_exclusions=False)
163
+ clean_text = page.extract_text()
164
+ f"Original length: {len(full_text_no_exclusions)}, Excluded length: {len(clean_text)}"
165
+ ```
166
+
167
+ ```python
168
+ page.clear_exclusions()
169
+ ```
170
+
171
+ *Exclusions can also be defined globally at the PDF level using `pdf.add_exclusion()` with a function.*
172
+
173
+ ## Controlling Whitespace
174
+
175
+ Manage how spaces and blank lines are handled during extraction using `layout`.
176
+
177
+ ```python
178
+ print(page.extract_text())
179
+ ```
180
+
181
+ ```python
182
+ print(page.extract_text(use_exclusions=False, layout=True))
183
+ ```
184
+
185
+ ## Font Information Access
186
+
187
+ Inspect font details of text elements.
188
+
189
+ ```python
190
+ # Pick a text element on the page (index 1, i.e., the second one)
191
+ first_text = page.find_all('text')[1]
192
+ first_text # Display basic info
193
+ ```
194
+
195
+ ```python
196
+ # Highlight the selected text element
197
+ first_text.show()
198
+ ```
199
+
200
+ ```python
201
+ # Get detailed font properties dictionary
202
+ first_text.font_info()
203
+ ```
204
+
205
+ ```python
206
+ # Check specific style properties directly
207
+ f"Is Bold: {first_text.bold}, Is Italic: {first_text.italic}, Font: {first_text.fontname}, Size: {first_text.size}"
208
+ ```
209
+
210
+ ```python
211
+ # Find elements by font attributes (adjust selectors)
212
+ # Example: Find text whose font name contains "Helvetica"
213
+ arial_text = page.find_all('text[fontname*=Helvetica]')
214
+ arial_text # Display list of found elements
215
+ ```
216
+
217
+ ```python
218
+ # Example: Find large text (e.g., size >= 12)
219
+ large_text = page.find_all('text[size>=12]')
220
+ large_text
221
+ ```
222
+
223
+ ```python
224
+ # Example: Find bold text
225
+ bold_text = page.find_all('text:bold')
226
+ bold_text
227
+ ```
228
+
229
+ ## Working with Font Styles
230
+
231
+ Analyze and group text elements by their computed font *style*, which combines attributes like font name, size, boldness, etc., into logical groups.
232
+
233
+ ```python
234
+ # Analyze styles on the page
235
+ # This returns a dictionary mapping style names to ElementList objects
236
+ page.analyze_text_styles()
237
+ page.text_style_labels
238
+ ```
239
+
240
+ ```python
241
+ page.find_all('text').highlight(group_by='style_label').to_image()
242
+ ```
243
+
244
+ ```python
245
+ page.find_all('text[style_label="8.0pt Helvetica"]')
246
+ ```
247
+
248
+ ```python
249
+ page.find_all('text[fontname="Helvetica"][size=8]')
250
+ ```
251
+
252
+ *Font variants (e.g., `AAAAAB+FontName`) are also accessible via the `font-variant` attribute selector: `page.find_all('text[font-variant="AAAAAB"]')`.*
253
+
254
+ ## Reading Order
255
+
256
+ Text extraction follows a best-effort approximation of natural reading order (top-to-bottom, left-to-right by default). `page.find_all('text')` returns elements already sorted this way.
257
+
258
+ ```python
259
+ # Get first 5 text elements in reading order
260
+ elements_in_order = page.find_all('text')
261
+ elements_in_order[:5]
262
+ ```
263
+
264
+ ```python
265
+ # Text extracted via page.extract_text() respects this order automatically
266
+ # (Result already shown in Basic Text Extraction section)
267
+ page.extract_text()[:100]
268
+ ```
269
+
270
+ ## Element Navigation
271
+
272
+ Move between elements sequentially based on reading order using `.next()` and `.previous()`.
273
+
274
+ ```python
275
+ page.clear_highlights()
276
+
277
+ start = page.find('text:contains("Date")')
278
+ start.highlight(label='Date label')
279
+ start.next().highlight(label='Maybe the date', color='green')
280
+ start.next('text:contains("\d")', regex=True).highlight(label='Probably the date')
281
+
282
+ page.to_image()
283
+ ```
284
+
285
+ ## Next Steps
286
+
287
+ Now that you know how to extract text, you might want to explore:
288
+
289
+ - [Working with regions](../regions/index.ipynb) for more precise extraction
290
+ - [OCR capabilities](../ocr/index.md) for scanned documents
291
+ - [Document layout analysis](../layout-analysis/index.ipynb) for automatic structure detection
292
+ - [Document QA](../document-qa/index.ipynb) for asking questions directly to your documents