natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,256 +0,0 @@
1
- # Section Extraction
2
-
3
- Documents are often organized into logical sections like chapters, articles, or content blocks. This tutorial shows how to extract these sections with natural-pdf, using a library weeding log as an example.
4
-
5
- ```python
6
- #%pip install "natural-pdf[all]"
7
- ```
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load the PDF directly from its URL
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
14
- page = pdf.pages[0]
15
-
16
- # Find horizontal lines that separate book entries
17
- horizontal_lines = page.find_all('line:horizontal')
18
-
19
- # Visualize the potential section boundaries
20
- horizontal_lines.highlight(color="red", label="Section Boundaries")
21
- page.to_image()
22
- ```
23
-
24
- ```python
25
- # Count what we found
26
- len(horizontal_lines)
27
- ```
28
-
29
- ## Basic Section Extraction
30
-
31
- ```python
32
- # Extract sections based on horizontal lines
33
- # Each section starts at a horizontal line and ends at the next one
34
- book_sections = page.get_sections(
35
- start_elements=horizontal_lines,
36
- boundary_inclusion='start' # Include the boundary in the section
37
- )
38
-
39
- # Visualize each section
40
- page.clear_highlights()
41
- for section in book_sections:
42
- section.highlight()
43
- page.to_image()
44
- ```
45
-
46
- ```python
47
- # Display section count and preview the first section
48
- {
49
- "total_sections": len(book_sections),
50
- "first_section_text": book_sections[0].extract_text()[:100] + "..." if book_sections else "No sections found"
51
- }
52
- ```
53
-
54
- ## Working with Section Content
55
-
56
- ```python
57
- # Extract and display content from the first few book entries
58
- book_entries = []
59
-
60
- for i, section in enumerate(book_sections[:5]):
61
- # Extract the section text
62
- text = section.extract_text().strip()
63
-
64
- # Try to parse book information
65
- title = ""
66
- author = ""
67
- isbn = ""
68
-
69
- # Extract title (typically the first line)
70
- title_match = section.find('text:contains("Title:")')
71
- if title_match:
72
- title_value = title_match.right(width=400).extract_text()
73
- title = title_value.strip()
74
-
75
- # Extract author
76
- author_match = section.find('text:contains("Author:")')
77
- if author_match:
78
- author_value = author_match.right(width=400).extract_text()
79
- author = author_value.strip()
80
-
81
- # Extract ISBN
82
- isbn_match = section.find('text:contains("ISBN:")')
83
- if isbn_match:
84
- isbn_value = isbn_match.right(width=400).extract_text()
85
- isbn = isbn_value.strip()
86
-
87
- # Add to our collection
88
- book_entries.append({
89
- "number": i + 1,
90
- "title": title,
91
- "author": author,
92
- "isbn": isbn,
93
- "preview": text[:50] + "..." if len(text) > 50 else text
94
- })
95
-
96
- # Display the structured book entries
97
- import pandas as pd
98
- pd.DataFrame(book_entries)
99
- ```
100
-
101
- ## Using Different Section Boundaries
102
-
103
- ```python
104
- page.viewer()
105
- ```
106
-
107
- ```python
108
- # Find title elements with specific selectors
109
- title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
110
- title_elements.show()
111
- ```
112
-
113
- ```python
114
- # Extract sections starting from each title
115
- # This now directly returns an ElementCollection
116
- title_sections = page.get_sections(
117
- start_elements=title_elements,
118
- boundary_inclusion='start'
119
- )
120
-
121
- # Show the title-based sections
122
- page.clear_highlights()
123
- title_sections.highlight()
124
- page.to_image()
125
- ```
126
-
127
- ```python
128
- # Count the sections found
129
- len(title_sections)
130
- ```
131
-
132
- ## Section Boundary Inclusion Options
133
-
134
- ```python
135
- # Use horizontal line elements as section dividers
136
- dividers = page.find_all('line[horizontal]')
137
-
138
- # Compare the different boundary inclusion options
139
- inclusion_options = {
140
- 'none': page.get_sections(start_elements=dividers, boundary_inclusion='none'),
141
- 'start': page.get_sections(start_elements=dividers, boundary_inclusion='start'),
142
- 'end': page.get_sections(start_elements=dividers, boundary_inclusion='end'),
143
- 'both': page.get_sections(start_elements=dividers, boundary_inclusion='both')
144
- }
145
-
146
- # Count sections with each option
147
- section_counts = {option: len(sections) for option, sections in inclusion_options.items()}
148
- section_counts
149
- ```
150
-
151
- ## Custom Section Boundaries
152
-
153
- ```python
154
- # Define specific start and end points - let's extract just one book entry
155
- # We'll use the first and second title elements as the start and end points
156
- page.clear_highlights()
157
-
158
- start_point = title_elements[0]
159
- end_point = title_elements[1]
160
-
161
- # Extract the section between these points
162
- single_book_entry = page.get_sections(
163
- start_elements=[start_point],
164
- end_elements=[end_point],
165
- boundary_inclusion='start' # Include the start but not the end
166
- )
167
-
168
- # Visualize the custom section
169
- single_book_entry.highlight(color="green", label="Single Book Entry")
170
-
171
- print(single_book_entry[0].extract_text())
172
-
173
- page.to_image()
174
- ```
175
-
176
- ## Multi-page Sections
177
-
178
- ```python
179
- # Get sections across the first two pages
180
- multi_page_sections = [] # Initialize as a list
181
-
182
- for page_num in range(min(2, len(pdf.pages))):
183
- page = pdf.pages[page_num]
184
-
185
- # Find the title elements on this page (text below the thick rule, matched by font and size)
186
- title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
187
-
188
- # Get sections for this page (returns ElementCollection)
189
- page_sections = page.get_sections(
190
- start_elements=title_elements,
191
- boundary_inclusion='start'
192
- )
193
-
194
- # Add elements from the collection to our list
195
- multi_page_sections.extend(page_sections) # list.extend works with iterables like ElementCollection
196
-
197
- # Display the page number and a short text preview for every section
198
- [{
199
- "page": section.page.number + 1, # 1-indexed page number for display
200
- "text": section.extract_text()[:50] + "..." if len(section.extract_text()) > 50 else section.extract_text()
201
- } for section in multi_page_sections]
202
- ```
203
-
204
- ## Building a Book Database
205
-
206
- ```python
207
- # Extract all book entries across multiple pages
208
- book_database = []
209
-
210
- # Process first 3 pages (or fewer if the document is shorter)
211
- for page_num in range(min(3, len(pdf.pages))):
212
- page = pdf.pages[page_num]
213
-
214
- # Find the title elements on this page (text below the thick rule, matched by font and size)
215
- title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
216
-
217
- # Get sections for this page
218
- book_sections = page.get_sections(
219
- start_elements=title_elements,
220
- boundary_inclusion='start'
221
- )
222
-
223
- # Process each book section
224
- for section in book_sections:
225
- # Skip sections that are too short (might be headers/footers)
226
- if len(section.extract_text()) < 50:
227
- continue
228
-
229
- # Extract book information
230
- book_info = {"page": page_num + 1}
231
-
232
- for field in ["Title:", "Author:", "ISBN:", "Publisher:", "Copyright:"]:
233
- field_element = section.find(f'text:contains("{field}")')
234
- if field_element:
235
- field_name = field.strip(':').lower()
236
- field_value = field_element.extract_text().replace(field, '').strip()
237
- book_info[field_name] = field_value
238
-
239
- # For these fields, the value sits below the field label rather than beside it
240
- for field in ["Price", "Acquired", "Barcode", "Removed By"]:
241
- field_element = section.find(f'text:contains("{field}")')
242
- if field_element:
243
- field_name = field.lower()
244
- field_value = field_element.below(height=10, width='element').expand(right=50).extract_text().strip()
245
- book_info[field_name] = field_value
246
-
247
- book_database.append(book_info)
248
-
249
- # Display sample entries (df.head() shows the first 5 rows)
250
- import pandas as pd
251
-
252
- df = pd.json_normalize(book_database)
253
- df.head()
254
- ```
255
-
256
- Section extraction lets you break down documents into logical parts, making it easier to generate summaries, extract specific content, and create structured data from semi-structured documents. In this example, we've shown how to convert a PDF library catalog into a structured book database.