natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,190 +0,0 @@
1
- # Spatial Navigation
2
-
3
- Spatial navigation lets you work with PDF content based on the physical layout of elements on the page. It's perfect for finding elements relative to each other and extracting information in context.
4
-
5
- ```python
6
- #%pip install "natural-pdf[all]"
7
- ```
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load a PDF
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
- page = pdf.pages[0]
15
-
16
- # Find the title of the document
17
- title = page.find('text:contains("Jungle Health")')
18
-
19
- # Visualize our starting point
20
- title.show(color="red", label="Document Title")
21
-
22
- # Display the title text
23
- title.text
24
- ```
25
-
26
- ## Finding Elements Above and Below
27
-
28
- ```python
29
- # Create a region below the title
30
- region_below = title.below(height=100)
31
-
32
- # Visualize the region
33
- region_below.show(color="blue", label="Below Title")
34
-
35
- # Find and extract text from this region
36
- text_below = region_below.extract_text()
37
- text_below
38
- ```
39
-
40
- ## Finding Content Between Elements
41
-
42
- ```python
43
- # Find two labels to serve as boundaries
44
- site_label = page.find('text:contains("Site:")')
45
- date_label = page.find('text:contains("Date:")')
46
-
47
- # Get the region between these labels
48
- between_region = site_label.below(
49
- include_element=True, # Include starting element
50
- until='text:contains("Date:")', # Stop at this element
51
- include_endpoint=False # Don't include ending element
52
- )
53
-
54
- # Visualize the region between labels
55
- between_region.show(color="green", label="Between")
56
-
57
- # Extract text from this bounded area
58
- between_region.extract_text()
59
- ```
60
-
61
- ## Navigating Left and Right
62
-
63
- ```python
64
- # Find a field label
65
- site_label = page.find('text:contains("Site:")')
66
-
67
- # Get the content to the right (the field value)
68
- value_region = site_label.right(width=200)
69
-
70
- # Visualize the label and value regions
71
- site_label.show(color="red", label="Label")
72
- value_region.show(color="blue", label="Value")
73
-
74
- # Extract just the value text
75
- value_region.extract_text()
76
- ```
77
-
78
- ## Finding Adjacent Elements
79
-
80
- ```python
81
- # Start with a label element
82
- label = page.find('text:contains("Site:")')
83
-
84
- # Find the next and previous elements in reading order
85
- next_elem = label.next()
86
- prev_elem = label.prev()
87
-
88
- # Visualize all three elements
89
- label.show(color="red", label="Current")
90
- next_elem.show(color="green", label="Next") if next_elem else None
91
- prev_elem.show(color="blue", label="Previous") if prev_elem else None
92
-
93
- # Show the text of adjacent elements
94
- {
95
- "current": label.text,
96
- "next": next_elem.text if next_elem else "None",
97
- "previous": prev_elem.text if prev_elem else "None"
98
- }
99
- ```
100
-
101
- ## Combining with Element Selectors
102
-
103
- ```python
104
- # Find a section label
105
- summary = page.find('text:contains("Summary:")')
106
-
107
- # Find the next bold text element
108
- next_bold = summary.next('text:bold', limit=20)
109
-
110
- # Find the nearest line element
111
- nearest_line = summary.nearest('line')
112
-
113
- # Visualize what we found
114
- summary.show(color="red", label="Summary")
115
- next_bold.show(color="blue", label="Next Bold") if next_bold else None
116
- nearest_line.show(color="green", label="Nearest Line") if nearest_line else None
117
-
118
- # Show the content we found
119
- {
120
- "summary": summary.text,
121
- "next_bold": next_bold.text if next_bold else "None found",
122
- "nearest_line": nearest_line if nearest_line else "None found"
123
- }
124
- ```
125
-
126
- ## Extracting Table Rows with Spatial Navigation
127
-
128
- ```python
129
- # Find a table heading
130
- table_heading = page.find('text:contains("Statute")')
131
- table_heading.show(color="purple", label="Table Header")
132
-
133
- # Extract table rows using spatial navigation
134
- rows = []
135
- current = table_heading
136
-
137
- # Get the next 4 rows
138
- for i in range(4):
139
- # Find the next row below the current one
140
- next_row = current.below(height=15)
141
-
142
- if next_row:
143
- rows.append(next_row)
144
- current = next_row # Move to the next row
145
- else:
146
- break
147
-
148
- # Visualize all found rows
149
- page.clear_highlights()
150
- for i, row in enumerate(rows):
151
- row.highlight(label=f"Row {i+1}")
152
- page.to_image(width=700)
153
- ```
154
-
155
- ```python
156
- # Extract text from each row
157
- [row.extract_text() for row in rows]
158
- ```
159
-
160
- ## Extracting Key-Value Pairs
161
-
162
- ```python
163
- # Find all potential field labels (text with a colon)
164
- labels = page.find_all('text:contains(":")')
165
-
166
- # Visualize the labels
167
- labels.show(color="blue", label="Labels")
168
-
169
- # Extract key-value pairs
170
- field_data = {}
171
-
172
- for label in labels:
173
- # Clean up the label text
174
- key = label.text.strip().rstrip(':')
175
-
176
- # Skip if not a proper label
177
- if not key:
178
- continue
179
-
180
- # Get the value to the right
181
- value = label.right(width=200).extract_text().strip()
182
-
183
- # Add to our collection
184
- field_data[key] = value
185
-
186
- # Show the extracted data
187
- field_data
188
- ```
189
-
190
- Spatial navigation mimics how humans read documents, letting you navigate content based on physical relationships between elements. It's especially useful for extracting structured data from forms, tables, and formatted documents.