natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,201 +0,0 @@
1
- # Form Field Extraction
2
-
3
- Business documents like invoices, forms, and applications contain field-value pairs that need to be extracted. This tutorial shows how to identify and extract these form fields.
4
-
5
- ```python
6
- #%pip install "natural-pdf[all]"
7
- ```
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load a PDF
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
- page = pdf.pages[0]
15
-
16
- # Find potential field labels: text elements that contain a colon
17
- labels = page.find_all('text:contains(":")')
18
-
19
- # Visualize the found labels
20
- labels.show(color="blue", label="Field Labels")
21
-
22
- # Count how many potential fields we found
23
- len(labels)
24
- ```
25
-
26
- ## Extracting Field Values
27
-
28
- ```python
29
- # Extract the value for each field label
30
- form_data = {}
31
-
32
- for label in labels:
33
- # Clean up the label text
34
- field_name = label.text.strip().rstrip(':')
35
-
36
- # Find the value to the right of the label
37
- value_region = label.right(width=200)
38
- value = value_region.extract_text().strip()
39
-
40
- # Store in our dictionary
41
- form_data[field_name] = value
42
-
43
- # Display the extracted data
44
- form_data
45
- ```
46
-
47
- ## Visualizing Labels and Values
48
-
49
- ```python
50
- # Clear previous highlights
51
- page.clear_highlights()
52
-
53
- # Highlight both labels and their values
54
- for label in labels:
55
- # Highlight the label in red
56
- label.show(color="red", label="Label")
57
-
58
- # Highlight the value area in blue
59
- label.right(width=200).show(color="blue", label="Value")
60
-
61
- # Show the page image with highlighted elements
62
- page.to_image()
63
- ```
64
-
65
- ## Handling Multi-line Values
66
-
67
- ```python
68
- # Extract values that might span multiple lines
69
- multi_line_data = {}
70
-
71
- for label in labels:
72
- # Get the field name
73
- field_name = label.text.strip().rstrip(':')
74
-
75
- # Look both to the right and below
76
- right_value = label.right(width=200).extract_text().strip()
77
- below_value = label.below(height=50).extract_text().strip()
78
-
79
- # Use the below text alone if it already contains the right-hand text; otherwise combine the two
80
- if right_value in below_value:
81
- value = below_value
82
- else:
83
- value = f"{right_value} {below_value}".strip()
84
-
85
- # Add to results
86
- multi_line_data[field_name] = value
87
-
88
- # Show fields with potential multi-line values
89
- multi_line_data
90
- ```
91
-
92
- ## Finding Pattern-Based Fields
93
-
94
- ```python
95
- import re
96
-
97
- # Find dates in the format "July 31, YYYY"
98
- date_pattern = r'\b\w+ \d+, \d\d\d\d\b'
99
-
100
- # Search all text elements for dates
101
- text_elements = page.find_all('text')
102
- print([elem.text for elem in text_elements])
103
- dates = text_elements.filter(lambda elem: re.search(date_pattern, elem.text))
104
-
105
- # Visualize the date fields
106
- dates.show(color="green", label="Date")
107
-
108
- # Extract just the date values
109
- date_texts = [re.search(date_pattern, elem.text).group(0) for elem in dates]
110
- date_texts
111
- ```
112
-
113
- ## Working with Form Tables
114
-
115
- ```python
116
- # Run layout analysis to find table structures
117
- page.analyze_layout()
118
-
119
- # Find possible form tables
120
- tables = page.find_all('region[type=table]')
121
-
122
- if tables:
123
- # Visualize the tables
124
- tables.show(color="purple", label="Form Table")
125
-
126
- # Extract data from the first table
127
- first_table = tables[0]
128
- table_data = first_table.extract_table()
129
- table_data
130
- else:
131
- # Try to find form-like structure using text alignment
132
- # Create a region where a form might be
133
- form_region = page.create_region(50, 200, page.width - 50, 500)
134
-
135
- # Group text by vertical position
136
- rows = {}
137
- text_elements = form_region.find_all('text')
138
-
139
- for elem in text_elements:
140
- # Round y-position to group elements in the same row
141
- row_pos = round(elem.top / 5) * 5
142
- if row_pos not in rows:
143
- rows[row_pos] = []
144
- rows[row_pos].append(elem)
145
-
146
- # Extract data from rows (first 5 rows)
147
- row_data = []
148
- for y in sorted(rows.keys())[:5]:
149
- # Sort elements by x-position (left to right)
150
- elements = sorted(rows[y], key=lambda e: e.x0)
151
-
152
- # Show the row
153
- row_box = form_region.create_region(
154
- min(e.x0 for e in elements),
155
- min(e.top for e in elements),
156
- max(e.x1 for e in elements),
157
- max(e.bottom for e in elements)
158
- )
159
- row_box.show(color=None, use_color_cycling=True)
160
-
161
- # Extract text from row
162
- row_text = [e.text for e in elements]
163
- row_data.append(row_text)
164
-
165
- # Show the extracted rows
166
- row_data
167
- ```
168
-
169
- ## Combining Different Extraction Techniques
170
-
171
- ```python
172
- # Combine label-based and pattern-based extraction
173
- all_fields = {}
174
-
175
- # 1. First get fields with explicit labels
176
- for label in labels:
177
- field_name = label.text.strip().rstrip(':')
178
- value = label.right(width=200).extract_text().strip()
179
- all_fields[field_name] = value
180
-
181
- # 2. Add date fields that we found with pattern matching
182
- for date_elem in dates:
183
- # Find the nearest label
184
- nearby_label = date_elem.nearest('text:contains(":")')
185
-
186
- if nearby_label:
187
- # Extract the label text
188
- label_text = nearby_label.text.strip().rstrip(':')
189
-
190
- # Get the date value
191
- date_value = re.search(date_pattern, date_elem.text).group(0)
192
-
193
- # Add to our results if not already present
194
- if label_text not in all_fields:
195
- all_fields[label_text] = date_value
196
-
197
- # Show all extracted fields
198
- all_fields
199
- ```
200
-
201
- Form field extraction enables you to automate data entry and document processing. By combining different techniques like label detection, spatial navigation, and pattern matching, you can handle a wide variety of form layouts.
@@ -1,54 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "7674e123",
6
- "metadata": {},
7
- "source": [
8
- "# Enhanced Table Processing\n",
9
- "\n",
10
- "Tables are a common way to present structured data in documents, but they can be challenging to extract correctly. This tutorial demonstrates advanced techniques for working with tables in natural-pdf.\n",
11
- "\n",
12
- "TK"
13
- ]
14
- },
15
- {
16
- "cell_type": "code",
17
- "execution_count": 1,
18
- "id": "08c7c5f0",
19
- "metadata": {
20
- "execution": {
21
- "iopub.execute_input": "2025-04-21T21:25:37.324499Z",
22
- "iopub.status.busy": "2025-04-21T21:25:37.324337Z",
23
- "iopub.status.idle": "2025-04-21T21:25:37.328739Z",
24
- "shell.execute_reply": "2025-04-21T21:25:37.328344Z"
25
- }
26
- },
27
- "outputs": [],
28
- "source": [
29
- "#%pip install \"natural-pdf[all]\""
30
- ]
31
- }
32
- ],
33
- "metadata": {
34
- "jupytext": {
35
- "cell_metadata_filter": "-all",
36
- "main_language": "python",
37
- "notebook_metadata_filter": "-all"
38
- },
39
- "language_info": {
40
- "codemirror_mode": {
41
- "name": "ipython",
42
- "version": 3
43
- },
44
- "file_extension": ".py",
45
- "mimetype": "text/x-python",
46
- "name": "python",
47
- "nbconvert_exporter": "python",
48
- "pygments_lexer": "ipython3",
49
- "version": "3.10.13"
50
- }
51
- },
52
- "nbformat": 4,
53
- "nbformat_minor": 5
54
- }
@@ -1,9 +0,0 @@
1
- # Enhanced Table Processing
2
-
3
- Tables are a common way to present structured data in documents, but they can be challenging to extract correctly. This tutorial demonstrates advanced techniques for working with tables in natural-pdf.
4
-
5
- TK
6
-
7
- ```python
8
- #%pip install "natural-pdf[all]"
9
- ```