natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,26 +1,28 @@
|
|
1
1
|
"""
|
2
2
|
Reading order utilities for natural-pdf.
|
3
3
|
"""
|
4
|
-
from typing import List, Dict, Any, Callable, Optional
|
5
4
|
|
5
|
+
from typing import Any, Callable, Dict, List, Optional
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
|
8
|
+
def establish_reading_order(
|
9
|
+
elements: List[Dict[str, Any]], algorithm: str = "basic"
|
10
|
+
) -> List[Dict[str, Any]]:
|
9
11
|
"""
|
10
12
|
Establish reading order for a collection of elements.
|
11
|
-
|
13
|
+
|
12
14
|
Args:
|
13
15
|
elements: List of elements to order
|
14
16
|
algorithm: Algorithm to use ('basic', 'column', 'complex')
|
15
|
-
|
17
|
+
|
16
18
|
Returns:
|
17
19
|
List of elements in reading order
|
18
20
|
"""
|
19
|
-
if algorithm ==
|
21
|
+
if algorithm == "basic":
|
20
22
|
return _basic_reading_order(elements)
|
21
|
-
elif algorithm ==
|
23
|
+
elif algorithm == "column":
|
22
24
|
return _column_reading_order(elements)
|
23
|
-
elif algorithm ==
|
25
|
+
elif algorithm == "complex":
|
24
26
|
return _complex_reading_order(elements)
|
25
27
|
else:
|
26
28
|
# Default to basic
|
@@ -30,55 +32,52 @@ def establish_reading_order(elements: List[Dict[str, Any]],
|
|
30
32
|
def _basic_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
31
33
|
"""
|
32
34
|
Basic top-to-bottom, left-to-right reading order.
|
33
|
-
|
35
|
+
|
34
36
|
Args:
|
35
37
|
elements: List of elements to order
|
36
|
-
|
38
|
+
|
37
39
|
Returns:
|
38
40
|
List of elements in reading order
|
39
41
|
"""
|
40
42
|
# Simple sort by y0 (top), then by x0 (left)
|
41
|
-
return sorted(elements, key=lambda e: (
|
42
|
-
e.get('top', e.get('y0', 0)),
|
43
|
-
e.get('x0', 0)
|
44
|
-
))
|
43
|
+
return sorted(elements, key=lambda e: (e.get("top", e.get("y0", 0)), e.get("x0", 0)))
|
45
44
|
|
46
45
|
|
47
|
-
def _column_reading_order(
|
48
|
-
|
49
|
-
|
46
|
+
def _column_reading_order(
|
47
|
+
elements: List[Dict[str, Any]], column_threshold: float = 0.2, x_tolerance: float = 10.0
|
48
|
+
) -> List[Dict[str, Any]]:
|
50
49
|
"""
|
51
50
|
Reading order that accounts for columns.
|
52
|
-
|
51
|
+
|
53
52
|
This is more complex as it needs to detect columns first,
|
54
53
|
then read each column in order.
|
55
|
-
|
54
|
+
|
56
55
|
Args:
|
57
56
|
elements: List of elements to order
|
58
57
|
column_threshold: Percentage overlap threshold for column detection (0.0 to 1.0)
|
59
58
|
x_tolerance: Horizontal tolerance for determining column edges
|
60
|
-
|
59
|
+
|
61
60
|
Returns:
|
62
61
|
List of elements in reading order
|
63
62
|
"""
|
64
63
|
if not elements:
|
65
64
|
return []
|
66
|
-
|
65
|
+
|
67
66
|
# 1. Group elements by line
|
68
67
|
lines = group_elements_by_line(elements)
|
69
|
-
|
68
|
+
|
70
69
|
# 2. For each line, find the x-coordinate ranges (potential column boundaries)
|
71
70
|
line_x_ranges = []
|
72
71
|
for line in lines:
|
73
72
|
for el in line:
|
74
|
-
x0 = el.get(
|
75
|
-
x1 = el.get(
|
73
|
+
x0 = el.get("x0", 0)
|
74
|
+
x1 = el.get("x1", 0)
|
76
75
|
line_x_ranges.append((x0, x1))
|
77
|
-
|
76
|
+
|
78
77
|
# If we don't have enough ranges to detect columns, just use basic ordering
|
79
78
|
if len(line_x_ranges) < 3:
|
80
79
|
return _basic_reading_order(elements)
|
81
|
-
|
80
|
+
|
82
81
|
# 3. Detect columns by clustering x-coordinate ranges
|
83
82
|
def overlaps(range1, range2, threshold=column_threshold):
|
84
83
|
"""Determine if two ranges overlap by more than threshold percentage."""
|
@@ -86,25 +85,25 @@ def _column_reading_order(elements: List[Dict[str, Any]],
|
|
86
85
|
overlap_start = max(range1[0], range2[0])
|
87
86
|
overlap_end = min(range1[1], range2[1])
|
88
87
|
overlap = max(0, overlap_end - overlap_start)
|
89
|
-
|
88
|
+
|
90
89
|
# Calculate lengths
|
91
90
|
len1 = range1[1] - range1[0]
|
92
91
|
len2 = range2[1] - range2[0]
|
93
|
-
|
92
|
+
|
94
93
|
# Calculate overlap as percentage of the shorter range
|
95
94
|
shorter_len = min(len1, len2)
|
96
95
|
if shorter_len == 0:
|
97
96
|
return False
|
98
|
-
|
97
|
+
|
99
98
|
return overlap / shorter_len >= threshold
|
100
|
-
|
99
|
+
|
101
100
|
# Cluster x-ranges into columns
|
102
101
|
columns = []
|
103
102
|
for x_range in line_x_ranges:
|
104
103
|
# Skip zero-width ranges
|
105
104
|
if x_range[1] - x_range[0] <= 0:
|
106
105
|
continue
|
107
|
-
|
106
|
+
|
108
107
|
# Try to find an existing column to add to
|
109
108
|
added = False
|
110
109
|
for col in columns:
|
@@ -112,68 +111,70 @@ def _column_reading_order(elements: List[Dict[str, Any]],
|
|
112
111
|
col.append(x_range)
|
113
112
|
added = True
|
114
113
|
break
|
115
|
-
|
114
|
+
|
116
115
|
# If not added to an existing column, create a new one
|
117
116
|
if not added:
|
118
117
|
columns.append([x_range])
|
119
|
-
|
118
|
+
|
120
119
|
# 4. Get column boundaries by averaging x-ranges in each column
|
121
120
|
column_bounds = []
|
122
121
|
for col in columns:
|
123
122
|
left = sum(r[0] for r in col) / len(col)
|
124
123
|
right = sum(r[1] for r in col) / len(col)
|
125
124
|
column_bounds.append((left, right))
|
126
|
-
|
125
|
+
|
127
126
|
# Sort columns by x-coordinate (left to right)
|
128
127
|
column_bounds.sort(key=lambda b: b[0])
|
129
|
-
|
128
|
+
|
130
129
|
# 5. Assign each element to a column
|
131
130
|
element_columns = {}
|
132
131
|
for el in elements:
|
133
132
|
# Get element x-coordinates
|
134
|
-
el_x0 = el.get(
|
135
|
-
el_x1 = el.get(
|
133
|
+
el_x0 = el.get("x0", 0)
|
134
|
+
el_x1 = el.get("x1", 0)
|
136
135
|
el_center = (el_x0 + el_x1) / 2
|
137
|
-
|
136
|
+
|
138
137
|
# Find the column this element belongs to
|
139
138
|
for i, (left, right) in enumerate(column_bounds):
|
140
139
|
# Extend bounds by tolerance
|
141
140
|
extended_left = left - x_tolerance
|
142
141
|
extended_right = right + x_tolerance
|
143
|
-
|
142
|
+
|
144
143
|
# Check if center point is within extended column bounds
|
145
144
|
if extended_left <= el_center <= extended_right:
|
146
145
|
element_columns[el] = i
|
147
146
|
break
|
148
147
|
else:
|
149
148
|
# If no column found, assign to nearest column
|
150
|
-
distances = [
|
151
|
-
|
149
|
+
distances = [
|
150
|
+
(i, min(abs(el_center - left), abs(el_center - right)))
|
151
|
+
for i, (left, right) in enumerate(column_bounds)
|
152
|
+
]
|
152
153
|
nearest_col = min(distances, key=lambda d: d[1])[0]
|
153
154
|
element_columns[el] = nearest_col
|
154
|
-
|
155
|
+
|
155
156
|
# 6. Sort elements by column, then by vertical position
|
156
157
|
sorted_elements = []
|
157
158
|
for col_idx, _ in enumerate(column_bounds):
|
158
159
|
# Get elements in this column
|
159
160
|
col_elements = [el for el in elements if element_columns.get(el) == col_idx]
|
160
161
|
# Sort by top coordinate
|
161
|
-
col_elements.sort(key=lambda e: e.get(
|
162
|
+
col_elements.sort(key=lambda e: e.get("top", e.get("y0", 0)))
|
162
163
|
# Add to final list
|
163
164
|
sorted_elements.extend(col_elements)
|
164
|
-
|
165
|
+
|
165
166
|
return sorted_elements
|
166
167
|
|
167
168
|
|
168
169
|
def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
169
170
|
"""
|
170
171
|
Complex reading order that accounts for various document structures.
|
171
|
-
|
172
|
+
|
172
173
|
This considers columns, text flow around images, tables, etc.
|
173
|
-
|
174
|
+
|
174
175
|
Args:
|
175
176
|
elements: List of elements to order
|
176
|
-
|
177
|
+
|
177
178
|
Returns:
|
178
179
|
List of elements in reading order
|
179
180
|
"""
|
@@ -182,31 +183,32 @@ def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any
|
|
182
183
|
return _column_reading_order(elements)
|
183
184
|
|
184
185
|
|
185
|
-
def group_elements_by_line(
|
186
|
-
|
186
|
+
def group_elements_by_line(
|
187
|
+
elements: List[Dict[str, Any]], tolerance: float = 3.0
|
188
|
+
) -> List[List[Dict[str, Any]]]:
|
187
189
|
"""
|
188
190
|
Group elements into lines based on vertical position.
|
189
|
-
|
191
|
+
|
190
192
|
Args:
|
191
193
|
elements: List of elements to group
|
192
194
|
tolerance: Maximum vertical distance for elements to be considered on the same line
|
193
|
-
|
195
|
+
|
194
196
|
Returns:
|
195
197
|
List of lists, where each sublist contains elements on the same line
|
196
198
|
"""
|
197
199
|
if not elements:
|
198
200
|
return []
|
199
|
-
|
201
|
+
|
200
202
|
# Sort by top coordinate
|
201
|
-
sorted_elements = sorted(elements, key=lambda e: e.get(
|
202
|
-
|
203
|
+
sorted_elements = sorted(elements, key=lambda e: e.get("top", e.get("y0", 0)))
|
204
|
+
|
203
205
|
lines = []
|
204
206
|
current_line = [sorted_elements[0]]
|
205
|
-
current_top = sorted_elements[0].get(
|
206
|
-
|
207
|
+
current_top = sorted_elements[0].get("top", sorted_elements[0].get("y0", 0))
|
208
|
+
|
207
209
|
for element in sorted_elements[1:]:
|
208
|
-
element_top = element.get(
|
209
|
-
|
210
|
+
element_top = element.get("top", element.get("y0", 0))
|
211
|
+
|
210
212
|
# If element is close enough to current line's top, add to current line
|
211
213
|
if abs(element_top - current_top) <= tolerance:
|
212
214
|
current_line.append(element)
|
@@ -215,13 +217,13 @@ def group_elements_by_line(elements: List[Dict[str, Any]],
|
|
215
217
|
lines.append(current_line)
|
216
218
|
current_line = [element]
|
217
219
|
current_top = element_top
|
218
|
-
|
220
|
+
|
219
221
|
# Add the last line
|
220
222
|
if current_line:
|
221
223
|
lines.append(current_line)
|
222
|
-
|
224
|
+
|
223
225
|
# Sort elements within each line by x0
|
224
226
|
for line in lines:
|
225
|
-
line.sort(key=lambda e: e.get(
|
226
|
-
|
227
|
-
return lines
|
227
|
+
line.sort(key=lambda e: e.get("x0", 0))
|
228
|
+
|
229
|
+
return lines
|
@@ -0,0 +1,195 @@
|
|
1
|
+
# natural_pdf/utils/text_extraction.py
|
2
|
+
import logging
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
4
|
+
|
5
|
+
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
6
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.elements.region import Region # Use type hint
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
def filter_chars_spatially(
|
15
|
+
char_dicts: List[Dict[str, Any]],
|
16
|
+
exclusion_regions: List["Region"],
|
17
|
+
target_region: Optional["Region"] = None,
|
18
|
+
debug: bool = False,
|
19
|
+
) -> List[Dict[str, Any]]:
|
20
|
+
"""
|
21
|
+
Filters a list of character dictionaries spatially based on exclusions
|
22
|
+
and an optional target region.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
char_dicts: List of character dictionaries to filter.
|
26
|
+
exclusion_regions: List of Region objects to exclude characters from.
|
27
|
+
target_region: Optional Region object. If provided, only characters within
|
28
|
+
this region (respecting polygons) are kept.
|
29
|
+
debug: Enable debug logging.
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
Filtered list of character dictionaries.
|
33
|
+
"""
|
34
|
+
if not char_dicts:
|
35
|
+
return []
|
36
|
+
|
37
|
+
initial_count = len(char_dicts)
|
38
|
+
filtered_chars = char_dicts
|
39
|
+
|
40
|
+
# 1. Filter by Target Region (if provided)
|
41
|
+
if target_region:
|
42
|
+
target_bbox = target_region.bbox
|
43
|
+
target_is_polygon = target_region.has_polygon # Check once
|
44
|
+
region_filtered_chars = []
|
45
|
+
for char_dict in filtered_chars:
|
46
|
+
# Ensure basic geometry keys exist before processing
|
47
|
+
if not all(k in char_dict for k in ["x0", "top", "x1", "bottom"]):
|
48
|
+
if debug:
|
49
|
+
logger.warning(
|
50
|
+
f"Skipping char due to missing geometry: {char_dict.get('text', '?')}"
|
51
|
+
)
|
52
|
+
continue
|
53
|
+
char_bbox = (char_dict["x0"], char_dict["top"], char_dict["x1"], char_dict["bottom"])
|
54
|
+
# BBox pre-filter first
|
55
|
+
if get_bbox_overlap(char_bbox, target_bbox) is None:
|
56
|
+
continue
|
57
|
+
# Precise check if needed
|
58
|
+
char_center_x = (char_dict["x0"] + char_dict["x1"]) / 2
|
59
|
+
char_center_y = (char_dict["top"] + char_dict["bottom"]) / 2
|
60
|
+
if target_is_polygon:
|
61
|
+
if target_region.is_point_inside(char_center_x, char_center_y):
|
62
|
+
region_filtered_chars.append(char_dict)
|
63
|
+
# else: # Optionally log discarded by polygon
|
64
|
+
# if debug: logger.debug(...)
|
65
|
+
else: # Rectangular region, bbox overlap was sufficient
|
66
|
+
region_filtered_chars.append(char_dict)
|
67
|
+
filtered_chars = region_filtered_chars
|
68
|
+
if debug:
|
69
|
+
logger.debug(
|
70
|
+
f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after target region filter."
|
71
|
+
)
|
72
|
+
if not filtered_chars:
|
73
|
+
return []
|
74
|
+
|
75
|
+
# 2. Filter by Exclusions (if any)
|
76
|
+
if exclusion_regions:
|
77
|
+
final_chars = []
|
78
|
+
# Only calculate union_bbox if there are exclusions AND chars remaining
|
79
|
+
union_bbox = merge_bboxes(excl.bbox for excl in exclusion_regions)
|
80
|
+
for char_dict in filtered_chars: # Process only chars within target
|
81
|
+
# Ensure basic geometry keys exist before processing
|
82
|
+
if not all(k in char_dict for k in ["x0", "top", "x1", "bottom"]):
|
83
|
+
# Already warned in target region filter if applicable
|
84
|
+
continue
|
85
|
+
char_bbox = (char_dict["x0"], char_dict["top"], char_dict["x1"], char_dict["bottom"])
|
86
|
+
# BBox pre-filter vs exclusion union
|
87
|
+
if get_bbox_overlap(char_bbox, union_bbox) is None:
|
88
|
+
final_chars.append(char_dict) # Cannot be excluded
|
89
|
+
continue
|
90
|
+
# Precise check against individual overlapping exclusions
|
91
|
+
is_excluded = False
|
92
|
+
char_center_x = (char_dict["x0"] + char_dict["x1"]) / 2
|
93
|
+
char_center_y = (char_dict["top"] + char_dict["bottom"]) / 2
|
94
|
+
for exclusion in exclusion_regions:
|
95
|
+
# Optional: Add bbox overlap check here too before point_inside
|
96
|
+
if get_bbox_overlap(char_bbox, exclusion.bbox) is not None:
|
97
|
+
if exclusion.is_point_inside(char_center_x, char_center_y):
|
98
|
+
is_excluded = True
|
99
|
+
if debug:
|
100
|
+
char_text = char_dict.get("text", "?")
|
101
|
+
log_msg = f" - Excluding char '{char_text}' at {char_bbox} due to overlap with exclusion {exclusion.bbox}"
|
102
|
+
logger.debug(log_msg)
|
103
|
+
break
|
104
|
+
if not is_excluded:
|
105
|
+
final_chars.append(char_dict)
|
106
|
+
filtered_chars = final_chars
|
107
|
+
if debug:
|
108
|
+
logger.debug(
|
109
|
+
f"filter_chars_spatially: {len(filtered_chars)}/{initial_count} chars remaining after exclusion filter."
|
110
|
+
)
|
111
|
+
if not filtered_chars:
|
112
|
+
return []
|
113
|
+
|
114
|
+
return filtered_chars
|
115
|
+
|
116
|
+
|
117
|
+
def generate_text_layout(
|
118
|
+
char_dicts: List[Dict[str, Any]],
|
119
|
+
layout_context_bbox: Tuple[float, float, float, float],
|
120
|
+
user_kwargs: Dict[str, Any],
|
121
|
+
) -> str:
|
122
|
+
"""
|
123
|
+
Takes a list of filtered character dictionaries and generates
|
124
|
+
text output using pdfplumber's layout engine.
|
125
|
+
|
126
|
+
Args:
|
127
|
+
char_dicts: The final list of character dictionaries to include.
|
128
|
+
layout_context_bbox: The bounding box (x0, top, x1, bottom) to use for
|
129
|
+
calculating default layout width/height/shifts.
|
130
|
+
user_kwargs: Dictionary of user-provided keyword arguments.
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
The formatted text string.
|
134
|
+
"""
|
135
|
+
if not char_dicts:
|
136
|
+
logger.debug("generate_text_layout: No characters provided.")
|
137
|
+
return ""
|
138
|
+
|
139
|
+
# Prepare layout kwargs, prioritizing user input
|
140
|
+
layout_kwargs = {}
|
141
|
+
allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
|
142
|
+
for key, value in user_kwargs.items():
|
143
|
+
if key in allowed_keys:
|
144
|
+
layout_kwargs[key] = value
|
145
|
+
|
146
|
+
# Default to layout=True unless explicitly False
|
147
|
+
use_layout = layout_kwargs.get("layout", True) # Default to layout if called
|
148
|
+
layout_kwargs["layout"] = use_layout
|
149
|
+
|
150
|
+
if use_layout:
|
151
|
+
ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox
|
152
|
+
ctx_width = ctx_x1 - ctx_x0
|
153
|
+
ctx_height = ctx_bottom - ctx_top
|
154
|
+
|
155
|
+
# Set layout defaults based on context_bbox if not overridden by user
|
156
|
+
if "layout_bbox" not in layout_kwargs:
|
157
|
+
layout_kwargs["layout_bbox"] = layout_context_bbox
|
158
|
+
# Only set default layout_width if neither width specifier is present
|
159
|
+
if "layout_width_chars" not in layout_kwargs and "layout_width" not in layout_kwargs:
|
160
|
+
layout_kwargs["layout_width"] = ctx_width
|
161
|
+
if "layout_height" not in layout_kwargs:
|
162
|
+
layout_kwargs["layout_height"] = ctx_height
|
163
|
+
# Adjust shift based on context's top-left corner
|
164
|
+
if "x_shift" not in layout_kwargs:
|
165
|
+
layout_kwargs["x_shift"] = ctx_x0
|
166
|
+
if "y_shift" not in layout_kwargs:
|
167
|
+
layout_kwargs["y_shift"] = ctx_top
|
168
|
+
|
169
|
+
logger.debug(
|
170
|
+
f"generate_text_layout: Calling chars_to_textmap with {len(char_dicts)} chars and kwargs: {layout_kwargs}"
|
171
|
+
)
|
172
|
+
try:
|
173
|
+
# Sort final list by reading order before passing to textmap
|
174
|
+
# TODO: Make sorting key dynamic based on layout_kwargs directions?
|
175
|
+
char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
|
176
|
+
textmap = chars_to_textmap(char_dicts, **layout_kwargs)
|
177
|
+
result = textmap.as_string
|
178
|
+
except Exception as e:
|
179
|
+
logger.error(
|
180
|
+
f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=True
|
181
|
+
)
|
182
|
+
logger.warning(
|
183
|
+
"generate_text_layout: Falling back to simple character join due to layout error."
|
184
|
+
)
|
185
|
+
# Ensure chars are sorted before fallback join
|
186
|
+
fallback_chars = sorted(char_dicts, key=lambda c: (c.get("top", 0), c.get("x0", 0)))
|
187
|
+
result = "".join(c.get("text", "") for c in fallback_chars)
|
188
|
+
else:
|
189
|
+
# Simple join if layout=False
|
190
|
+
logger.debug("generate_text_layout: Using simple join (layout=False).")
|
191
|
+
# Sort by document order for simple join as well
|
192
|
+
char_dicts.sort(key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0)))
|
193
|
+
result = "".join(c.get("text", "") for c in char_dicts)
|
194
|
+
|
195
|
+
return result
|