natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,346 @@
|
|
1
|
+
"""Centralized utilities for section extraction to avoid code duplication.
|
2
|
+
|
3
|
+
This module provides the core logic for get_sections() and get_section_between()
|
4
|
+
functionality that's used across Page, PDF, Region, and Flow classes.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import logging
|
8
|
+
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
|
9
|
+
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from natural_pdf.core.page import Page
|
12
|
+
from natural_pdf.elements.base import Element
|
13
|
+
from natural_pdf.elements.region import Region
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
def calculate_section_bounds(
    start_element: "Element",
    end_element: Optional["Element"],
    include_boundaries: str,
    orientation: str,
    parent_bounds: Tuple[float, float, float, float],
) -> Tuple[float, float, float, float]:
    """
    Calculate the bounding box for a section between two elements.

    This centralizes the logic for determining section boundaries based on
    the include_boundaries parameter and orientation.

    Args:
        start_element: Element marking the start of the section
        end_element: Element marking the end of the section, or None for an
            open-ended section that runs to the far edge of the parent
            (bottom for vertical sections, right edge for horizontal ones)
        include_boundaries: How to include boundary elements: 'start', 'end',
            'both', or 'none'. Any other value is treated like 'none'.
        orientation: 'vertical' or 'horizontal' - determines section direction.
            Any value other than 'vertical' is handled as horizontal.
        parent_bounds: The bounding box (x0, top, x1, bottom) of the parent region

    Returns:
        Tuple of (x0, top, x1, bottom) representing the section bounds
    """
    parent_x0, parent_top, parent_x1, parent_bottom = parent_bounds

    # Each boundary element is either kept inside the section or left out.
    include_start = include_boundaries in ("both", "start")
    include_end = include_boundaries in ("both", "end")

    if orientation == "vertical":
        # Vertical sections always span the full width of the parent.
        top = start_element.top if include_start else start_element.bottom
        if end_element is None:
            # No closing element: run to the bottom of the parent.
            bottom = parent_bottom
        elif include_end:
            bottom = end_element.bottom
        else:
            bottom = end_element.top  # Stop at the top of the end element
        return (parent_x0, top, parent_x1, bottom)

    # Horizontal sections always span the full height of the parent.
    x0 = start_element.x0 if include_start else start_element.x1
    if end_element is None:
        # No closing element: run to the right edge of the parent.
        x1 = parent_x1
    elif include_end:
        x1 = end_element.x1
    else:
        x1 = end_element.x0  # Stop at the left of the end element
    return (x0, parent_top, x1, parent_bottom)
|
90
|
+
|
91
|
+
|
92
|
+
def validate_section_bounds(bounds: Tuple[float, float, float, float], orientation: str) -> bool:
    """
    Report whether a section's bounds are usable (non-empty, not inverted).

    Only the axis the section runs along is inspected: top/bottom for
    'vertical' sections, x0/x1 for anything else (treated as horizontal).

    Args:
        bounds: The bounding box (x0, top, x1, bottom) to validate
        orientation: 'vertical' or 'horizontal' - determines which dimension to check

    Returns:
        True if bounds are valid, False otherwise
    """
    x0, top, x1, bottom = bounds

    if orientation != "vertical":
        # Horizontal: the left edge must lie strictly before the right edge.
        if x0 >= x1:
            logger.debug(f"Invalid horizontal section boundaries: x0={x0} >= x1={x1}")
            return False
        return True

    # Vertical: the top edge must lie strictly above the bottom edge.
    if top >= bottom:
        logger.debug(f"Invalid vertical section boundaries: top={top} >= bottom={bottom}")
        return False
    return True
|
115
|
+
|
116
|
+
|
117
|
+
def pair_boundary_elements(
    start_elements: List["Element"],
    end_elements: Optional[List["Element"]],
    orientation: str = "vertical",
) -> List[Tuple["Element", Optional["Element"]]]:
    """
    Pair up start and end boundary elements for section extraction.

    Both lists are first sorted into reading order for the orientation
    ((top, x0) for 'vertical', (x0, top) otherwise). Then:

    * With no end elements, each start is paired with the next start and the
      last start is paired with None.
    * With end elements, each start is paired with the first not-yet-used end
      that lies after it; a start with no such end is paired with None.

    Used ends are tracked by position in the sorted list rather than by
    hashing or equality, so elements need not be hashable and two distinct
    elements that compare equal are not mistaken for one another. A match is
    detected by position too, so an element that evaluates as falsy is still
    paired.

    Args:
        start_elements: List of elements marking section starts
        end_elements: Optional list of elements marking section ends
        orientation: 'vertical' or 'horizontal' - affects element ordering

    Returns:
        List of (start_element, end_element) tuples
    """
    if not start_elements:
        return []

    vertical = orientation == "vertical"

    def reading_order(elem):
        # Primary axis follows the section direction; the other axis breaks ties.
        return (elem.top, elem.x0) if vertical else (elem.x0, elem.top)

    def comes_after(end, start):
        # An end qualifies if it begins past the start's far edge, or exactly
        # at that edge without preceding the start on the secondary axis.
        if vertical:
            return end.top > start.bottom or (end.top == start.bottom and end.x0 >= start.x0)
        return end.x0 > start.x1 or (end.x0 == start.x1 and end.top >= start.top)

    starts = sorted(start_elements, key=reading_order)
    ends = sorted(end_elements, key=reading_order) if end_elements else []

    if not ends:
        # No end elements - each start runs until the next start; the last
        # start has no partner.
        pairs = list(zip(starts, starts[1:]))
        pairs.append((starts[-1], None))
        return pairs

    consumed = [False] * len(ends)
    pairs = []
    for start in starts:
        matching_end = None
        for position, end in enumerate(ends):
            if not consumed[position] and comes_after(end, start):
                consumed[position] = True
                matching_end = end
                break
        pairs.append((start, matching_end))

    return pairs
|
187
|
+
|
188
|
+
|
189
|
+
def process_selector_to_elements(
    selector_or_elements: Union[str, List["Element"], "Element", None],
    search_context: Any,  # Can be Page, Region, Flow, etc.
    find_method_name: str = "find_all",
) -> List["Element"]:
    """
    Process a selector string or element list into a normalized list of elements.

    The returned list is always a new list object. Results taken from a
    collection's ``elements`` attribute or from a list returned by the search
    method are copied, so callers can sort or mutate the result without
    altering the collection it came from.

    Args:
        selector_or_elements: Selector string, element, list of elements, or None
        search_context: Object with find_all method (Page, Region, etc.)
        find_method_name: Name of the method to call for searching (default: "find_all")

    Returns:
        List of elements (empty list if None or no matches)
    """
    if selector_or_elements is None:
        return []

    if isinstance(selector_or_elements, str):
        # It's a selector string - search for matching elements
        if not hasattr(search_context, find_method_name):
            logger.warning(f"Search context {type(search_context)} lacks {find_method_name} method")
            return []

        result = getattr(search_context, find_method_name)(selector_or_elements)
        if hasattr(result, "elements"):
            found = result.elements
            return list(found) if found is not None else []
        if isinstance(result, list):
            return list(result)
        return []

    # Handle single element (duck typing: anything exposing a bbox)
    if hasattr(selector_or_elements, "bbox"):
        return [selector_or_elements]

    # Handle ElementCollection or similar
    if hasattr(selector_or_elements, "elements"):
        held = selector_or_elements.elements
        return list(held) if held is not None else []

    # Handle list/iterable
    if hasattr(selector_or_elements, "__iter__"):
        return list(selector_or_elements)

    return []
|
235
|
+
|
236
|
+
|
237
|
+
def extract_sections_from_region(
    region: "Region",
    start_elements: Union[str, List["Element"], None],
    end_elements: Union[str, List["Element"], None] = None,
    include_boundaries: str = "both",
    orientation: str = "vertical",
    get_section_between_func: Optional[Any] = None,
) -> List["Region"]:
    """
    Core implementation of get_sections() that can be reused across classes.

    This implements the full logic for extracting multiple sections from a region
    based on start/end boundary elements. Boundaries are ordered by the position
    of their element among the region's elements (sorted by (top, x0) for
    'vertical', (x0, top) otherwise); boundary elements that are not among the
    region's elements are ignored.

    Args:
        region: The region to extract sections from
        start_elements: Elements or selector marking section starts
        end_elements: Optional elements or selector marking section ends
        include_boundaries: How to include boundary elements
        orientation: Section orientation ('vertical' or 'horizontal')
        get_section_between_func: Optional custom function to create sections;
            called as func(start, end, include_boundaries, orientation), where
            end is None for a trailing open-ended section. Defaults to
            region.get_section_between.

    Returns:
        List of Region objects representing the sections
    """
    # Process selectors to get element lists
    start_elements = process_selector_to_elements(start_elements, region)
    end_elements = process_selector_to_elements(end_elements, region) if end_elements else []

    # Validate inputs
    if not start_elements:
        logger.debug("No start elements found for section extraction")
        return []

    all_elements = region.get_elements()
    if not all_elements:
        return []

    if orientation == "vertical":
        def position_key(e):
            return (e.top, e.x0)
    else:
        def position_key(e):
            return (e.x0, e.top)

    # Sort a copy: the list handed back by the region may be shared with the
    # region itself, so it must not be reordered in place.
    ordered_elements = sorted(all_elements, key=position_key)

    # Map each element to its position in document order
    element_to_index = {el: i for i, el in enumerate(ordered_elements)}

    # Collect boundaries (starts first, then ends) with their document index.
    boundaries = []
    for boundary_type, candidates in (("start", start_elements), ("end", end_elements)):
        for elem in candidates:
            idx = element_to_index.get(elem)
            if idx is not None:
                boundaries.append({"index": idx, "element": elem, "type": boundary_type})

    # Stable sort: at an equal index a start boundary stays ahead of an end one.
    boundaries.sort(key=lambda b: b["index"])

    sections = []
    current_start = None
    section_func = get_section_between_func or region.get_section_between

    for boundary in boundaries:
        if boundary["type"] == "start":
            if current_start is None:
                # Start a new section
                current_start = boundary
            elif not end_elements:
                # No end elements specified - this start also closes the
                # section opened by the previous start, then opens the next.
                section = section_func(
                    current_start["element"], boundary["element"], include_boundaries, orientation
                )
                sections.append(section)
                current_start = boundary
            # Otherwise an end is still awaited; the extra start is skipped.

        elif boundary["type"] == "end" and current_start is not None:
            # Create section from current start to this end
            section = section_func(
                current_start["element"], boundary["element"], include_boundaries, orientation
            )
            sections.append(section)
            current_start = None

    # An unclosed start yields a final open-ended section; passing None as the
    # end leaves it to the section function to extend to the region boundary.
    if current_start is not None:
        section = section_func(current_start["element"], None, include_boundaries, orientation)
        sections.append(section)

    return sections
|
@@ -0,0 +1,169 @@
|
|
1
|
+
"""Spatial utilities for consistent element-region relationships.
|
2
|
+
|
3
|
+
This module centralizes the logic for determining whether elements belong to regions,
|
4
|
+
ensuring consistent behavior across Region, Page, and Flow components.
|
5
|
+
|
6
|
+
The default strategy is 'center' - an element belongs to a region if its center
|
7
|
+
point falls within that region. This prevents double-counting of elements at
|
8
|
+
boundaries and provides predictable behavior for operations like get_sections()
|
9
|
+
with include_boundaries='none'.
|
10
|
+
|
11
|
+
Example:
|
12
|
+
from natural_pdf.utils.spatial import is_element_in_region
|
13
|
+
|
14
|
+
# Check if element is in region using center-based logic (default)
|
15
|
+
if is_element_in_region(element, region):
|
16
|
+
print("Element is in region")
|
17
|
+
|
18
|
+
# Use different strategies
|
19
|
+
if is_element_in_region(element, region, strategy="intersects"):
|
20
|
+
print("Element overlaps with region")
|
21
|
+
"""
|
22
|
+
|
23
|
+
import logging
|
24
|
+
from typing import TYPE_CHECKING, Literal, Optional
|
25
|
+
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
from natural_pdf.elements.base import Element
|
28
|
+
from natural_pdf.elements.region import Region
|
29
|
+
|
30
|
+
logger = logging.getLogger(__name__)
|
31
|
+
|
32
|
+
# Element inclusion strategies
|
33
|
+
InclusionStrategy = Literal["center", "intersects", "contains"]


def is_element_in_region(
    element: "Element",
    region: "Region",
    strategy: InclusionStrategy = "center",
    check_page: bool = True
) -> bool:
    """
    Decide whether an element belongs to a region under a given strategy.

    Shared by Region, Page, and Flow so that membership is judged the same
    way everywhere. For each strategy the region's own method is used when
    it exists; otherwise a plain bounding-box comparison is applied.

    Args:
        element: The element to check
        region: The region to check against
        strategy: The inclusion strategy to use:
            - "center": Element belongs if its center point is inside (default)
            - "intersects": Element belongs if any part overlaps
            - "contains": Element belongs only if fully contained
        check_page: Whether to verify element and region are on the same page

    Returns:
        bool: True if element is in region according to the strategy

    Raises:
        ValueError: If strategy is not one of the known strategies (only
            reached once the bbox and page checks have passed)
    """
    # Both objects need a usable bbox before anything can be compared.
    if not hasattr(element, "bbox") or not element.bbox:
        logger.debug(f"Element lacks bbox attributes: {element}")
        return False

    if not hasattr(region, "bbox") or not region.bbox:
        logger.debug(f"Region lacks bbox attributes: {region}")
        return False

    # Optionally require both to expose a page and for the pages to match.
    if check_page:
        both_have_page = hasattr(element, "page") and hasattr(region, "page")
        if not both_have_page or element.page != region.page:
            return False

    if strategy == "center":
        if hasattr(region, "is_element_center_inside"):
            return region.is_element_center_inside(element)

        # Fallback: compute the element's midpoint ourselves.
        elem_center_x = (element.x0 + element.x1) / 2
        elem_center_y = (element.top + element.bottom) / 2

        if hasattr(region, "is_point_inside"):
            return region.is_point_inside(elem_center_x, elem_center_y)

        # Last resort: inclusive bounds check against the region's edges.
        return (region.x0 <= elem_center_x <= region.x1 and
                region.top <= elem_center_y <= region.bottom)

    if strategy == "intersects":
        if hasattr(region, "intersects"):
            return region.intersects(element)

        # Boxes overlap unless one lies entirely to one side of the other.
        return not (element.x1 < region.x0 or
                    element.x0 > region.x1 or
                    element.bottom < region.top or
                    element.top > region.bottom)

    if strategy == "contains":
        if hasattr(region, "contains"):
            return region.contains(element)

        # Every element edge must lie within the region's edges.
        return (region.x0 <= element.x0 and
                element.x1 <= region.x1 and
                region.top <= element.top and
                element.bottom <= region.bottom)

    raise ValueError(f"Unknown inclusion strategy: {strategy}")
|
118
|
+
|
119
|
+
|
120
|
+
def get_inclusion_strategy() -> InclusionStrategy:
    """
    Return the library-wide inclusion strategy.

    The value is currently fixed. It is kept behind a function so that it
    could later be read from settings or an environment variable (for
    example NATURAL_PDF_INCLUSION_STRATEGY) without changing callers.

    Returns:
        The current inclusion strategy (default: "center")
    """
    # Fixed default for now; no settings or environment lookup is performed.
    default_strategy: InclusionStrategy = "center"
    return default_strategy
|
133
|
+
|
134
|
+
|
135
|
+
def calculate_element_overlap_percentage(
    element: "Element",
    region: "Region"
) -> float:
    """
    Compute the share of an element's area that lies inside a region.

    Args:
        element: The element to check
        region: The region to check against

    Returns:
        float: Fraction of the element's area covered by the region, from
        0.0 (no overlap, missing bbox, or zero-area element) to 1.0 (the
        element lies entirely within the region)
    """
    if not (hasattr(element, "bbox") and hasattr(region, "bbox")):
        return 0.0

    # Edges of the rectangle shared by both boxes (if any).
    left = max(element.x0, region.x0)
    upper = max(element.top, region.top)
    right = min(element.x1, region.x1)
    lower = min(element.bottom, region.bottom)

    # Boxes that merely touch, or do not meet at all, share no area.
    if right <= left or lower <= upper:
        return 0.0

    element_area = (element.x1 - element.x0) * (element.bottom - element.top)
    if element_area == 0:
        return 0.0

    shared_area = (right - left) * (lower - upper)
    return shared_area / element_area
|
@@ -1,8 +1,8 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=N9ubwsFpmPj7WHA6Uewgn6IbmU2r0BeUGIdIhmTl6nw,4701
|
2
2
|
natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
|
3
3
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
4
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
|
-
natural_pdf/analyzers/guides.py,sha256=
|
5
|
+
natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
|
6
6
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
|
7
7
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
8
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
@@ -25,27 +25,27 @@ natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGv
|
|
25
25
|
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
26
26
|
natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
+
natural_pdf/core/element_manager.py,sha256=619R97OtMd7uhaax7fZNJmhy9GxSs9HCNP4OzGgP828,55882
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
|
30
|
-
natural_pdf/core/page.py,sha256
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=-0OaIoXz0zjT_jnPjjI2jpb8vvNKh-1W56auA5UBhTA,158791
|
31
|
+
natural_pdf/core/page_collection.py,sha256=bLZ3TqTQbmP3oYrbfEi7HUoPMbcGplEtUMZ3Z1y7fuw,66728
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
33
|
+
natural_pdf/core/pdf.py,sha256=i8dYCimL_k5FV6BmPI1a2Dk7XZfwLP8TziXr2n3O_fI,105639
|
34
34
|
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
35
|
natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
|
36
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
|
-
natural_pdf/describe/base.py,sha256=
|
37
|
+
natural_pdf/describe/base.py,sha256=M4TGXR8ppTvznTnA1ZDgMQMkDpgu1pwGMNaOcgHf2iY,20154
|
38
38
|
natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=YYdoss63yv3IzQeuHbNypo7VLz2UJDFK5b6lqQe5tR8,76090
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=dlKoIaqmK_pC_cEcTX9LA2bNbZmc8iXcTTDfpHDlyUM,139812
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
48
|
-
natural_pdf/elements/text.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=qJ86iToSjrCUjVrEbO0M0S1nTuZDW9tpI4jF9T5xJKs,168777
|
48
|
+
natural_pdf/elements/text.py,sha256=Jo4gnrsJe1PStdoWF2Bt8RSeSmOcfA9DxvMJl7EoAmI,21344
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
51
51
|
natural_pdf/exporters/base.py,sha256=379sioW_hbkGb21sEVuJhbkkDO5MFsFtTUNO5TgG2YU,2101
|
@@ -62,9 +62,9 @@ natural_pdf/extraction/mixin.py,sha256=dBcp96R8zMQqaRHiB8vpyad8GR89gv5RPXlr8Mt0a
|
|
62
62
|
natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
|
63
63
|
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
64
64
|
natural_pdf/flows/collections.py,sha256=ErkHWdX6W_y1SjkcA_bGM0uUYRGPWWpRkHip6LHpej0,25740
|
65
|
-
natural_pdf/flows/element.py,sha256=
|
66
|
-
natural_pdf/flows/flow.py,sha256=
|
67
|
-
natural_pdf/flows/region.py,sha256=
|
65
|
+
natural_pdf/flows/element.py,sha256=rDfWICK2gXBMXiqX8D_l7866dkQwgAlJMkCFAFoz6xM,25044
|
66
|
+
natural_pdf/flows/flow.py,sha256=MEls08CtkVox41du0wvkL3u11CAYzidQ6WxN1-vthUs,70591
|
67
|
+
natural_pdf/flows/region.py,sha256=HMk4xfYJiKgER2KzRIcmXb1Vfp9amnyy0ay8YrLtV8w,55362
|
68
68
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
69
69
|
natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
|
70
70
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
@@ -85,7 +85,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
85
85
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
86
86
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
87
87
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
88
|
-
natural_pdf/selectors/parser.py,sha256=
|
88
|
+
natural_pdf/selectors/parser.py,sha256=HbPgmtXXA4lRSAVkCzw6vpCi3oh66e-53yUEPhYLGX8,46909
|
89
89
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
90
90
|
natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
|
91
91
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -98,7 +98,10 @@ natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2b
|
|
98
98
|
natural_pdf/utils/layout.py,sha256=tJRRzwUVP0EeqqbGzr9yOuE5qFvhjZ9A44BuItmKGaU,753
|
99
99
|
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
100
100
|
natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-o,22444
|
101
|
+
natural_pdf/utils/pdfminer_patches.py,sha256=Ob81OMoNUGMUIy9nMw3deSQ_Z6cQmhbRlHUC3EHw2jk,4201
|
101
102
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
103
|
+
natural_pdf/utils/sections.py,sha256=HZX7829-fquKgIF7vUN2tL10-aXckEaM25g_2VcgWU4,12941
|
104
|
+
natural_pdf/utils/spatial.py,sha256=JOH2LHnF5WBDcjNQsHQdj458zwUgKtSWW7Tj0motn70,5968
|
102
105
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
103
106
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
104
107
|
natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
|
@@ -108,7 +111,7 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
|
|
108
111
|
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
109
112
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
110
113
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
111
|
-
natural_pdf-0.2.
|
114
|
+
natural_pdf-0.2.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
112
115
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
113
116
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
114
117
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -145,8 +148,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
145
148
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
146
149
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
147
150
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
148
|
-
natural_pdf-0.2.
|
149
|
-
natural_pdf-0.2.
|
150
|
-
natural_pdf-0.2.
|
151
|
-
natural_pdf-0.2.
|
152
|
-
natural_pdf-0.2.
|
151
|
+
natural_pdf-0.2.17.dist-info/METADATA,sha256=8K5PCwh_OuI8vkWRLChHeT-LuEd0sRmigkRm55ZNeDo,6960
|
152
|
+
natural_pdf-0.2.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
153
|
+
natural_pdf-0.2.17.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
154
|
+
natural_pdf-0.2.17.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
155
|
+
natural_pdf-0.2.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|