natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1373 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +284 -187
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +54 -20
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +380 -38
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
|
|
15
15
|
from natural_pdf.elements.region import Region
|
16
16
|
|
17
17
|
|
18
|
+
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
    """
    Extract bounding box coordinates from any object that has bbox properties.

    Sources are tried in this order, and the first usable one wins:

    1. A ``bbox`` attribute holding a 4-item tuple or list.
    2. The individual ``x0``, ``top``, ``x1`` and ``bottom`` attributes.
    3. For dicts, the ``x0``, ``top``, ``x1`` and ``bottom`` keys.

    A source whose values cannot be converted to ``float`` is skipped instead
    of raising, so the remaining sources still get a chance.

    Args:
        obj: Object that might have bbox coordinates (Element, Region, etc.)

    Returns:
        Tuple of (x0, top, x1, bottom) as floats, or None if the object does
        not provide usable bbox coordinates.
    """
    # Try bbox property first (most common). Read it once: it may be a
    # computed property, and a single read avoids inconsistent values.
    bbox = getattr(obj, "bbox", None)
    if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
        try:
            x0, top, x1, bottom = (float(coord) for coord in bbox)
        except (ValueError, TypeError):
            # Malformed bbox (e.g. None or text inside it): fall through to
            # the other coordinate sources, like the branches below do.
            pass
        else:
            return (x0, top, x1, bottom)

    # Try individual coordinate properties
    if all(hasattr(obj, attr) for attr in ("x0", "top", "x1", "bottom")):
        try:
            return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
        except (ValueError, TypeError):
            pass

    # If object is a dict with bbox keys
    if isinstance(obj, dict):
        if all(key in obj for key in ("x0", "top", "x1", "bottom")):
            try:
                return (
                    float(obj["x0"]),
                    float(obj["top"]),
                    float(obj["x1"]),
                    float(obj["bottom"]),
                )
            except (ValueError, TypeError):
                pass

    return None
|
50
|
+
|
51
|
+
|
18
52
|
class DirectionalMixin:
|
19
53
|
"""
|
20
54
|
Mixin class providing directional methods for both Element and Region classes.
|
@@ -25,7 +59,7 @@ class DirectionalMixin:
|
|
25
59
|
direction: str,
|
26
60
|
size: Optional[float] = None,
|
27
61
|
cross_size: str = "full",
|
28
|
-
|
62
|
+
include_source: bool = False,
|
29
63
|
until: Optional[str] = None,
|
30
64
|
include_endpoint: bool = True,
|
31
65
|
**kwargs,
|
@@ -37,7 +71,7 @@ class DirectionalMixin:
|
|
37
71
|
direction: 'left', 'right', 'above', or 'below'
|
38
72
|
size: Size in the primary direction (width for horizontal, height for vertical)
|
39
73
|
cross_size: Size in the cross direction ('full' or 'element')
|
40
|
-
|
74
|
+
include_source: Whether to include this element/region's area in the result
|
41
75
|
until: Optional selector string to specify a boundary element
|
42
76
|
include_endpoint: Whether to include the boundary element found by 'until'
|
43
77
|
**kwargs: Additional parameters for the 'until' selector search
|
@@ -51,7 +85,7 @@ class DirectionalMixin:
|
|
51
85
|
is_positive = direction in ("right", "below") # right/below are positive directions
|
52
86
|
pixel_offset = 1 # Offset for excluding elements/endpoints
|
53
87
|
|
54
|
-
# 1. Determine initial boundaries based on direction and
|
88
|
+
# 1. Determine initial boundaries based on direction and include_source
|
55
89
|
if is_horizontal:
|
56
90
|
# Initial cross-boundaries (vertical)
|
57
91
|
y0 = 0 if cross_size == "full" else self.top
|
@@ -59,11 +93,11 @@ class DirectionalMixin:
|
|
59
93
|
|
60
94
|
# Initial primary boundaries (horizontal)
|
61
95
|
if is_positive: # right
|
62
|
-
x0_initial = self.x0 if
|
96
|
+
x0_initial = self.x0 if include_source else self.x1 + pixel_offset
|
63
97
|
x1_initial = self.x1 # This edge moves
|
64
98
|
else: # left
|
65
99
|
x0_initial = self.x0 # This edge moves
|
66
|
-
x1_initial = self.x1 if
|
100
|
+
x1_initial = self.x1 if include_source else self.x0 - pixel_offset
|
67
101
|
else: # Vertical
|
68
102
|
# Initial cross-boundaries (horizontal)
|
69
103
|
x0 = 0 if cross_size == "full" else self.x0
|
@@ -71,11 +105,11 @@ class DirectionalMixin:
|
|
71
105
|
|
72
106
|
# Initial primary boundaries (vertical)
|
73
107
|
if is_positive: # below
|
74
|
-
y0_initial = self.top if
|
108
|
+
y0_initial = self.top if include_source else self.bottom + pixel_offset
|
75
109
|
y1_initial = self.bottom # This edge moves
|
76
110
|
else: # above
|
77
111
|
y0_initial = self.top # This edge moves
|
78
|
-
y1_initial = self.bottom if
|
112
|
+
y1_initial = self.bottom if include_source else self.top - pixel_offset
|
79
113
|
|
80
114
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
81
115
|
if is_horizontal:
|
@@ -161,7 +195,7 @@ class DirectionalMixin:
|
|
161
195
|
|
162
196
|
result = Region(self.page, final_bbox)
|
163
197
|
result.source_element = self
|
164
|
-
result.includes_source =
|
198
|
+
result.includes_source = include_source
|
165
199
|
# Optionally store the boundary element if found
|
166
200
|
if target:
|
167
201
|
result.boundary_element = target
|
@@ -172,7 +206,7 @@ class DirectionalMixin:
|
|
172
206
|
self,
|
173
207
|
height: Optional[float] = None,
|
174
208
|
width: str = "full",
|
175
|
-
|
209
|
+
include_source: bool = False,
|
176
210
|
until: Optional[str] = None,
|
177
211
|
include_endpoint: bool = True,
|
178
212
|
**kwargs,
|
@@ -183,7 +217,7 @@ class DirectionalMixin:
|
|
183
217
|
Args:
|
184
218
|
height: Height of the region above, in points
|
185
219
|
width: Width mode - "full" for full page width or "element" for element width
|
186
|
-
|
220
|
+
include_source: Whether to include this element/region in the result (default: False)
|
187
221
|
until: Optional selector string to specify an upper boundary element
|
188
222
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
189
223
|
**kwargs: Additional parameters
|
@@ -195,7 +229,7 @@ class DirectionalMixin:
|
|
195
229
|
direction="above",
|
196
230
|
size=height,
|
197
231
|
cross_size=width,
|
198
|
-
|
232
|
+
include_source=include_source,
|
199
233
|
until=until,
|
200
234
|
include_endpoint=include_endpoint,
|
201
235
|
**kwargs,
|
@@ -205,7 +239,7 @@ class DirectionalMixin:
|
|
205
239
|
self,
|
206
240
|
height: Optional[float] = None,
|
207
241
|
width: str = "full",
|
208
|
-
|
242
|
+
include_source: bool = False,
|
209
243
|
until: Optional[str] = None,
|
210
244
|
include_endpoint: bool = True,
|
211
245
|
**kwargs,
|
@@ -216,7 +250,7 @@ class DirectionalMixin:
|
|
216
250
|
Args:
|
217
251
|
height: Height of the region below, in points
|
218
252
|
width: Width mode - "full" for full page width or "element" for element width
|
219
|
-
|
253
|
+
include_source: Whether to include this element/region in the result (default: False)
|
220
254
|
until: Optional selector string to specify a lower boundary element
|
221
255
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
222
256
|
**kwargs: Additional parameters
|
@@ -228,7 +262,7 @@ class DirectionalMixin:
|
|
228
262
|
direction="below",
|
229
263
|
size=height,
|
230
264
|
cross_size=width,
|
231
|
-
|
265
|
+
include_source=include_source,
|
232
266
|
until=until,
|
233
267
|
include_endpoint=include_endpoint,
|
234
268
|
**kwargs,
|
@@ -238,7 +272,7 @@ class DirectionalMixin:
|
|
238
272
|
self,
|
239
273
|
width: Optional[float] = None,
|
240
274
|
height: str = "full",
|
241
|
-
|
275
|
+
include_source: bool = False,
|
242
276
|
until: Optional[str] = None,
|
243
277
|
include_endpoint: bool = True,
|
244
278
|
**kwargs,
|
@@ -249,7 +283,7 @@ class DirectionalMixin:
|
|
249
283
|
Args:
|
250
284
|
width: Width of the region to the left, in points
|
251
285
|
height: Height mode - "full" for full page height or "element" for element height
|
252
|
-
|
286
|
+
include_source: Whether to include this element/region in the result (default: False)
|
253
287
|
until: Optional selector string to specify a left boundary element
|
254
288
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
255
289
|
**kwargs: Additional parameters
|
@@ -261,7 +295,7 @@ class DirectionalMixin:
|
|
261
295
|
direction="left",
|
262
296
|
size=width,
|
263
297
|
cross_size=height,
|
264
|
-
|
298
|
+
include_source=include_source,
|
265
299
|
until=until,
|
266
300
|
include_endpoint=include_endpoint,
|
267
301
|
**kwargs,
|
@@ -271,7 +305,7 @@ class DirectionalMixin:
|
|
271
305
|
self,
|
272
306
|
width: Optional[float] = None,
|
273
307
|
height: str = "full",
|
274
|
-
|
308
|
+
include_source: bool = False,
|
275
309
|
until: Optional[str] = None,
|
276
310
|
include_endpoint: bool = True,
|
277
311
|
**kwargs,
|
@@ -282,7 +316,7 @@ class DirectionalMixin:
|
|
282
316
|
Args:
|
283
317
|
width: Width of the region to the right, in points
|
284
318
|
height: Height mode - "full" for full page height or "element" for element height
|
285
|
-
|
319
|
+
include_source: Whether to include this element/region in the result (default: False)
|
286
320
|
until: Optional selector string to specify a right boundary element
|
287
321
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
288
322
|
**kwargs: Additional parameters
|
@@ -294,7 +328,7 @@ class DirectionalMixin:
|
|
294
328
|
direction="right",
|
295
329
|
size=width,
|
296
330
|
cross_size=height,
|
297
|
-
|
331
|
+
include_source=include_source,
|
298
332
|
until=until,
|
299
333
|
include_endpoint=include_endpoint,
|
300
334
|
**kwargs,
|
@@ -18,6 +18,7 @@ from typing import (
|
|
18
18
|
Union,
|
19
19
|
overload,
|
20
20
|
)
|
21
|
+
import hashlib
|
21
22
|
|
22
23
|
from pdfplumber.utils.geometry import objects_to_bbox
|
23
24
|
|
@@ -37,6 +38,8 @@ from natural_pdf.export.mixin import ExportMixin
|
|
37
38
|
from natural_pdf.ocr import OCROptions
|
38
39
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
39
40
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
41
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
42
|
+
from tqdm.auto import tqdm
|
40
43
|
|
41
44
|
# Potentially lazy imports for optional dependencies needed in save_pdf
|
42
45
|
try:
|
@@ -46,8 +49,6 @@ except ImportError:
|
|
46
49
|
|
47
50
|
try:
|
48
51
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
49
|
-
|
50
|
-
pass
|
51
52
|
except ImportError:
|
52
53
|
create_searchable_pdf = None
|
53
54
|
|
@@ -64,6 +65,7 @@ if TYPE_CHECKING:
|
|
64
65
|
from natural_pdf.core.page import Page
|
65
66
|
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
66
67
|
from natural_pdf.elements.region import Region
|
68
|
+
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
|
67
69
|
|
68
70
|
T = TypeVar("T")
|
69
71
|
P = TypeVar("P", bound="Page")
|
@@ -1586,13 +1588,162 @@ class ElementCollection(
|
|
1586
1588
|
|
1587
1589
|
return all_data
|
1588
1590
|
|
1591
|
+
def to_text_elements(
    self,
    text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
    source_label: str = "derived_from_region",
    object_type: str = "word",
    default_font_size: float = 10.0,
    default_font_name: str = "RegionContent",
    confidence: Optional[float] = None,
    add_to_page: bool = False  # Default is False
) -> "ElementCollection[TextElement]":
    """
    Converts each Region in this collection to a TextElement.

    Each Region is converted by calling its ``to_text_element`` method with
    the arguments given here. Elements that are not Region instances are
    skipped with a warning and produce no output element.

    Args:
        text_content_func: A callable that takes a Region and returns its text
                           (or None). It is forwarded unchanged as the
                           ``text_content`` argument of ``Region.to_text_element``.
                           If None, all created TextElements will
                           have text=None.
        source_label: The 'source' attribute for the new TextElements.
        object_type: The 'object_type' for the TextElement's data dict.
        default_font_size: Placeholder font size.
        default_font_name: Placeholder font name.
        confidence: Confidence score.
        add_to_page: If True (default is False), also adds the created
                     TextElements to their page's element manager. The target
                     page is the page of the first Region in the collection
                     that has a non-None ``page``. TextElements that belong to
                     a different page are created and returned but are not
                     added (a warning is logged).

    Returns:
        A new ElementCollection containing the created TextElement objects.
        Every created TextElement is included, whether or not it was added
        to a page. An empty collection is returned when this collection has
        no elements.
    """
    from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
    from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING

    new_text_elements: List["TextElement"] = []
    # Nothing to convert: return early with an empty collection.
    if not self.elements:
        return ElementCollection([])

    page_context_for_adding: Optional["Page"] = None
    if add_to_page:
        # Use the page of the first Region that has one as the single target
        # page for all additions.
        first_valid_region_with_page = next(
            (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
            None
        )
        if first_valid_region_with_page:
            page_context_for_adding = first_valid_region_with_page.page
        else:
            logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
            add_to_page = False # Disable adding if no valid page context can be determined

    for element in self.elements:
        if isinstance(element, Region):
            text_el = element.to_text_element(
                text_content=text_content_func,
                source_label=source_label,
                object_type=object_type,
                default_font_size=default_font_size,
                default_font_name=default_font_name,
                confidence=confidence
            )
            # Appended before the add-to-page checks, so the element is
            # returned even when it cannot be added to a page.
            new_text_elements.append(text_el)

            if add_to_page:
                if not hasattr(text_el, 'page') or text_el.page is None:
                    logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
                    continue

                if page_context_for_adding and text_el.page == page_context_for_adding:
                    if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
                        # Map the singular object type to the plural name
                        # passed as element_type; other values pass through.
                        add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
                        page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
                    else:
                        page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
                        logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
                elif page_context_for_adding and text_el.page != page_context_for_adding:
                    # Element lives on a different page than the target page:
                    # it is reported and not added anywhere.
                    current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
                    context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
                    logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
                                   f"not added as it's different from collection's inferred page context {context_page_num_str}.")
                elif not page_context_for_adding:
                    # NOTE(review): only reachable if the page object is falsy,
                    # since add_to_page is disabled above when no page context
                    # is found - confirm whether this branch is needed.
                    logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
        else:
            logger.warning(f"Skipping element {type(element)}, not a Region.")

    # NOTE: the counts logged below are the number of TextElements created,
    # which can exceed the number actually added to the page.
    if add_to_page and page_context_for_adding:
        page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
        logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
    elif add_to_page and not page_context_for_adding:
        logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
    else: # add_to_page is False
        logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")

    return ElementCollection(new_text_elements)
|
1682
|
+
|
1683
|
+
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
    """
    Trim visual whitespace from each region in the collection.

    Calls ``trim()`` on every element through ``self.apply`` and returns
    whatever ``apply`` produces for the trimmed results.

    Args:
        padding: Number of pixels to keep as padding after trimming (default: 1)
        threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
        resolution: Resolution for image rendering in DPI (default: 150)
        show_progress: Whether to show a progress bar for the trimming operation

    Returns:
        New ElementCollection with trimmed regions
    """
    # Collect the per-element options once; every element gets the same ones.
    trim_options = {
        "padding": padding,
        "threshold": threshold,
        "resolution": resolution,
    }
    return self.apply(
        lambda element: element.trim(**trim_options),
        show_progress=show_progress,
    )
|
1703
|
+
|
1704
|
+
def clip(
    self,
    obj: Optional[Any] = None,
    left: Optional[float] = None,
    top: Optional[float] = None,
    right: Optional[float] = None,
    bottom: Optional[float] = None,
) -> "ElementCollection":
    """
    Clip each element in the collection to the specified bounds.

    Every element's own ``clip()`` is called with the same arguments via
    ``self.apply``, and the result of ``apply`` is returned.

    Args:
        obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
        left: Optional left boundary (x0) to clip to
        top: Optional top boundary to clip to
        right: Optional right boundary (x1) to clip to
        bottom: Optional bottom boundary to clip to

    Returns:
        New ElementCollection containing the clipped elements

    Examples:
        # Clip each element to another region's bounds
        clipped_elements = collection.clip(container_region)

        # Clip each element to specific coordinates
        clipped_elements = collection.clip(left=100, right=400)

        # Mix object bounds with specific overrides
        clipped_elements = collection.clip(obj=container, bottom=page.height/2)
    """
    # The same bounds are forwarded, by keyword, to every element.
    bounds = {
        "obj": obj,
        "left": left,
        "top": top,
        "right": right,
        "bottom": bottom,
    }
    return self.apply(lambda element: element.clip(**bounds))
|
1741
|
+
|
1742
|
+
|
1743
|
+
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
1744
|
+
"""
|
1745
|
+
Represents a collection of Page objects, often from a single PDF document.
|
1746
|
+
Provides methods for batch operations on these pages.
|
1596
1747
|
"""
|
1597
1748
|
|
1598
1749
|
def __init__(self, pages: List[P]):
|
@@ -1921,7 +2072,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
1921
2072
|
end_elements=None,
|
1922
2073
|
new_section_on_page_break=False,
|
1923
2074
|
boundary_inclusion="both",
|
1924
|
-
) ->
|
2075
|
+
) -> "ElementCollection[Region]":
|
1925
2076
|
"""
|
1926
2077
|
Extract sections from a page collection based on start/end elements.
|
1927
2078
|
|
@@ -2214,7 +2365,7 @@ class PageCollection(Generic[P], ApplyMixin):
|
|
2214
2365
|
region.start_element = start_element
|
2215
2366
|
sections.append(region)
|
2216
2367
|
|
2217
|
-
return sections
|
2368
|
+
return ElementCollection(sections)
|
2218
2369
|
|
2219
2370
|
def _gather_analysis_data(
|
2220
2371
|
self,
|
natural_pdf/elements/line.py
CHANGED
@@ -28,6 +28,11 @@ class LineElement(Element):
|
|
28
28
|
"""
|
29
29
|
super().__init__(obj, page)
|
30
30
|
|
31
|
+
@property
def source(self) -> Optional[str]:
    """
    Source label of this line element (e.g., 'pdf', 'detected').

    Returns whatever ``self._obj.get("source")`` yields for the underlying
    object.
    """
    origin = self._obj.get("source")
    return origin
|
35
|
+
|
31
36
|
@property
|
32
37
|
def type(self) -> str:
|
33
38
|
"""Element type."""
|