natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
15
15
  from natural_pdf.elements.region import Region
16
16
 
17
17
 
18
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
    """
    Extract bounding box coordinates from any object that has bbox properties.

    Three sources are tried in order; the first one that yields four values
    convertible to ``float`` wins:

    1. a ``bbox`` attribute holding a 4-item tuple or list,
    2. the individual ``x0``, ``top``, ``x1`` and ``bottom`` attributes,
    3. the ``x0``, ``top``, ``x1`` and ``bottom`` keys when ``obj`` is a dict.

    A source whose values cannot be converted to ``float`` is skipped and the
    next source is tried, so a malformed ``bbox`` never raises.

    Args:
        obj: Object that might have bbox coordinates (Element, Region, etc.)

    Returns:
        Tuple of (x0, top, x1, bottom) as floats, or None if object doesn't
        have usable bbox properties
    """
    coord_names = ("x0", "top", "x1", "bottom")

    def _as_floats(values):
        # Convert four raw values to floats; None signals "not usable".
        try:
            x0, top, x1, bottom = (float(value) for value in values)
        except (ValueError, TypeError):
            return None
        return (x0, top, x1, bottom)

    # Try bbox property first (most common)
    bbox = getattr(obj, "bbox", None)
    if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
        coords = _as_floats(bbox)
        if coords is not None:
            return coords

    # Try individual coordinate properties
    if all(hasattr(obj, name) for name in coord_names):
        coords = _as_floats(getattr(obj, name) for name in coord_names)
        if coords is not None:
            return coords

    # If object is a dict with bbox keys
    if isinstance(obj, dict) and all(name in obj for name in coord_names):
        coords = _as_floats(obj[name] for name in coord_names)
        if coords is not None:
            return coords

    return None
50
+
51
+
18
52
  class DirectionalMixin:
19
53
  """
20
54
  Mixin class providing directional methods for both Element and Region classes.
@@ -25,7 +59,7 @@ class DirectionalMixin:
25
59
  direction: str,
26
60
  size: Optional[float] = None,
27
61
  cross_size: str = "full",
28
- include_element: bool = False,
62
+ include_source: bool = False,
29
63
  until: Optional[str] = None,
30
64
  include_endpoint: bool = True,
31
65
  **kwargs,
@@ -37,7 +71,7 @@ class DirectionalMixin:
37
71
  direction: 'left', 'right', 'above', or 'below'
38
72
  size: Size in the primary direction (width for horizontal, height for vertical)
39
73
  cross_size: Size in the cross direction ('full' or 'element')
40
- include_element: Whether to include this element/region's area in the result
74
+ include_source: Whether to include this element/region's area in the result
41
75
  until: Optional selector string to specify a boundary element
42
76
  include_endpoint: Whether to include the boundary element found by 'until'
43
77
  **kwargs: Additional parameters for the 'until' selector search
@@ -51,7 +85,7 @@ class DirectionalMixin:
51
85
  is_positive = direction in ("right", "below") # right/below are positive directions
52
86
  pixel_offset = 1 # Offset for excluding elements/endpoints
53
87
 
54
- # 1. Determine initial boundaries based on direction and include_element
88
+ # 1. Determine initial boundaries based on direction and include_source
55
89
  if is_horizontal:
56
90
  # Initial cross-boundaries (vertical)
57
91
  y0 = 0 if cross_size == "full" else self.top
@@ -59,11 +93,11 @@ class DirectionalMixin:
59
93
 
60
94
  # Initial primary boundaries (horizontal)
61
95
  if is_positive: # right
62
- x0_initial = self.x0 if include_element else self.x1 + pixel_offset
96
+ x0_initial = self.x0 if include_source else self.x1 + pixel_offset
63
97
  x1_initial = self.x1 # This edge moves
64
98
  else: # left
65
99
  x0_initial = self.x0 # This edge moves
66
- x1_initial = self.x1 if include_element else self.x0 - pixel_offset
100
+ x1_initial = self.x1 if include_source else self.x0 - pixel_offset
67
101
  else: # Vertical
68
102
  # Initial cross-boundaries (horizontal)
69
103
  x0 = 0 if cross_size == "full" else self.x0
@@ -71,11 +105,11 @@ class DirectionalMixin:
71
105
 
72
106
  # Initial primary boundaries (vertical)
73
107
  if is_positive: # below
74
- y0_initial = self.top if include_element else self.bottom + pixel_offset
108
+ y0_initial = self.top if include_source else self.bottom + pixel_offset
75
109
  y1_initial = self.bottom # This edge moves
76
110
  else: # above
77
111
  y0_initial = self.top # This edge moves
78
- y1_initial = self.bottom if include_element else self.top - pixel_offset
112
+ y1_initial = self.bottom if include_source else self.top - pixel_offset
79
113
 
80
114
  # 2. Calculate the final primary boundary, considering 'size' or page limits
81
115
  if is_horizontal:
@@ -161,7 +195,7 @@ class DirectionalMixin:
161
195
 
162
196
  result = Region(self.page, final_bbox)
163
197
  result.source_element = self
164
- result.includes_source = include_element
198
+ result.includes_source = include_source
165
199
  # Optionally store the boundary element if found
166
200
  if target:
167
201
  result.boundary_element = target
@@ -172,7 +206,7 @@ class DirectionalMixin:
172
206
  self,
173
207
  height: Optional[float] = None,
174
208
  width: str = "full",
175
- include_element: bool = False,
209
+ include_source: bool = False,
176
210
  until: Optional[str] = None,
177
211
  include_endpoint: bool = True,
178
212
  **kwargs,
@@ -183,7 +217,7 @@ class DirectionalMixin:
183
217
  Args:
184
218
  height: Height of the region above, in points
185
219
  width: Width mode - "full" for full page width or "element" for element width
186
- include_element: Whether to include this element/region in the result (default: False)
220
+ include_source: Whether to include this element/region in the result (default: False)
187
221
  until: Optional selector string to specify an upper boundary element
188
222
  include_endpoint: Whether to include the boundary element in the region (default: True)
189
223
  **kwargs: Additional parameters
@@ -195,7 +229,7 @@ class DirectionalMixin:
195
229
  direction="above",
196
230
  size=height,
197
231
  cross_size=width,
198
- include_element=include_element,
232
+ include_source=include_source,
199
233
  until=until,
200
234
  include_endpoint=include_endpoint,
201
235
  **kwargs,
@@ -205,7 +239,7 @@ class DirectionalMixin:
205
239
  self,
206
240
  height: Optional[float] = None,
207
241
  width: str = "full",
208
- include_element: bool = False,
242
+ include_source: bool = False,
209
243
  until: Optional[str] = None,
210
244
  include_endpoint: bool = True,
211
245
  **kwargs,
@@ -216,7 +250,7 @@ class DirectionalMixin:
216
250
  Args:
217
251
  height: Height of the region below, in points
218
252
  width: Width mode - "full" for full page width or "element" for element width
219
- include_element: Whether to include this element/region in the result (default: False)
253
+ include_source: Whether to include this element/region in the result (default: False)
220
254
  until: Optional selector string to specify a lower boundary element
221
255
  include_endpoint: Whether to include the boundary element in the region (default: True)
222
256
  **kwargs: Additional parameters
@@ -228,7 +262,7 @@ class DirectionalMixin:
228
262
  direction="below",
229
263
  size=height,
230
264
  cross_size=width,
231
- include_element=include_element,
265
+ include_source=include_source,
232
266
  until=until,
233
267
  include_endpoint=include_endpoint,
234
268
  **kwargs,
@@ -238,7 +272,7 @@ class DirectionalMixin:
238
272
  self,
239
273
  width: Optional[float] = None,
240
274
  height: str = "full",
241
- include_element: bool = False,
275
+ include_source: bool = False,
242
276
  until: Optional[str] = None,
243
277
  include_endpoint: bool = True,
244
278
  **kwargs,
@@ -249,7 +283,7 @@ class DirectionalMixin:
249
283
  Args:
250
284
  width: Width of the region to the left, in points
251
285
  height: Height mode - "full" for full page height or "element" for element height
252
- include_element: Whether to include this element/region in the result (default: False)
286
+ include_source: Whether to include this element/region in the result (default: False)
253
287
  until: Optional selector string to specify a left boundary element
254
288
  include_endpoint: Whether to include the boundary element in the region (default: True)
255
289
  **kwargs: Additional parameters
@@ -261,7 +295,7 @@ class DirectionalMixin:
261
295
  direction="left",
262
296
  size=width,
263
297
  cross_size=height,
264
- include_element=include_element,
298
+ include_source=include_source,
265
299
  until=until,
266
300
  include_endpoint=include_endpoint,
267
301
  **kwargs,
@@ -271,7 +305,7 @@ class DirectionalMixin:
271
305
  self,
272
306
  width: Optional[float] = None,
273
307
  height: str = "full",
274
- include_element: bool = False,
308
+ include_source: bool = False,
275
309
  until: Optional[str] = None,
276
310
  include_endpoint: bool = True,
277
311
  **kwargs,
@@ -282,7 +316,7 @@ class DirectionalMixin:
282
316
  Args:
283
317
  width: Width of the region to the right, in points
284
318
  height: Height mode - "full" for full page height or "element" for element height
285
- include_element: Whether to include this element/region in the result (default: False)
319
+ include_source: Whether to include this element/region in the result (default: False)
286
320
  until: Optional selector string to specify a right boundary element
287
321
  include_endpoint: Whether to include the boundary element in the region (default: True)
288
322
  **kwargs: Additional parameters
@@ -294,7 +328,7 @@ class DirectionalMixin:
294
328
  direction="right",
295
329
  size=width,
296
330
  cross_size=height,
297
- include_element=include_element,
331
+ include_source=include_source,
298
332
  until=until,
299
333
  include_endpoint=include_endpoint,
300
334
  **kwargs,
@@ -18,6 +18,7 @@ from typing import (
18
18
  Union,
19
19
  overload,
20
20
  )
21
+ import hashlib
21
22
 
22
23
  from pdfplumber.utils.geometry import objects_to_bbox
23
24
 
@@ -37,6 +38,8 @@ from natural_pdf.export.mixin import ExportMixin
37
38
  from natural_pdf.ocr import OCROptions
38
39
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
39
40
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
41
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
42
+ from tqdm.auto import tqdm
40
43
 
41
44
  # Potentially lazy imports for optional dependencies needed in save_pdf
42
45
  try:
@@ -46,8 +49,6 @@ except ImportError:
46
49
 
47
50
  try:
48
51
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
49
-
50
- pass
51
52
  except ImportError:
52
53
  create_searchable_pdf = None
53
54
 
@@ -64,6 +65,7 @@ if TYPE_CHECKING:
64
65
  from natural_pdf.core.page import Page
65
66
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
66
67
  from natural_pdf.elements.region import Region
68
+ from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
67
69
 
68
70
  T = TypeVar("T")
69
71
  P = TypeVar("P", bound="Page")
@@ -1586,13 +1588,162 @@ class ElementCollection(
1586
1588
 
1587
1589
  return all_data
1588
1590
 
1591
def to_text_elements(
    self,
    text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
    source_label: str = "derived_from_region",
    object_type: str = "word",
    default_font_size: float = 10.0,
    default_font_name: str = "RegionContent",
    confidence: Optional[float] = None,
    add_to_page: bool = False,  # Default is False
) -> "ElementCollection[TextElement]":
    """
    Converts each Region in this collection to a TextElement.

    Members of the collection that are not Region instances are skipped
    with a warning.

    Args:
        text_content_func: A callable that takes a Region and returns its text
                           (or None). It is forwarded unchanged as the
                           ``text_content`` argument of
                           ``Region.to_text_element``. If None, all created
                           TextElements will have text=None.
        source_label: The 'source' attribute for the new TextElements.
        object_type: The 'object_type' for the TextElement's data dict.
        default_font_size: Placeholder font size.
        default_font_name: Placeholder font name.
        confidence: Confidence score.
        add_to_page: If True (default is False), also adds the created
                     TextElements to the element manager of ONE page: the
                     page of the first Region in the collection that has a
                     page. TextElements belonging to any other page are
                     still returned, but are not added and a warning is
                     logged for each.

    Returns:
        A new ElementCollection containing the created TextElement objects.
    """
    from natural_pdf.elements.region import Region  # Local import for type checking if needed or to resolve circularity
    from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported for type hint if not in TYPE_CHECKING

    new_text_elements: List["TextElement"] = []
    if not self.elements:  # Accesses self._elements via property
        return ElementCollection([])

    page_context_for_adding: Optional["Page"] = None
    if add_to_page:
        # Adding needs a single page to add to; take it from the first Region that has one.
        first_valid_region_with_page = next(
            (
                el
                for el in self.elements
                if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
            ),
            None,
        )
        if first_valid_region_with_page:
            page_context_for_adding = first_valid_region_with_page.page
        else:
            logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
            add_to_page = False  # Disable adding if no valid page context can be determined

    # The element manager bucket depends only on object_type, so compute it once.
    add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type

    # Counted separately from the created elements: some may be skipped below.
    added_count = 0

    for element in self.elements:  # Accesses self._elements via property/iterator
        if not isinstance(element, Region):
            logger.warning(f"Skipping element {type(element)}, not a Region.")
            continue

        text_el = element.to_text_element(
            text_content=text_content_func,
            source_label=source_label,
            object_type=object_type,
            default_font_size=default_font_size,
            default_font_name=default_font_name,
            confidence=confidence,
        )
        new_text_elements.append(text_el)

        if not add_to_page:
            continue

        if not hasattr(text_el, "page") or text_el.page is None:
            logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
            continue

        if page_context_for_adding and text_el.page == page_context_for_adding:
            if hasattr(page_context_for_adding, "_element_mgr") and page_context_for_adding._element_mgr is not None:
                page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
                added_count += 1
            else:
                page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, "page_number") else "N/A"
                logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
        elif page_context_for_adding and text_el.page != page_context_for_adding:
            current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, "page_number") else "Unknown"
            context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, "page_number") else "N/A"
            logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
                           f"not added as it's different from collection's inferred page context {context_page_num_str}.")
        elif not page_context_for_adding:
            logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")

    if add_to_page and page_context_for_adding:
        page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, "page_number") else "N/A"
        # Report the number actually added, which can be lower than the number created.
        logger.info(f"Created {len(new_text_elements)} TextElements and added {added_count} to page {page_num_str}.")
    elif add_to_page and not page_context_for_adding:
        logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
    else:  # add_to_page is False
        logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")

    return ElementCollection(new_text_elements)
1682
+
1683
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
    """
    Trim visual whitespace from each region in the collection.

    Every element's own trim() method is invoked through ``self.apply``;
    this method returns whatever ``self.apply`` returns.

    Args:
        padding: Number of pixels to keep as padding after trimming (default: 1)
        threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
        resolution: Resolution for image rendering in DPI (default: 150)
        show_progress: Whether to show a progress bar for the trimming operation
            (passed through to ``self.apply``)

    Returns:
        New ElementCollection with trimmed regions
    """

    def _trim_one(region):
        # The same three settings are forwarded to every element.
        return region.trim(padding=padding, threshold=threshold, resolution=resolution)

    return self.apply(_trim_one, show_progress=show_progress)
1703
+
1704
def clip(
    self,
    obj: Optional[Any] = None,
    left: Optional[float] = None,
    top: Optional[float] = None,
    right: Optional[float] = None,
    bottom: Optional[float] = None,
) -> "ElementCollection":
    """
    Clip each element in the collection to the specified bounds.

    Every element's own clip() method is invoked through ``self.apply``
    with the same five arguments; this method returns whatever
    ``self.apply`` returns.

    Args:
        obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
        left: Optional left boundary (x0) to clip to
        top: Optional top boundary to clip to
        right: Optional right boundary (x1) to clip to
        bottom: Optional bottom boundary to clip to

    Returns:
        New ElementCollection containing the clipped elements

    Examples:
        # Clip each element to another region's bounds
        clipped_elements = collection.clip(container_region)

        # Clip each element to specific coordinates
        clipped_elements = collection.clip(left=100, right=400)

        # Mix object bounds with specific overrides
        clipped_elements = collection.clip(obj=container, bottom=page.height/2)
    """
    # Gather the bounds once; each element receives them as keyword arguments.
    bounds = {"obj": obj, "left": left, "top": top, "right": right, "bottom": bottom}

    def _clip_one(element):
        return element.clip(**bounds)

    return self.apply(_clip_one)
1741
+
1742
+
1743
+ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
1744
+ """
1745
+ Represents a collection of Page objects, often from a single PDF document.
1746
+ Provides methods for batch operations on these pages.
1596
1747
  """
1597
1748
 
1598
1749
  def __init__(self, pages: List[P]):
@@ -1921,7 +2072,7 @@ class PageCollection(Generic[P], ApplyMixin):
1921
2072
  end_elements=None,
1922
2073
  new_section_on_page_break=False,
1923
2074
  boundary_inclusion="both",
1924
- ) -> List["Region"]:
2075
+ ) -> "ElementCollection[Region]":
1925
2076
  """
1926
2077
  Extract sections from a page collection based on start/end elements.
1927
2078
 
@@ -2214,7 +2365,7 @@ class PageCollection(Generic[P], ApplyMixin):
2214
2365
  region.start_element = start_element
2215
2366
  sections.append(region)
2216
2367
 
2217
- return sections
2368
+ return ElementCollection(sections)
2218
2369
 
2219
2370
  def _gather_analysis_data(
2220
2371
  self,
@@ -28,6 +28,11 @@ class LineElement(Element):
28
28
  """
29
29
  super().__init__(obj, page)
30
30
 
31
@property
def source(self) -> Optional[str]:
    """
    Get the source of this line element (e.g., 'pdf', 'detected').

    Returns:
        The value stored under the "source" key of the underlying object
        data, or None when that key is absent.
    """
    raw_data = self._obj
    return raw_data.get("source")
35
+
31
36
  @property
32
37
  def type(self) -> str:
33
38
  """Element type."""