natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,7 @@ from PIL import Image
8
8
 
9
9
  # Import selector parsing functions
10
10
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
11
+ from natural_pdf.describe.mixin import DescribeMixin
11
12
 
12
13
  if TYPE_CHECKING:
13
14
  from natural_pdf.core.page import Page
@@ -18,34 +19,34 @@ if TYPE_CHECKING:
18
19
  def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
19
20
  """
20
21
  Extract bounding box coordinates from any object that has bbox properties.
21
-
22
+
22
23
  Args:
23
24
  obj: Object that might have bbox coordinates (Element, Region, etc.)
24
-
25
+
25
26
  Returns:
26
27
  Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
27
28
  """
28
29
  # Try bbox property first (most common)
29
- if hasattr(obj, 'bbox') and obj.bbox is not None:
30
+ if hasattr(obj, "bbox") and obj.bbox is not None:
30
31
  bbox = obj.bbox
31
32
  if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
32
33
  return tuple(float(coord) for coord in bbox)
33
-
34
+
34
35
  # Try individual coordinate properties
35
- if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
36
+ if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
36
37
  try:
37
38
  return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
38
39
  except (ValueError, TypeError):
39
40
  pass
40
-
41
+
41
42
  # If object is a dict with bbox keys
42
43
  if isinstance(obj, dict):
43
- if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
44
+ if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
44
45
  try:
45
- return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
46
+ return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
46
47
  except (ValueError, TypeError):
47
48
  pass
48
-
49
+
49
50
  return None
50
51
 
51
52
 
@@ -412,7 +413,7 @@ class DirectionalMixin:
412
413
  return new_region
413
414
 
414
415
 
415
- class Element(DirectionalMixin):
416
+ class Element(DirectionalMixin, DescribeMixin):
416
417
  """
417
418
  Base class for all PDF elements.
418
419
 
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  from collections.abc import MutableSequence
3
4
  from pathlib import Path
@@ -18,7 +19,6 @@ from typing import (
18
19
  Union,
19
20
  overload,
20
21
  )
21
- import hashlib
22
22
 
23
23
  from pdfplumber.utils.geometry import objects_to_bbox
24
24
 
@@ -27,8 +27,10 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
27
27
  from PIL import Image, ImageDraw, ImageFont
28
28
  from tqdm.auto import tqdm
29
29
 
30
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
30
31
  from natural_pdf.classification.manager import ClassificationManager
31
32
  from natural_pdf.classification.mixin import ClassificationMixin
33
+ from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
32
34
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
33
35
  from natural_pdf.core.pdf import PDF
34
36
  from natural_pdf.elements.base import Element
@@ -38,8 +40,6 @@ from natural_pdf.export.mixin import ExportMixin
38
40
  from natural_pdf.ocr import OCROptions
39
41
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
40
42
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
41
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
42
- from tqdm.auto import tqdm
43
43
 
44
44
  # Potentially lazy imports for optional dependencies needed in save_pdf
45
45
  try:
@@ -65,14 +65,21 @@ if TYPE_CHECKING:
65
65
  from natural_pdf.core.page import Page
66
66
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
67
67
  from natural_pdf.elements.region import Region
68
- from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
68
+ from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
69
69
 
70
70
  T = TypeVar("T")
71
71
  P = TypeVar("P", bound="Page")
72
72
 
73
73
 
74
74
  class ElementCollection(
75
- Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
75
+ Generic[T],
76
+ ApplyMixin,
77
+ ExportMixin,
78
+ ClassificationMixin,
79
+ DirectionalCollectionMixin,
80
+ DescribeMixin,
81
+ InspectMixin,
82
+ MutableSequence,
76
83
  ):
77
84
  """
78
85
  Collection of PDF elements with batch operations.
@@ -844,6 +851,7 @@ class ElementCollection(
844
851
  legend_position: str = "right",
845
852
  render_ocr: bool = False,
846
853
  width: Optional[int] = None, # Add width parameter
854
+ page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
847
855
  ) -> Optional["Image.Image"]:
848
856
  """
849
857
  Generates a temporary preview image highlighting elements in this collection
@@ -1590,13 +1598,13 @@ class ElementCollection(
1590
1598
 
1591
1599
  def to_text_elements(
1592
1600
  self,
1593
- text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
1601
+ text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
1594
1602
  source_label: str = "derived_from_region",
1595
1603
  object_type: str = "word",
1596
1604
  default_font_size: float = 10.0,
1597
1605
  default_font_name: str = "RegionContent",
1598
1606
  confidence: Optional[float] = None,
1599
- add_to_page: bool = False # Default is False
1607
+ add_to_page: bool = False, # Default is False
1600
1608
  ) -> "ElementCollection[TextElement]":
1601
1609
  """
1602
1610
  Converts each Region in this collection to a TextElement.
@@ -1610,95 +1618,150 @@ class ElementCollection(
1610
1618
  default_font_size: Placeholder font size.
1611
1619
  default_font_name: Placeholder font name.
1612
1620
  confidence: Confidence score.
1613
- add_to_page: If True (default is False), also adds the created
1621
+ add_to_page: If True (default is False), also adds the created
1614
1622
  TextElements to their respective page's element manager.
1615
1623
 
1616
1624
  Returns:
1617
1625
  A new ElementCollection containing the created TextElement objects.
1618
1626
  """
1619
- from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
1620
- from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
1627
+ from natural_pdf.elements.region import ( # Local import for type checking if needed or to resolve circularity
1628
+ Region,
1629
+ )
1630
+ from natural_pdf.elements.text import ( # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
1631
+ TextElement,
1632
+ )
1621
1633
 
1622
1634
  new_text_elements: List["TextElement"] = []
1623
- if not self.elements: # Accesses self._elements via property
1635
+ if not self.elements: # Accesses self._elements via property
1624
1636
  return ElementCollection([])
1625
1637
 
1626
1638
  page_context_for_adding: Optional["Page"] = None
1627
1639
  if add_to_page:
1628
1640
  # Try to determine a consistent page context if adding elements
1629
1641
  first_valid_region_with_page = next(
1630
- (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
1631
- None
1642
+ (
1643
+ el
1644
+ for el in self.elements
1645
+ if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
1646
+ ),
1647
+ None,
1632
1648
  )
1633
1649
  if first_valid_region_with_page:
1634
1650
  page_context_for_adding = first_valid_region_with_page.page
1635
1651
  else:
1636
- logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
1637
- add_to_page = False # Disable adding if no valid page context can be determined
1652
+ logger.warning(
1653
+ "Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None."
1654
+ )
1655
+ add_to_page = False # Disable adding if no valid page context can be determined
1638
1656
 
1639
- for element in self.elements: # Accesses self._elements via property/iterator
1657
+ for element in self.elements: # Accesses self._elements via property/iterator
1640
1658
  if isinstance(element, Region):
1641
1659
  text_el = element.to_text_element(
1642
- text_content=text_content_func,
1660
+ text_content=text_content_func,
1643
1661
  source_label=source_label,
1644
1662
  object_type=object_type,
1645
1663
  default_font_size=default_font_size,
1646
1664
  default_font_name=default_font_name,
1647
- confidence=confidence
1665
+ confidence=confidence,
1648
1666
  )
1649
1667
  new_text_elements.append(text_el)
1650
1668
 
1651
1669
  if add_to_page:
1652
- if not hasattr(text_el, 'page') or text_el.page is None:
1653
- logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
1670
+ if not hasattr(text_el, "page") or text_el.page is None:
1671
+ logger.warning(
1672
+ f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page."
1673
+ )
1654
1674
  continue
1655
-
1675
+
1656
1676
  if page_context_for_adding and text_el.page == page_context_for_adding:
1657
- if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
1658
- add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
1659
- page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
1677
+ if (
1678
+ hasattr(page_context_for_adding, "_element_mgr")
1679
+ and page_context_for_adding._element_mgr is not None
1680
+ ):
1681
+ add_as_type = (
1682
+ "words"
1683
+ if object_type == "word"
1684
+ else "chars" if object_type == "char" else object_type
1685
+ )
1686
+ page_context_for_adding._element_mgr.add_element(
1687
+ text_el, element_type=add_as_type
1688
+ )
1660
1689
  else:
1661
- page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
1662
- logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
1690
+ page_num_str = (
1691
+ str(page_context_for_adding.page_number)
1692
+ if hasattr(page_context_for_adding, "page_number")
1693
+ else "N/A"
1694
+ )
1695
+ logger.error(
1696
+ f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement."
1697
+ )
1663
1698
  elif page_context_for_adding and text_el.page != page_context_for_adding:
1664
- current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
1665
- context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
1666
- logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
1667
- f"not added as it's different from collection's inferred page context {context_page_num_str}.")
1668
- elif not page_context_for_adding:
1669
- logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
1699
+ current_page_num_str = (
1700
+ str(text_el.page.page_number)
1701
+ if hasattr(text_el.page, "page_number")
1702
+ else "Unknown"
1703
+ )
1704
+ context_page_num_str = (
1705
+ str(page_context_for_adding.page_number)
1706
+ if hasattr(page_context_for_adding, "page_number")
1707
+ else "N/A"
1708
+ )
1709
+ logger.warning(
1710
+ f"TextElement for region {element.bbox} from page {current_page_num_str} "
1711
+ f"not added as it's different from collection's inferred page context {context_page_num_str}."
1712
+ )
1713
+ elif not page_context_for_adding:
1714
+ logger.warning(
1715
+ f"TextElement for region {element.bbox} created, but no page context was determined for adding."
1716
+ )
1670
1717
  else:
1671
1718
  logger.warning(f"Skipping element {type(element)}, not a Region.")
1672
-
1719
+
1673
1720
  if add_to_page and page_context_for_adding:
1674
- page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
1675
- logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
1676
- elif add_to_page and not page_context_for_adding:
1677
- logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
1678
- else: # add_to_page is False
1721
+ page_num_str = (
1722
+ str(page_context_for_adding.page_number)
1723
+ if hasattr(page_context_for_adding, "page_number")
1724
+ else "N/A"
1725
+ )
1726
+ logger.info(
1727
+ f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}."
1728
+ )
1729
+ elif add_to_page and not page_context_for_adding:
1730
+ logger.info(
1731
+ f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent."
1732
+ )
1733
+ else: # add_to_page is False
1679
1734
  logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
1680
1735
 
1681
1736
  return ElementCollection(new_text_elements)
1682
1737
 
1683
- def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
1738
+ def trim(
1739
+ self,
1740
+ padding: int = 1,
1741
+ threshold: float = 0.95,
1742
+ resolution: float = 150,
1743
+ show_progress: bool = True,
1744
+ ) -> "ElementCollection":
1684
1745
  """
1685
1746
  Trim visual whitespace from each region in the collection.
1686
-
1747
+
1687
1748
  Applies the trim() method to each element in the collection,
1688
1749
  returning a new collection with the trimmed regions.
1689
-
1750
+
1690
1751
  Args:
1691
1752
  padding: Number of pixels to keep as padding after trimming (default: 1)
1692
1753
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
1693
1754
  resolution: Resolution for image rendering in DPI (default: 150)
1694
1755
  show_progress: Whether to show a progress bar for the trimming operation
1695
-
1756
+
1696
1757
  Returns:
1697
1758
  New ElementCollection with trimmed regions
1698
1759
  """
1699
1760
  return self.apply(
1700
- lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
1701
- show_progress=show_progress
1761
+ lambda element: element.trim(
1762
+ padding=padding, threshold=threshold, resolution=resolution
1763
+ ),
1764
+ show_progress=show_progress,
1702
1765
  )
1703
1766
 
1704
1767
  def clip(
@@ -1711,27 +1774,27 @@ class ElementCollection(
1711
1774
  ) -> "ElementCollection":
1712
1775
  """
1713
1776
  Clip each element in the collection to the specified bounds.
1714
-
1777
+
1715
1778
  This method applies the clip operation to each individual element,
1716
1779
  returning a new collection with the clipped elements.
1717
-
1780
+
1718
1781
  Args:
1719
1782
  obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1720
1783
  left: Optional left boundary (x0) to clip to
1721
- top: Optional top boundary to clip to
1784
+ top: Optional top boundary to clip to
1722
1785
  right: Optional right boundary (x1) to clip to
1723
1786
  bottom: Optional bottom boundary to clip to
1724
-
1787
+
1725
1788
  Returns:
1726
1789
  New ElementCollection containing the clipped elements
1727
-
1790
+
1728
1791
  Examples:
1729
1792
  # Clip each element to another region's bounds
1730
1793
  clipped_elements = collection.clip(container_region)
1731
-
1794
+
1732
1795
  # Clip each element to specific coordinates
1733
1796
  clipped_elements = collection.clip(left=100, right=400)
1734
-
1797
+
1735
1798
  # Mix object bounds with specific overrides
1736
1799
  clipped_elements = collection.clip(obj=container, bottom=page.height/2)
1737
1800
  """
@@ -1740,6 +1803,8 @@ class ElementCollection(
1740
1803
  )
1741
1804
 
1742
1805
 
1806
+
1807
+
1743
1808
  class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
1744
1809
  """
1745
1810
  Represents a collection of Page objects, often from a single PDF document.