natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +226 -70
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +320 -113
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  from collections.abc import MutableSequence
3
4
  from pathlib import Path
@@ -18,7 +19,6 @@ from typing import (
18
19
  Union,
19
20
  overload,
20
21
  )
21
- import hashlib
22
22
 
23
23
  from pdfplumber.utils.geometry import objects_to_bbox
24
24
 
@@ -27,6 +27,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
27
27
  from PIL import Image, ImageDraw, ImageFont
28
28
  from tqdm.auto import tqdm
29
29
 
30
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
30
31
  from natural_pdf.classification.manager import ClassificationManager
31
32
  from natural_pdf.classification.mixin import ClassificationMixin
32
33
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
@@ -38,8 +39,6 @@ from natural_pdf.export.mixin import ExportMixin
38
39
  from natural_pdf.ocr import OCROptions
39
40
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
40
41
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
41
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
42
- from tqdm.auto import tqdm
43
42
 
44
43
  # Potentially lazy imports for optional dependencies needed in save_pdf
45
44
  try:
@@ -65,7 +64,7 @@ if TYPE_CHECKING:
65
64
  from natural_pdf.core.page import Page
66
65
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
67
66
  from natural_pdf.elements.region import Region
68
- from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
67
+ from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
69
68
 
70
69
  T = TypeVar("T")
71
70
  P = TypeVar("P", bound="Page")
@@ -844,6 +843,7 @@ class ElementCollection(
844
843
  legend_position: str = "right",
845
844
  render_ocr: bool = False,
846
845
  width: Optional[int] = None, # Add width parameter
846
+ page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
847
847
  ) -> Optional["Image.Image"]:
848
848
  """
849
849
  Generates a temporary preview image highlighting elements in this collection
@@ -1590,13 +1590,13 @@ class ElementCollection(
1590
1590
 
1591
1591
  def to_text_elements(
1592
1592
  self,
1593
- text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
1593
+ text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
1594
1594
  source_label: str = "derived_from_region",
1595
1595
  object_type: str = "word",
1596
1596
  default_font_size: float = 10.0,
1597
1597
  default_font_name: str = "RegionContent",
1598
1598
  confidence: Optional[float] = None,
1599
- add_to_page: bool = False # Default is False
1599
+ add_to_page: bool = False, # Default is False
1600
1600
  ) -> "ElementCollection[TextElement]":
1601
1601
  """
1602
1602
  Converts each Region in this collection to a TextElement.
@@ -1610,95 +1610,150 @@ class ElementCollection(
1610
1610
  default_font_size: Placeholder font size.
1611
1611
  default_font_name: Placeholder font name.
1612
1612
  confidence: Confidence score.
1613
- add_to_page: If True (default is False), also adds the created
1613
+ add_to_page: If True (default is False), also adds the created
1614
1614
  TextElements to their respective page's element manager.
1615
1615
 
1616
1616
  Returns:
1617
1617
  A new ElementCollection containing the created TextElement objects.
1618
1618
  """
1619
- from natural_pdf.elements.region import Region # Local import for type checking if needed or to resolve circularity
1620
- from natural_pdf.elements.text import TextElement # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
1619
+ from natural_pdf.elements.region import ( # Local import for type checking if needed or to resolve circularity
1620
+ Region,
1621
+ )
1622
+ from natural_pdf.elements.text import ( # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
1623
+ TextElement,
1624
+ )
1621
1625
 
1622
1626
  new_text_elements: List["TextElement"] = []
1623
- if not self.elements: # Accesses self._elements via property
1627
+ if not self.elements: # Accesses self._elements via property
1624
1628
  return ElementCollection([])
1625
1629
 
1626
1630
  page_context_for_adding: Optional["Page"] = None
1627
1631
  if add_to_page:
1628
1632
  # Try to determine a consistent page context if adding elements
1629
1633
  first_valid_region_with_page = next(
1630
- (el for el in self.elements if isinstance(el, Region) and hasattr(el, 'page') and el.page is not None),
1631
- None
1634
+ (
1635
+ el
1636
+ for el in self.elements
1637
+ if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
1638
+ ),
1639
+ None,
1632
1640
  )
1633
1641
  if first_valid_region_with_page:
1634
1642
  page_context_for_adding = first_valid_region_with_page.page
1635
1643
  else:
1636
- logger.warning("Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None.")
1637
- add_to_page = False # Disable adding if no valid page context can be determined
1644
+ logger.warning(
1645
+ "Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None."
1646
+ )
1647
+ add_to_page = False # Disable adding if no valid page context can be determined
1638
1648
 
1639
- for element in self.elements: # Accesses self._elements via property/iterator
1649
+ for element in self.elements: # Accesses self._elements via property/iterator
1640
1650
  if isinstance(element, Region):
1641
1651
  text_el = element.to_text_element(
1642
- text_content=text_content_func,
1652
+ text_content=text_content_func,
1643
1653
  source_label=source_label,
1644
1654
  object_type=object_type,
1645
1655
  default_font_size=default_font_size,
1646
1656
  default_font_name=default_font_name,
1647
- confidence=confidence
1657
+ confidence=confidence,
1648
1658
  )
1649
1659
  new_text_elements.append(text_el)
1650
1660
 
1651
1661
  if add_to_page:
1652
- if not hasattr(text_el, 'page') or text_el.page is None:
1653
- logger.warning(f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page.")
1662
+ if not hasattr(text_el, "page") or text_el.page is None:
1663
+ logger.warning(
1664
+ f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page."
1665
+ )
1654
1666
  continue
1655
-
1667
+
1656
1668
  if page_context_for_adding and text_el.page == page_context_for_adding:
1657
- if hasattr(page_context_for_adding, '_element_mgr') and page_context_for_adding._element_mgr is not None:
1658
- add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
1659
- page_context_for_adding._element_mgr.add_element(text_el, element_type=add_as_type)
1669
+ if (
1670
+ hasattr(page_context_for_adding, "_element_mgr")
1671
+ and page_context_for_adding._element_mgr is not None
1672
+ ):
1673
+ add_as_type = (
1674
+ "words"
1675
+ if object_type == "word"
1676
+ else "chars" if object_type == "char" else object_type
1677
+ )
1678
+ page_context_for_adding._element_mgr.add_element(
1679
+ text_el, element_type=add_as_type
1680
+ )
1660
1681
  else:
1661
- page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
1662
- logger.error(f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement.")
1682
+ page_num_str = (
1683
+ str(page_context_for_adding.page_number)
1684
+ if hasattr(page_context_for_adding, "page_number")
1685
+ else "N/A"
1686
+ )
1687
+ logger.error(
1688
+ f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement."
1689
+ )
1663
1690
  elif page_context_for_adding and text_el.page != page_context_for_adding:
1664
- current_page_num_str = str(text_el.page.page_number) if hasattr(text_el.page, 'page_number') else "Unknown"
1665
- context_page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else "N/A"
1666
- logger.warning(f"TextElement for region {element.bbox} from page {current_page_num_str} "
1667
- f"not added as it's different from collection's inferred page context {context_page_num_str}.")
1668
- elif not page_context_for_adding:
1669
- logger.warning(f"TextElement for region {element.bbox} created, but no page context was determined for adding.")
1691
+ current_page_num_str = (
1692
+ str(text_el.page.page_number)
1693
+ if hasattr(text_el.page, "page_number")
1694
+ else "Unknown"
1695
+ )
1696
+ context_page_num_str = (
1697
+ str(page_context_for_adding.page_number)
1698
+ if hasattr(page_context_for_adding, "page_number")
1699
+ else "N/A"
1700
+ )
1701
+ logger.warning(
1702
+ f"TextElement for region {element.bbox} from page {current_page_num_str} "
1703
+ f"not added as it's different from collection's inferred page context {context_page_num_str}."
1704
+ )
1705
+ elif not page_context_for_adding:
1706
+ logger.warning(
1707
+ f"TextElement for region {element.bbox} created, but no page context was determined for adding."
1708
+ )
1670
1709
  else:
1671
1710
  logger.warning(f"Skipping element {type(element)}, not a Region.")
1672
-
1711
+
1673
1712
  if add_to_page and page_context_for_adding:
1674
- page_num_str = str(page_context_for_adding.page_number) if hasattr(page_context_for_adding, 'page_number') else 'N/A'
1675
- logger.info(f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}.")
1676
- elif add_to_page and not page_context_for_adding:
1677
- logger.info(f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent.")
1678
- else: # add_to_page is False
1713
+ page_num_str = (
1714
+ str(page_context_for_adding.page_number)
1715
+ if hasattr(page_context_for_adding, "page_number")
1716
+ else "N/A"
1717
+ )
1718
+ logger.info(
1719
+ f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}."
1720
+ )
1721
+ elif add_to_page and not page_context_for_adding:
1722
+ logger.info(
1723
+ f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent."
1724
+ )
1725
+ else: # add_to_page is False
1679
1726
  logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
1680
1727
 
1681
1728
  return ElementCollection(new_text_elements)
1682
1729
 
1683
- def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, show_progress: bool = True) -> "ElementCollection":
1730
+ def trim(
1731
+ self,
1732
+ padding: int = 1,
1733
+ threshold: float = 0.95,
1734
+ resolution: float = 150,
1735
+ show_progress: bool = True,
1736
+ ) -> "ElementCollection":
1684
1737
  """
1685
1738
  Trim visual whitespace from each region in the collection.
1686
-
1739
+
1687
1740
  Applies the trim() method to each element in the collection,
1688
1741
  returning a new collection with the trimmed regions.
1689
-
1742
+
1690
1743
  Args:
1691
1744
  padding: Number of pixels to keep as padding after trimming (default: 1)
1692
1745
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
1693
1746
  resolution: Resolution for image rendering in DPI (default: 150)
1694
1747
  show_progress: Whether to show a progress bar for the trimming operation
1695
-
1748
+
1696
1749
  Returns:
1697
1750
  New ElementCollection with trimmed regions
1698
1751
  """
1699
1752
  return self.apply(
1700
- lambda element: element.trim(padding=padding, threshold=threshold, resolution=resolution),
1701
- show_progress=show_progress
1753
+ lambda element: element.trim(
1754
+ padding=padding, threshold=threshold, resolution=resolution
1755
+ ),
1756
+ show_progress=show_progress,
1702
1757
  )
1703
1758
 
1704
1759
  def clip(
@@ -1711,27 +1766,27 @@ class ElementCollection(
1711
1766
  ) -> "ElementCollection":
1712
1767
  """
1713
1768
  Clip each element in the collection to the specified bounds.
1714
-
1769
+
1715
1770
  This method applies the clip operation to each individual element,
1716
1771
  returning a new collection with the clipped elements.
1717
-
1772
+
1718
1773
  Args:
1719
1774
  obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1720
1775
  left: Optional left boundary (x0) to clip to
1721
- top: Optional top boundary to clip to
1776
+ top: Optional top boundary to clip to
1722
1777
  right: Optional right boundary (x1) to clip to
1723
1778
  bottom: Optional bottom boundary to clip to
1724
-
1779
+
1725
1780
  Returns:
1726
1781
  New ElementCollection containing the clipped elements
1727
-
1782
+
1728
1783
  Examples:
1729
1784
  # Clip each element to another region's bounds
1730
1785
  clipped_elements = collection.clip(container_region)
1731
-
1786
+
1732
1787
  # Clip each element to specific coordinates
1733
1788
  clipped_elements = collection.clip(left=100, right=400)
1734
-
1789
+
1735
1790
  # Mix object bounds with specific overrides
1736
1791
  clipped_elements = collection.clip(obj=container, bottom=page.height/2)
1737
1792
  """