natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import hashlib
|
1
2
|
import logging
|
2
3
|
from collections.abc import MutableSequence
|
3
4
|
from pathlib import Path
|
@@ -18,7 +19,6 @@ from typing import (
|
|
18
19
|
Union,
|
19
20
|
overload,
|
20
21
|
)
|
21
|
-
import hashlib
|
22
22
|
|
23
23
|
from pdfplumber.utils.geometry import objects_to_bbox
|
24
24
|
|
@@ -27,6 +27,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
27
27
|
from PIL import Image, ImageDraw, ImageFont
|
28
28
|
from tqdm.auto import tqdm
|
29
29
|
|
30
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
30
31
|
from natural_pdf.classification.manager import ClassificationManager
|
31
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
32
33
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
@@ -38,8 +39,6 @@ from natural_pdf.export.mixin import ExportMixin
|
|
38
39
|
from natural_pdf.ocr import OCROptions
|
39
40
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
40
41
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
41
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
42
|
-
from tqdm.auto import tqdm
|
43
42
|
|
44
43
|
# Potentially lazy imports for optional dependencies needed in save_pdf
|
45
44
|
try:
|
@@ -65,7 +64,7 @@ if TYPE_CHECKING:
|
|
65
64
|
from natural_pdf.core.page import Page
|
66
65
|
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
67
66
|
from natural_pdf.elements.region import Region
|
68
|
-
from natural_pdf.elements.text import TextElement
|
67
|
+
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
|
69
68
|
|
70
69
|
T = TypeVar("T")
|
71
70
|
P = TypeVar("P", bound="Page")
|
@@ -844,6 +843,7 @@ class ElementCollection(
|
|
844
843
|
legend_position: str = "right",
|
845
844
|
render_ocr: bool = False,
|
846
845
|
width: Optional[int] = None, # Add width parameter
|
846
|
+
page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
|
847
847
|
) -> Optional["Image.Image"]:
|
848
848
|
"""
|
849
849
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -1590,13 +1590,13 @@ class ElementCollection(
|
|
1590
1590
|
|
1591
1591
|
def to_text_elements(
|
1592
1592
|
self,
|
1593
|
-
text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1593
|
+
text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1594
1594
|
source_label: str = "derived_from_region",
|
1595
1595
|
object_type: str = "word",
|
1596
1596
|
default_font_size: float = 10.0,
|
1597
1597
|
default_font_name: str = "RegionContent",
|
1598
1598
|
confidence: Optional[float] = None,
|
1599
|
-
add_to_page: bool = False
|
1599
|
+
add_to_page: bool = False, # Default is False
|
1600
1600
|
) -> "ElementCollection[TextElement]":
|
1601
1601
|
"""
|
1602
1602
|
Converts each Region in this collection to a TextElement.
|
@@ -1610,95 +1610,150 @@ class ElementCollection(
|
|
1610
1610
|
default_font_size: Placeholder font size.
|
1611
1611
|
default_font_name: Placeholder font name.
|
1612
1612
|
confidence: Confidence score.
|
1613
|
-
add_to_page: If True (default is False), also adds the created
|
1613
|
+
add_to_page: If True (default is False), also adds the created
|
1614
1614
|
TextElements to their respective page's element manager.
|
1615
1615
|
|
1616
1616
|
Returns:
|
1617
1617
|
A new ElementCollection containing the created TextElement objects.
|
1618
1618
|
"""
|
1619
|
-
from natural_pdf.elements.region import
|
1620
|
-
|
1619
|
+
from natural_pdf.elements.region import ( # Local import for type checking if needed or to resolve circularity
|
1620
|
+
Region,
|
1621
|
+
)
|
1622
|
+
from natural_pdf.elements.text import ( # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
|
1623
|
+
TextElement,
|
1624
|
+
)
|
1621
1625
|
|
1622
1626
|
new_text_elements: List["TextElement"] = []
|
1623
|
-
if not self.elements:
|
1627
|
+
if not self.elements: # Accesses self._elements via property
|
1624
1628
|
return ElementCollection([])
|
1625
1629
|
|
1626
1630
|
page_context_for_adding: Optional["Page"] = None
|
1627
1631
|
if add_to_page:
|
1628
1632
|
# Try to determine a consistent page context if adding elements
|
1629
1633
|
first_valid_region_with_page = next(
|
1630
|
-
(
|
1631
|
-
|
1634
|
+
(
|
1635
|
+
el
|
1636
|
+
for el in self.elements
|
1637
|
+
if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
|
1638
|
+
),
|
1639
|
+
None,
|
1632
1640
|
)
|
1633
1641
|
if first_valid_region_with_page:
|
1634
1642
|
page_context_for_adding = first_valid_region_with_page.page
|
1635
1643
|
else:
|
1636
|
-
logger.warning(
|
1637
|
-
|
1644
|
+
logger.warning(
|
1645
|
+
"Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None."
|
1646
|
+
)
|
1647
|
+
add_to_page = False # Disable adding if no valid page context can be determined
|
1638
1648
|
|
1639
|
-
for element in self.elements:
|
1649
|
+
for element in self.elements: # Accesses self._elements via property/iterator
|
1640
1650
|
if isinstance(element, Region):
|
1641
1651
|
text_el = element.to_text_element(
|
1642
|
-
text_content=text_content_func,
|
1652
|
+
text_content=text_content_func,
|
1643
1653
|
source_label=source_label,
|
1644
1654
|
object_type=object_type,
|
1645
1655
|
default_font_size=default_font_size,
|
1646
1656
|
default_font_name=default_font_name,
|
1647
|
-
confidence=confidence
|
1657
|
+
confidence=confidence,
|
1648
1658
|
)
|
1649
1659
|
new_text_elements.append(text_el)
|
1650
1660
|
|
1651
1661
|
if add_to_page:
|
1652
|
-
if not hasattr(text_el,
|
1653
|
-
logger.warning(
|
1662
|
+
if not hasattr(text_el, "page") or text_el.page is None:
|
1663
|
+
logger.warning(
|
1664
|
+
f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page."
|
1665
|
+
)
|
1654
1666
|
continue
|
1655
|
-
|
1667
|
+
|
1656
1668
|
if page_context_for_adding and text_el.page == page_context_for_adding:
|
1657
|
-
if
|
1658
|
-
|
1659
|
-
page_context_for_adding._element_mgr
|
1669
|
+
if (
|
1670
|
+
hasattr(page_context_for_adding, "_element_mgr")
|
1671
|
+
and page_context_for_adding._element_mgr is not None
|
1672
|
+
):
|
1673
|
+
add_as_type = (
|
1674
|
+
"words"
|
1675
|
+
if object_type == "word"
|
1676
|
+
else "chars" if object_type == "char" else object_type
|
1677
|
+
)
|
1678
|
+
page_context_for_adding._element_mgr.add_element(
|
1679
|
+
text_el, element_type=add_as_type
|
1680
|
+
)
|
1660
1681
|
else:
|
1661
|
-
page_num_str =
|
1662
|
-
|
1682
|
+
page_num_str = (
|
1683
|
+
str(page_context_for_adding.page_number)
|
1684
|
+
if hasattr(page_context_for_adding, "page_number")
|
1685
|
+
else "N/A"
|
1686
|
+
)
|
1687
|
+
logger.error(
|
1688
|
+
f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement."
|
1689
|
+
)
|
1663
1690
|
elif page_context_for_adding and text_el.page != page_context_for_adding:
|
1664
|
-
current_page_num_str =
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1691
|
+
current_page_num_str = (
|
1692
|
+
str(text_el.page.page_number)
|
1693
|
+
if hasattr(text_el.page, "page_number")
|
1694
|
+
else "Unknown"
|
1695
|
+
)
|
1696
|
+
context_page_num_str = (
|
1697
|
+
str(page_context_for_adding.page_number)
|
1698
|
+
if hasattr(page_context_for_adding, "page_number")
|
1699
|
+
else "N/A"
|
1700
|
+
)
|
1701
|
+
logger.warning(
|
1702
|
+
f"TextElement for region {element.bbox} from page {current_page_num_str} "
|
1703
|
+
f"not added as it's different from collection's inferred page context {context_page_num_str}."
|
1704
|
+
)
|
1705
|
+
elif not page_context_for_adding:
|
1706
|
+
logger.warning(
|
1707
|
+
f"TextElement for region {element.bbox} created, but no page context was determined for adding."
|
1708
|
+
)
|
1670
1709
|
else:
|
1671
1710
|
logger.warning(f"Skipping element {type(element)}, not a Region.")
|
1672
|
-
|
1711
|
+
|
1673
1712
|
if add_to_page and page_context_for_adding:
|
1674
|
-
page_num_str =
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1713
|
+
page_num_str = (
|
1714
|
+
str(page_context_for_adding.page_number)
|
1715
|
+
if hasattr(page_context_for_adding, "page_number")
|
1716
|
+
else "N/A"
|
1717
|
+
)
|
1718
|
+
logger.info(
|
1719
|
+
f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}."
|
1720
|
+
)
|
1721
|
+
elif add_to_page and not page_context_for_adding:
|
1722
|
+
logger.info(
|
1723
|
+
f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent."
|
1724
|
+
)
|
1725
|
+
else: # add_to_page is False
|
1679
1726
|
logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
|
1680
1727
|
|
1681
1728
|
return ElementCollection(new_text_elements)
|
1682
1729
|
|
1683
|
-
def trim(
|
1730
|
+
def trim(
|
1731
|
+
self,
|
1732
|
+
padding: int = 1,
|
1733
|
+
threshold: float = 0.95,
|
1734
|
+
resolution: float = 150,
|
1735
|
+
show_progress: bool = True,
|
1736
|
+
) -> "ElementCollection":
|
1684
1737
|
"""
|
1685
1738
|
Trim visual whitespace from each region in the collection.
|
1686
|
-
|
1739
|
+
|
1687
1740
|
Applies the trim() method to each element in the collection,
|
1688
1741
|
returning a new collection with the trimmed regions.
|
1689
|
-
|
1742
|
+
|
1690
1743
|
Args:
|
1691
1744
|
padding: Number of pixels to keep as padding after trimming (default: 1)
|
1692
1745
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
1693
1746
|
resolution: Resolution for image rendering in DPI (default: 150)
|
1694
1747
|
show_progress: Whether to show a progress bar for the trimming operation
|
1695
|
-
|
1748
|
+
|
1696
1749
|
Returns:
|
1697
1750
|
New ElementCollection with trimmed regions
|
1698
1751
|
"""
|
1699
1752
|
return self.apply(
|
1700
|
-
lambda element: element.trim(
|
1701
|
-
|
1753
|
+
lambda element: element.trim(
|
1754
|
+
padding=padding, threshold=threshold, resolution=resolution
|
1755
|
+
),
|
1756
|
+
show_progress=show_progress,
|
1702
1757
|
)
|
1703
1758
|
|
1704
1759
|
def clip(
|
@@ -1711,27 +1766,27 @@ class ElementCollection(
|
|
1711
1766
|
) -> "ElementCollection":
|
1712
1767
|
"""
|
1713
1768
|
Clip each element in the collection to the specified bounds.
|
1714
|
-
|
1769
|
+
|
1715
1770
|
This method applies the clip operation to each individual element,
|
1716
1771
|
returning a new collection with the clipped elements.
|
1717
|
-
|
1772
|
+
|
1718
1773
|
Args:
|
1719
1774
|
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1720
1775
|
left: Optional left boundary (x0) to clip to
|
1721
|
-
top: Optional top boundary to clip to
|
1776
|
+
top: Optional top boundary to clip to
|
1722
1777
|
right: Optional right boundary (x1) to clip to
|
1723
1778
|
bottom: Optional bottom boundary to clip to
|
1724
|
-
|
1779
|
+
|
1725
1780
|
Returns:
|
1726
1781
|
New ElementCollection containing the clipped elements
|
1727
|
-
|
1782
|
+
|
1728
1783
|
Examples:
|
1729
1784
|
# Clip each element to another region's bounds
|
1730
1785
|
clipped_elements = collection.clip(container_region)
|
1731
|
-
|
1786
|
+
|
1732
1787
|
# Clip each element to specific coordinates
|
1733
1788
|
clipped_elements = collection.clip(left=100, right=400)
|
1734
|
-
|
1789
|
+
|
1735
1790
|
# Mix object bounds with specific overrides
|
1736
1791
|
clipped_elements = collection.clip(obj=container, bottom=page.height/2)
|
1737
1792
|
"""
|