natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
|
|
8
8
|
|
9
9
|
# Import selector parsing functions
|
10
10
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
11
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
11
12
|
|
12
13
|
if TYPE_CHECKING:
|
13
14
|
from natural_pdf.core.page import Page
|
@@ -18,34 +19,34 @@ if TYPE_CHECKING:
|
|
18
19
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
19
20
|
"""
|
20
21
|
Extract bounding box coordinates from any object that has bbox properties.
|
21
|
-
|
22
|
+
|
22
23
|
Args:
|
23
24
|
obj: Object that might have bbox coordinates (Element, Region, etc.)
|
24
|
-
|
25
|
+
|
25
26
|
Returns:
|
26
27
|
Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
|
27
28
|
"""
|
28
29
|
# Try bbox property first (most common)
|
29
|
-
if hasattr(obj, 'bbox') and obj.bbox is not None:
|
30
|
+
if hasattr(obj, "bbox") and obj.bbox is not None:
|
30
31
|
bbox = obj.bbox
|
31
32
|
if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
|
32
33
|
return tuple(float(coord) for coord in bbox)
|
33
|
-
|
34
|
+
|
34
35
|
# Try individual coordinate properties
|
35
|
-
if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
|
36
|
+
if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
|
36
37
|
try:
|
37
38
|
return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
|
38
39
|
except (ValueError, TypeError):
|
39
40
|
pass
|
40
|
-
|
41
|
+
|
41
42
|
# If object is a dict with bbox keys
|
42
43
|
if isinstance(obj, dict):
|
43
|
-
if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
|
44
|
+
if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
|
44
45
|
try:
|
45
|
-
return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
|
46
|
+
return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
|
46
47
|
except (ValueError, TypeError):
|
47
48
|
pass
|
48
|
-
|
49
|
+
|
49
50
|
return None
|
50
51
|
|
51
52
|
|
@@ -412,7 +413,7 @@ class DirectionalMixin:
|
|
412
413
|
return new_region
|
413
414
|
|
414
415
|
|
415
|
-
class Element(DirectionalMixin):
|
416
|
+
class Element(DirectionalMixin, DescribeMixin):
|
416
417
|
"""
|
417
418
|
Base class for all PDF elements.
|
418
419
|
|
natural_pdf/elements/collections.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import hashlib
|
1
2
|
import logging
|
2
3
|
from collections.abc import MutableSequence
|
3
4
|
from pathlib import Path
|
@@ -18,7 +19,6 @@ from typing import (
|
|
18
19
|
Union,
|
19
20
|
overload,
|
20
21
|
)
|
21
|
-
import hashlib
|
22
22
|
|
23
23
|
from pdfplumber.utils.geometry import objects_to_bbox
|
24
24
|
|
@@ -27,8 +27,10 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
27
27
|
from PIL import Image, ImageDraw, ImageFont
|
28
28
|
from tqdm.auto import tqdm
|
29
29
|
|
30
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
30
31
|
from natural_pdf.classification.manager import ClassificationManager
|
31
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
33
|
+
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
32
34
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
33
35
|
from natural_pdf.core.pdf import PDF
|
34
36
|
from natural_pdf.elements.base import Element
|
@@ -38,8 +40,6 @@ from natural_pdf.export.mixin import ExportMixin
|
|
38
40
|
from natural_pdf.ocr import OCROptions
|
39
41
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
40
42
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
41
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
42
|
-
from tqdm.auto import tqdm
|
43
43
|
|
44
44
|
# Potentially lazy imports for optional dependencies needed in save_pdf
|
45
45
|
try:
|
@@ -65,14 +65,21 @@ if TYPE_CHECKING:
|
|
65
65
|
from natural_pdf.core.page import Page
|
66
66
|
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
67
67
|
from natural_pdf.elements.region import Region
|
68
|
-
from natural_pdf.elements.text import TextElement
|
68
|
+
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
|
69
69
|
|
70
70
|
T = TypeVar("T")
|
71
71
|
P = TypeVar("P", bound="Page")
|
72
72
|
|
73
73
|
|
74
74
|
class ElementCollection(
|
75
|
-
Generic[T],
|
75
|
+
Generic[T],
|
76
|
+
ApplyMixin,
|
77
|
+
ExportMixin,
|
78
|
+
ClassificationMixin,
|
79
|
+
DirectionalCollectionMixin,
|
80
|
+
DescribeMixin,
|
81
|
+
InspectMixin,
|
82
|
+
MutableSequence,
|
76
83
|
):
|
77
84
|
"""
|
78
85
|
Collection of PDF elements with batch operations.
|
@@ -844,6 +851,7 @@ class ElementCollection(
|
|
844
851
|
legend_position: str = "right",
|
845
852
|
render_ocr: bool = False,
|
846
853
|
width: Optional[int] = None, # Add width parameter
|
854
|
+
page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
|
847
855
|
) -> Optional["Image.Image"]:
|
848
856
|
"""
|
849
857
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -1590,13 +1598,13 @@ class ElementCollection(
|
|
1590
1598
|
|
1591
1599
|
def to_text_elements(
|
1592
1600
|
self,
|
1593
|
-
text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1601
|
+
text_content_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1594
1602
|
source_label: str = "derived_from_region",
|
1595
1603
|
object_type: str = "word",
|
1596
1604
|
default_font_size: float = 10.0,
|
1597
1605
|
default_font_name: str = "RegionContent",
|
1598
1606
|
confidence: Optional[float] = None,
|
1599
|
-
add_to_page: bool = False
|
1607
|
+
add_to_page: bool = False, # Default is False
|
1600
1608
|
) -> "ElementCollection[TextElement]":
|
1601
1609
|
"""
|
1602
1610
|
Converts each Region in this collection to a TextElement.
|
@@ -1610,95 +1618,150 @@ class ElementCollection(
|
|
1610
1618
|
default_font_size: Placeholder font size.
|
1611
1619
|
default_font_name: Placeholder font name.
|
1612
1620
|
confidence: Confidence score.
|
1613
|
-
add_to_page: If True (default is False), also adds the created
|
1621
|
+
add_to_page: If True (default is False), also adds the created
|
1614
1622
|
TextElements to their respective page's element manager.
|
1615
1623
|
|
1616
1624
|
Returns:
|
1617
1625
|
A new ElementCollection containing the created TextElement objects.
|
1618
1626
|
"""
|
1619
|
-
from natural_pdf.elements.region import
|
1620
|
-
|
1627
|
+
from natural_pdf.elements.region import ( # Local import for type checking if needed or to resolve circularity
|
1628
|
+
Region,
|
1629
|
+
)
|
1630
|
+
from natural_pdf.elements.text import ( # Ensure TextElement is imported for type hint if not in TYPE_CHECKING
|
1631
|
+
TextElement,
|
1632
|
+
)
|
1621
1633
|
|
1622
1634
|
new_text_elements: List["TextElement"] = []
|
1623
|
-
if not self.elements:
|
1635
|
+
if not self.elements: # Accesses self._elements via property
|
1624
1636
|
return ElementCollection([])
|
1625
1637
|
|
1626
1638
|
page_context_for_adding: Optional["Page"] = None
|
1627
1639
|
if add_to_page:
|
1628
1640
|
# Try to determine a consistent page context if adding elements
|
1629
1641
|
first_valid_region_with_page = next(
|
1630
|
-
(
|
1631
|
-
|
1642
|
+
(
|
1643
|
+
el
|
1644
|
+
for el in self.elements
|
1645
|
+
if isinstance(el, Region) and hasattr(el, "page") and el.page is not None
|
1646
|
+
),
|
1647
|
+
None,
|
1632
1648
|
)
|
1633
1649
|
if first_valid_region_with_page:
|
1634
1650
|
page_context_for_adding = first_valid_region_with_page.page
|
1635
1651
|
else:
|
1636
|
-
logger.warning(
|
1637
|
-
|
1652
|
+
logger.warning(
|
1653
|
+
"Cannot add TextElements to page: No valid Region with a page attribute found in collection, or first region's page is None."
|
1654
|
+
)
|
1655
|
+
add_to_page = False # Disable adding if no valid page context can be determined
|
1638
1656
|
|
1639
|
-
for element in self.elements:
|
1657
|
+
for element in self.elements: # Accesses self._elements via property/iterator
|
1640
1658
|
if isinstance(element, Region):
|
1641
1659
|
text_el = element.to_text_element(
|
1642
|
-
text_content=text_content_func,
|
1660
|
+
text_content=text_content_func,
|
1643
1661
|
source_label=source_label,
|
1644
1662
|
object_type=object_type,
|
1645
1663
|
default_font_size=default_font_size,
|
1646
1664
|
default_font_name=default_font_name,
|
1647
|
-
confidence=confidence
|
1665
|
+
confidence=confidence,
|
1648
1666
|
)
|
1649
1667
|
new_text_elements.append(text_el)
|
1650
1668
|
|
1651
1669
|
if add_to_page:
|
1652
|
-
if not hasattr(text_el,
|
1653
|
-
logger.warning(
|
1670
|
+
if not hasattr(text_el, "page") or text_el.page is None:
|
1671
|
+
logger.warning(
|
1672
|
+
f"TextElement created from region {element.bbox} has no page attribute. Cannot add to page."
|
1673
|
+
)
|
1654
1674
|
continue
|
1655
|
-
|
1675
|
+
|
1656
1676
|
if page_context_for_adding and text_el.page == page_context_for_adding:
|
1657
|
-
if
|
1658
|
-
|
1659
|
-
page_context_for_adding._element_mgr
|
1677
|
+
if (
|
1678
|
+
hasattr(page_context_for_adding, "_element_mgr")
|
1679
|
+
and page_context_for_adding._element_mgr is not None
|
1680
|
+
):
|
1681
|
+
add_as_type = (
|
1682
|
+
"words"
|
1683
|
+
if object_type == "word"
|
1684
|
+
else "chars" if object_type == "char" else object_type
|
1685
|
+
)
|
1686
|
+
page_context_for_adding._element_mgr.add_element(
|
1687
|
+
text_el, element_type=add_as_type
|
1688
|
+
)
|
1660
1689
|
else:
|
1661
|
-
page_num_str =
|
1662
|
-
|
1690
|
+
page_num_str = (
|
1691
|
+
str(page_context_for_adding.page_number)
|
1692
|
+
if hasattr(page_context_for_adding, "page_number")
|
1693
|
+
else "N/A"
|
1694
|
+
)
|
1695
|
+
logger.error(
|
1696
|
+
f"Page context for region {element.bbox} (Page {page_num_str}) is missing '_element_mgr'. Cannot add TextElement."
|
1697
|
+
)
|
1663
1698
|
elif page_context_for_adding and text_el.page != page_context_for_adding:
|
1664
|
-
current_page_num_str =
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1699
|
+
current_page_num_str = (
|
1700
|
+
str(text_el.page.page_number)
|
1701
|
+
if hasattr(text_el.page, "page_number")
|
1702
|
+
else "Unknown"
|
1703
|
+
)
|
1704
|
+
context_page_num_str = (
|
1705
|
+
str(page_context_for_adding.page_number)
|
1706
|
+
if hasattr(page_context_for_adding, "page_number")
|
1707
|
+
else "N/A"
|
1708
|
+
)
|
1709
|
+
logger.warning(
|
1710
|
+
f"TextElement for region {element.bbox} from page {current_page_num_str} "
|
1711
|
+
f"not added as it's different from collection's inferred page context {context_page_num_str}."
|
1712
|
+
)
|
1713
|
+
elif not page_context_for_adding:
|
1714
|
+
logger.warning(
|
1715
|
+
f"TextElement for region {element.bbox} created, but no page context was determined for adding."
|
1716
|
+
)
|
1670
1717
|
else:
|
1671
1718
|
logger.warning(f"Skipping element {type(element)}, not a Region.")
|
1672
|
-
|
1719
|
+
|
1673
1720
|
if add_to_page and page_context_for_adding:
|
1674
|
-
page_num_str =
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1721
|
+
page_num_str = (
|
1722
|
+
str(page_context_for_adding.page_number)
|
1723
|
+
if hasattr(page_context_for_adding, "page_number")
|
1724
|
+
else "N/A"
|
1725
|
+
)
|
1726
|
+
logger.info(
|
1727
|
+
f"Created and added {len(new_text_elements)} TextElements to page {page_num_str}."
|
1728
|
+
)
|
1729
|
+
elif add_to_page and not page_context_for_adding:
|
1730
|
+
logger.info(
|
1731
|
+
f"Created {len(new_text_elements)} TextElements, but could not add to page as page context was not determined or was inconsistent."
|
1732
|
+
)
|
1733
|
+
else: # add_to_page is False
|
1679
1734
|
logger.info(f"Created {len(new_text_elements)} TextElements (not added to page).")
|
1680
1735
|
|
1681
1736
|
return ElementCollection(new_text_elements)
|
1682
1737
|
|
1683
|
-
def trim(
|
1738
|
+
def trim(
|
1739
|
+
self,
|
1740
|
+
padding: int = 1,
|
1741
|
+
threshold: float = 0.95,
|
1742
|
+
resolution: float = 150,
|
1743
|
+
show_progress: bool = True,
|
1744
|
+
) -> "ElementCollection":
|
1684
1745
|
"""
|
1685
1746
|
Trim visual whitespace from each region in the collection.
|
1686
|
-
|
1747
|
+
|
1687
1748
|
Applies the trim() method to each element in the collection,
|
1688
1749
|
returning a new collection with the trimmed regions.
|
1689
|
-
|
1750
|
+
|
1690
1751
|
Args:
|
1691
1752
|
padding: Number of pixels to keep as padding after trimming (default: 1)
|
1692
1753
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
1693
1754
|
resolution: Resolution for image rendering in DPI (default: 150)
|
1694
1755
|
show_progress: Whether to show a progress bar for the trimming operation
|
1695
|
-
|
1756
|
+
|
1696
1757
|
Returns:
|
1697
1758
|
New ElementCollection with trimmed regions
|
1698
1759
|
"""
|
1699
1760
|
return self.apply(
|
1700
|
-
lambda element: element.trim(
|
1701
|
-
|
1761
|
+
lambda element: element.trim(
|
1762
|
+
padding=padding, threshold=threshold, resolution=resolution
|
1763
|
+
),
|
1764
|
+
show_progress=show_progress,
|
1702
1765
|
)
|
1703
1766
|
|
1704
1767
|
def clip(
|
@@ -1711,27 +1774,27 @@ class ElementCollection(
|
|
1711
1774
|
) -> "ElementCollection":
|
1712
1775
|
"""
|
1713
1776
|
Clip each element in the collection to the specified bounds.
|
1714
|
-
|
1777
|
+
|
1715
1778
|
This method applies the clip operation to each individual element,
|
1716
1779
|
returning a new collection with the clipped elements.
|
1717
|
-
|
1780
|
+
|
1718
1781
|
Args:
|
1719
1782
|
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1720
1783
|
left: Optional left boundary (x0) to clip to
|
1721
|
-
top: Optional top boundary to clip to
|
1784
|
+
top: Optional top boundary to clip to
|
1722
1785
|
right: Optional right boundary (x1) to clip to
|
1723
1786
|
bottom: Optional bottom boundary to clip to
|
1724
|
-
|
1787
|
+
|
1725
1788
|
Returns:
|
1726
1789
|
New ElementCollection containing the clipped elements
|
1727
|
-
|
1790
|
+
|
1728
1791
|
Examples:
|
1729
1792
|
# Clip each element to another region's bounds
|
1730
1793
|
clipped_elements = collection.clip(container_region)
|
1731
|
-
|
1794
|
+
|
1732
1795
|
# Clip each element to specific coordinates
|
1733
1796
|
clipped_elements = collection.clip(left=100, right=400)
|
1734
|
-
|
1797
|
+
|
1735
1798
|
# Mix object bounds with specific overrides
|
1736
1799
|
clipped_elements = collection.clip(obj=container, bottom=page.height/2)
|
1737
1800
|
"""
|
@@ -1740,6 +1803,8 @@ class ElementCollection(
|
|
1740
1803
|
)
|
1741
1804
|
|
1742
1805
|
|
1806
|
+
|
1807
|
+
|
1743
1808
|
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
1744
1809
|
"""
|
1745
1810
|
Represents a collection of Page objects, often from a single PDF document.
|