natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +90 -22
- natural_pdf/core/pdf.py +183 -59
- natural_pdf/elements/collections.py +202 -47
- natural_pdf/elements/region.py +176 -56
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
|
21
21
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
22
22
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
23
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
+
from natural_pdf.text_mixin import TextMixin
|
24
25
|
|
25
26
|
# ------------------------------------------------------------------
|
26
27
|
# Table utilities
|
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)
|
|
56
57
|
|
57
58
|
|
58
59
|
class Region(
|
59
|
-
|
60
|
+
TextMixin,
|
61
|
+
DirectionalMixin,
|
62
|
+
ClassificationMixin,
|
63
|
+
ExtractionMixin,
|
64
|
+
ShapeDetectionMixin,
|
65
|
+
DescribeMixin,
|
60
66
|
):
|
61
67
|
"""Represents a rectangular region on a page.
|
62
68
|
|
@@ -1610,14 +1616,71 @@ class Region(
|
|
1610
1616
|
table_settings.setdefault("join_x_tolerance", join)
|
1611
1617
|
table_settings.setdefault("join_y_tolerance", join)
|
1612
1618
|
|
1613
|
-
#
|
1614
|
-
|
1619
|
+
# -------------------------------------------------------------
|
1620
|
+
# Apply char-level exclusion filtering, if any exclusions are
|
1621
|
+
# defined on the parent Page. We create a lightweight
|
1622
|
+
# pdfplumber.Page copy whose .chars list omits characters that
|
1623
|
+
# fall inside any exclusion Region. Other object types are
|
1624
|
+
# left untouched for now ("chars-only" strategy).
|
1625
|
+
# -------------------------------------------------------------
|
1626
|
+
base_plumber_page = self.page._page
|
1627
|
+
|
1628
|
+
if getattr(self.page, "_exclusions", None):
|
1629
|
+
# Resolve exclusion Regions (callables already evaluated)
|
1630
|
+
exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
|
1631
|
+
|
1632
|
+
def _keep_char(obj):
|
1633
|
+
"""Return True if pdfplumber obj should be kept."""
|
1634
|
+
if obj.get("object_type") != "char":
|
1635
|
+
# Keep non-char objects unchanged – lattice grids etc.
|
1636
|
+
return True
|
1637
|
+
|
1638
|
+
# Compute character centre point
|
1639
|
+
cx = (obj["x0"] + obj["x1"]) / 2.0
|
1640
|
+
cy = (obj["top"] + obj["bottom"]) / 2.0
|
1641
|
+
|
1642
|
+
# Reject if the centre lies inside ANY exclusion Region
|
1643
|
+
for reg in exclusion_regions:
|
1644
|
+
if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
|
1645
|
+
return False
|
1646
|
+
return True
|
1647
|
+
|
1648
|
+
try:
|
1649
|
+
filtered_page = base_plumber_page.filter(_keep_char)
|
1650
|
+
except Exception as _filter_err:
|
1651
|
+
# Fallback – if filtering fails, log and proceed unfiltered
|
1652
|
+
logger.warning(
|
1653
|
+
f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
|
1654
|
+
)
|
1655
|
+
filtered_page = base_plumber_page
|
1656
|
+
else:
|
1657
|
+
filtered_page = base_plumber_page
|
1658
|
+
|
1659
|
+
cropped = filtered_page.crop(self.bbox)
|
1615
1660
|
|
1616
1661
|
# Extract all tables from the cropped area
|
1617
1662
|
tables = cropped.extract_tables(table_settings)
|
1618
1663
|
|
1619
|
-
#
|
1620
|
-
|
1664
|
+
# Apply RTL text processing to all tables
|
1665
|
+
if tables:
|
1666
|
+
processed_tables = []
|
1667
|
+
for table in tables:
|
1668
|
+
processed_table = []
|
1669
|
+
for row in table:
|
1670
|
+
processed_row = []
|
1671
|
+
for cell in row:
|
1672
|
+
if cell is not None:
|
1673
|
+
# Apply RTL text processing to each cell
|
1674
|
+
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1675
|
+
processed_row.append(rtl_processed_cell)
|
1676
|
+
else:
|
1677
|
+
processed_row.append(cell)
|
1678
|
+
processed_table.append(processed_row)
|
1679
|
+
processed_tables.append(processed_table)
|
1680
|
+
return processed_tables
|
1681
|
+
|
1682
|
+
# Return empty list if no tables found
|
1683
|
+
return []
|
1621
1684
|
|
1622
1685
|
def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
|
1623
1686
|
"""
|
@@ -1654,29 +1717,63 @@ class Region(
|
|
1654
1717
|
if y_tol is not None:
|
1655
1718
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
1656
1719
|
|
1657
|
-
#
|
1658
|
-
|
1720
|
+
# -------------------------------------------------------------
|
1721
|
+
# Apply char-level exclusion filtering (chars only) just like in
|
1722
|
+
# _extract_tables_plumber so header/footer text does not appear
|
1723
|
+
# in extracted tables.
|
1724
|
+
# -------------------------------------------------------------
|
1725
|
+
base_plumber_page = self.page._page
|
1726
|
+
|
1727
|
+
if getattr(self.page, "_exclusions", None):
|
1728
|
+
exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
|
1729
|
+
|
1730
|
+
def _keep_char(obj):
|
1731
|
+
if obj.get("object_type") != "char":
|
1732
|
+
return True
|
1733
|
+
cx = (obj["x0"] + obj["x1"]) / 2.0
|
1734
|
+
cy = (obj["top"] + obj["bottom"]) / 2.0
|
1735
|
+
for reg in exclusion_regions:
|
1736
|
+
if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
|
1737
|
+
return False
|
1738
|
+
return True
|
1739
|
+
|
1740
|
+
try:
|
1741
|
+
filtered_page = base_plumber_page.filter(_keep_char)
|
1742
|
+
except Exception as _filter_err:
|
1743
|
+
logger.warning(
|
1744
|
+
f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
|
1745
|
+
)
|
1746
|
+
filtered_page = base_plumber_page
|
1747
|
+
else:
|
1748
|
+
filtered_page = base_plumber_page
|
1749
|
+
|
1750
|
+
# Now crop the (possibly filtered) page to the region bbox
|
1751
|
+
cropped = filtered_page.crop(self.bbox)
|
1659
1752
|
|
1660
1753
|
# Extract the single largest table from the cropped area
|
1661
1754
|
table = cropped.extract_table(table_settings)
|
1662
1755
|
|
1663
1756
|
# Return the table or an empty list if none found
|
1664
1757
|
if table:
|
1665
|
-
# Apply content filtering if provided
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1758
|
+
# Apply RTL text processing and content filtering if provided
|
1759
|
+
processed_table = []
|
1760
|
+
for row in table:
|
1761
|
+
processed_row = []
|
1762
|
+
for cell in row:
|
1763
|
+
if cell is not None:
|
1764
|
+
# Apply RTL text processing first
|
1765
|
+
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1766
|
+
|
1767
|
+
# Then apply content filter if provided
|
1768
|
+
if content_filter is not None:
|
1769
|
+
filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
|
1770
|
+
processed_row.append(filtered_cell)
|
1675
1771
|
else:
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1679
|
-
|
1772
|
+
processed_row.append(rtl_processed_cell)
|
1773
|
+
else:
|
1774
|
+
processed_row.append(cell)
|
1775
|
+
processed_table.append(processed_row)
|
1776
|
+
return processed_table
|
1680
1777
|
return []
|
1681
1778
|
|
1682
1779
|
def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
|
@@ -2985,45 +3082,20 @@ class Region(
|
|
2985
3082
|
source_info = f" source='{self.source}'" if self.source else ""
|
2986
3083
|
return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
|
2987
3084
|
|
2988
|
-
def
|
3085
|
+
def update_text(
|
2989
3086
|
self,
|
2990
|
-
|
2991
|
-
|
2992
|
-
""
|
2993
|
-
|
2994
|
-
|
2995
|
-
|
2996
|
-
Finds text elements within this region whose 'source' attribute starts
|
2997
|
-
with 'ocr' and calls the `correction_callback` for each, passing the
|
2998
|
-
element itself.
|
2999
|
-
|
3000
|
-
The `correction_callback` should contain the logic to:
|
3001
|
-
1. Determine if the element needs correction.
|
3002
|
-
2. Perform the correction (e.g., call an LLM).
|
3003
|
-
3. Return the new text (`str`) or `None`.
|
3004
|
-
|
3005
|
-
If the callback returns a string, the element's `.text` is updated.
|
3006
|
-
Metadata updates (source, confidence, etc.) should happen within the callback.
|
3007
|
-
|
3008
|
-
Args:
|
3009
|
-
correction_callback: A function accepting an element and returning
|
3010
|
-
`Optional[str]` (new text or None).
|
3087
|
+
transform: Callable[[Any], Optional[str]],
|
3088
|
+
*,
|
3089
|
+
selector: str = "text",
|
3090
|
+
apply_exclusions: bool = False,
|
3091
|
+
) -> "Region":
|
3092
|
+
"""Apply *transform* to every text element matched by *selector* inside this region.
|
3011
3093
|
|
3012
|
-
|
3013
|
-
|
3094
|
+
The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
|
3095
|
+
override simply ensures the search is scoped to the region.
|
3014
3096
|
"""
|
3015
|
-
# Find OCR elements specifically within this region
|
3016
|
-
# Note: We typically want to correct even if the element falls in an excluded area
|
3017
|
-
target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
|
3018
3097
|
|
3019
|
-
|
3020
|
-
_apply_ocr_correction_to_elements(
|
3021
|
-
elements=target_elements, # Pass the ElementCollection directly
|
3022
|
-
correction_callback=correction_callback,
|
3023
|
-
caller_info=f"Region({self.bbox})", # Pass caller info
|
3024
|
-
)
|
3025
|
-
|
3026
|
-
return self # Return self for chaining
|
3098
|
+
return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
|
3027
3099
|
|
3028
3100
|
# --- Classification Mixin Implementation --- #
|
3029
3101
|
def _get_classification_manager(self) -> "ClassificationManager":
|
@@ -3490,6 +3562,54 @@ class Region(
|
|
3490
3562
|
|
3491
3563
|
return table_grid
|
3492
3564
|
|
3565
|
+
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3566
|
+
"""
|
3567
|
+
Apply RTL (Right-to-Left) text processing to a string.
|
3568
|
+
|
3569
|
+
This converts visual order text (as stored in PDFs) to logical order
|
3570
|
+
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3571
|
+
|
3572
|
+
Args:
|
3573
|
+
text: Input text string in visual order
|
3574
|
+
|
3575
|
+
Returns:
|
3576
|
+
Text string in logical order
|
3577
|
+
"""
|
3578
|
+
if not text or not text.strip():
|
3579
|
+
return text
|
3580
|
+
|
3581
|
+
# Quick check for RTL characters - if none found, return as-is
|
3582
|
+
import unicodedata
|
3583
|
+
|
3584
|
+
def _contains_rtl(s):
|
3585
|
+
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3586
|
+
|
3587
|
+
if not _contains_rtl(text):
|
3588
|
+
return text
|
3589
|
+
|
3590
|
+
try:
|
3591
|
+
from bidi.algorithm import get_display # type: ignore
|
3592
|
+
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3593
|
+
|
3594
|
+
# Apply BiDi algorithm to convert from visual to logical order
|
3595
|
+
# Process line by line to handle mixed content properly
|
3596
|
+
processed_lines = []
|
3597
|
+
for line in text.split("\n"):
|
3598
|
+
if line.strip():
|
3599
|
+
# Determine base direction for this line
|
3600
|
+
base_dir = "R" if _contains_rtl(line) else "L"
|
3601
|
+
logical_line = get_display(line, base_dir=base_dir)
|
3602
|
+
# Apply bracket mirroring for correct logical order
|
3603
|
+
processed_lines.append(mirror_brackets(logical_line))
|
3604
|
+
else:
|
3605
|
+
processed_lines.append(line)
|
3606
|
+
|
3607
|
+
return "\n".join(processed_lines)
|
3608
|
+
|
3609
|
+
except (ImportError, Exception):
|
3610
|
+
# If bidi library is not available or fails, return original text
|
3611
|
+
return text
|
3612
|
+
|
3493
3613
|
def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
|
3494
3614
|
"""
|
3495
3615
|
Apply content filter to a text string.
|
natural_pdf/flows/element.py
CHANGED
@@ -73,6 +73,31 @@ class FlowElement:
|
|
73
73
|
"""Returns the physical page of the underlying element."""
|
74
74
|
return getattr(self.physical_object, "page", None)
|
75
75
|
|
76
|
+
def __getattr__(self, name: str) -> Any:
|
77
|
+
"""
|
78
|
+
Delegate unknown attribute access to the physical_object.
|
79
|
+
|
80
|
+
This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
|
81
|
+
from the physical element are accessible on the FlowElement wrapper.
|
82
|
+
|
83
|
+
Args:
|
84
|
+
name: The attribute name being accessed
|
85
|
+
|
86
|
+
Returns:
|
87
|
+
The attribute value from physical_object
|
88
|
+
|
89
|
+
Raises:
|
90
|
+
AttributeError: If the attribute doesn't exist on physical_object either
|
91
|
+
"""
|
92
|
+
try:
|
93
|
+
return getattr(self.physical_object, name)
|
94
|
+
except AttributeError:
|
95
|
+
# Provide a helpful error message that mentions both FlowElement and physical_object
|
96
|
+
raise AttributeError(
|
97
|
+
f"'{type(self).__name__}' object has no attribute '{name}' "
|
98
|
+
f"(also not found on underlying {type(self.physical_object).__name__})"
|
99
|
+
)
|
100
|
+
|
76
101
|
def _flow_direction(
|
77
102
|
self,
|
78
103
|
direction: str, # "above", "below", "left", "right"
|