natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
21
21
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
22
22
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
23
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
+ from natural_pdf.text_mixin import TextMixin
24
25
 
25
26
  # ------------------------------------------------------------------
26
27
  # Table utilities
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)
56
57
 
57
58
 
58
59
  class Region(
59
- DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
60
+ TextMixin,
61
+ DirectionalMixin,
62
+ ClassificationMixin,
63
+ ExtractionMixin,
64
+ ShapeDetectionMixin,
65
+ DescribeMixin,
60
66
  ):
61
67
  """Represents a rectangular region on a page.
62
68
 
@@ -1610,14 +1616,71 @@ class Region(
1610
1616
  table_settings.setdefault("join_x_tolerance", join)
1611
1617
  table_settings.setdefault("join_y_tolerance", join)
1612
1618
 
1613
- # Create a crop of the page for this region
1614
- cropped = self.page._page.crop(self.bbox)
1619
+ # -------------------------------------------------------------
1620
+ # Apply char-level exclusion filtering, if any exclusions are
1621
+ # defined on the parent Page. We create a lightweight
1622
+ # pdfplumber.Page copy whose .chars list omits characters that
1623
+ # fall inside any exclusion Region. Other object types are
1624
+ # left untouched for now ("chars-only" strategy).
1625
+ # -------------------------------------------------------------
1626
+ base_plumber_page = self.page._page
1627
+
1628
+ if getattr(self.page, "_exclusions", None):
1629
+ # Resolve exclusion Regions (callables already evaluated)
1630
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
1631
+
1632
+ def _keep_char(obj):
1633
+ """Return True if pdfplumber obj should be kept."""
1634
+ if obj.get("object_type") != "char":
1635
+ # Keep non-char objects unchanged – lattice grids etc.
1636
+ return True
1637
+
1638
+ # Compute character centre point
1639
+ cx = (obj["x0"] + obj["x1"]) / 2.0
1640
+ cy = (obj["top"] + obj["bottom"]) / 2.0
1641
+
1642
+ # Reject if the centre lies inside ANY exclusion Region
1643
+ for reg in exclusion_regions:
1644
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
1645
+ return False
1646
+ return True
1647
+
1648
+ try:
1649
+ filtered_page = base_plumber_page.filter(_keep_char)
1650
+ except Exception as _filter_err:
1651
+ # Fallback – if filtering fails, log and proceed unfiltered
1652
+ logger.warning(
1653
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
1654
+ )
1655
+ filtered_page = base_plumber_page
1656
+ else:
1657
+ filtered_page = base_plumber_page
1658
+
1659
+ cropped = filtered_page.crop(self.bbox)
1615
1660
 
1616
1661
  # Extract all tables from the cropped area
1617
1662
  tables = cropped.extract_tables(table_settings)
1618
1663
 
1619
- # Return the tables or an empty list if none found
1620
- return tables if tables else []
1664
+ # Apply RTL text processing to all tables
1665
+ if tables:
1666
+ processed_tables = []
1667
+ for table in tables:
1668
+ processed_table = []
1669
+ for row in table:
1670
+ processed_row = []
1671
+ for cell in row:
1672
+ if cell is not None:
1673
+ # Apply RTL text processing to each cell
1674
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1675
+ processed_row.append(rtl_processed_cell)
1676
+ else:
1677
+ processed_row.append(cell)
1678
+ processed_table.append(processed_row)
1679
+ processed_tables.append(processed_table)
1680
+ return processed_tables
1681
+
1682
+ # Return empty list if no tables found
1683
+ return []
1621
1684
 
1622
1685
  def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
1623
1686
  """
@@ -1654,29 +1717,63 @@ class Region(
1654
1717
  if y_tol is not None:
1655
1718
  table_settings.setdefault("text_y_tolerance", y_tol)
1656
1719
 
1657
- # Create a crop of the page for this region
1658
- cropped = self.page._page.crop(self.bbox)
1720
+ # -------------------------------------------------------------
1721
+ # Apply char-level exclusion filtering (chars only) just like in
1722
+ # _extract_tables_plumber so header/footer text does not appear
1723
+ # in extracted tables.
1724
+ # -------------------------------------------------------------
1725
+ base_plumber_page = self.page._page
1726
+
1727
+ if getattr(self.page, "_exclusions", None):
1728
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
1729
+
1730
+ def _keep_char(obj):
1731
+ if obj.get("object_type") != "char":
1732
+ return True
1733
+ cx = (obj["x0"] + obj["x1"]) / 2.0
1734
+ cy = (obj["top"] + obj["bottom"]) / 2.0
1735
+ for reg in exclusion_regions:
1736
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
1737
+ return False
1738
+ return True
1739
+
1740
+ try:
1741
+ filtered_page = base_plumber_page.filter(_keep_char)
1742
+ except Exception as _filter_err:
1743
+ logger.warning(
1744
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
1745
+ )
1746
+ filtered_page = base_plumber_page
1747
+ else:
1748
+ filtered_page = base_plumber_page
1749
+
1750
+ # Now crop the (possibly filtered) page to the region bbox
1751
+ cropped = filtered_page.crop(self.bbox)
1659
1752
 
1660
1753
  # Extract the single largest table from the cropped area
1661
1754
  table = cropped.extract_table(table_settings)
1662
1755
 
1663
1756
  # Return the table or an empty list if none found
1664
1757
  if table:
1665
- # Apply content filtering if provided
1666
- if content_filter is not None:
1667
- filtered_table = []
1668
- for row in table:
1669
- filtered_row = []
1670
- for cell in row:
1671
- if cell is not None:
1672
- # Apply content filter to cell text
1673
- filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
1674
- filtered_row.append(filtered_cell)
1758
+ # Apply RTL text processing and content filtering if provided
1759
+ processed_table = []
1760
+ for row in table:
1761
+ processed_row = []
1762
+ for cell in row:
1763
+ if cell is not None:
1764
+ # Apply RTL text processing first
1765
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1766
+
1767
+ # Then apply content filter if provided
1768
+ if content_filter is not None:
1769
+ filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
1770
+ processed_row.append(filtered_cell)
1675
1771
  else:
1676
- filtered_row.append(cell)
1677
- filtered_table.append(filtered_row)
1678
- return filtered_table
1679
- return table
1772
+ processed_row.append(rtl_processed_cell)
1773
+ else:
1774
+ processed_row.append(cell)
1775
+ processed_table.append(processed_row)
1776
+ return processed_table
1680
1777
  return []
1681
1778
 
1682
1779
  def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
@@ -2985,45 +3082,20 @@ class Region(
2985
3082
  source_info = f" source='{self.source}'" if self.source else ""
2986
3083
  return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
2987
3084
 
2988
- def correct_ocr(
3085
+ def update_text(
2989
3086
  self,
2990
- correction_callback: Callable[[Any], Optional[str]],
2991
- ) -> "Region": # Return self for chaining
2992
- """
2993
- Applies corrections to OCR-generated text elements within this region
2994
- using a user-provided callback function.
2995
-
2996
- Finds text elements within this region whose 'source' attribute starts
2997
- with 'ocr' and calls the `correction_callback` for each, passing the
2998
- element itself.
2999
-
3000
- The `correction_callback` should contain the logic to:
3001
- 1. Determine if the element needs correction.
3002
- 2. Perform the correction (e.g., call an LLM).
3003
- 3. Return the new text (`str`) or `None`.
3004
-
3005
- If the callback returns a string, the element's `.text` is updated.
3006
- Metadata updates (source, confidence, etc.) should happen within the callback.
3007
-
3008
- Args:
3009
- correction_callback: A function accepting an element and returning
3010
- `Optional[str]` (new text or None).
3087
+ transform: Callable[[Any], Optional[str]],
3088
+ *,
3089
+ selector: str = "text",
3090
+ apply_exclusions: bool = False,
3091
+ ) -> "Region":
3092
+ """Apply *transform* to every text element matched by *selector* inside this region.
3011
3093
 
3012
- Returns:
3013
- Self for method chaining.
3094
+ The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
3095
+ override simply ensures the search is scoped to the region.
3014
3096
  """
3015
- # Find OCR elements specifically within this region
3016
- # Note: We typically want to correct even if the element falls in an excluded area
3017
- target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
3018
3097
 
3019
- # Delegate to the utility function
3020
- _apply_ocr_correction_to_elements(
3021
- elements=target_elements, # Pass the ElementCollection directly
3022
- correction_callback=correction_callback,
3023
- caller_info=f"Region({self.bbox})", # Pass caller info
3024
- )
3025
-
3026
- return self # Return self for chaining
3098
+ return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
3027
3099
 
3028
3100
  # --- Classification Mixin Implementation --- #
3029
3101
  def _get_classification_manager(self) -> "ClassificationManager":
@@ -3490,6 +3562,54 @@ class Region(
3490
3562
 
3491
3563
  return table_grid
3492
3564
 
3565
+ def _apply_rtl_processing_to_text(self, text: str) -> str:
3566
+ """
3567
+ Apply RTL (Right-to-Left) text processing to a string.
3568
+
3569
+ This converts visual order text (as stored in PDFs) to logical order
3570
+ for proper display of Arabic, Hebrew, and other RTL scripts.
3571
+
3572
+ Args:
3573
+ text: Input text string in visual order
3574
+
3575
+ Returns:
3576
+ Text string in logical order
3577
+ """
3578
+ if not text or not text.strip():
3579
+ return text
3580
+
3581
+ # Quick check for RTL characters - if none found, return as-is
3582
+ import unicodedata
3583
+
3584
+ def _contains_rtl(s):
3585
+ return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
3586
+
3587
+ if not _contains_rtl(text):
3588
+ return text
3589
+
3590
+ try:
3591
+ from bidi.algorithm import get_display # type: ignore
3592
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
3593
+
3594
+ # Apply BiDi algorithm to convert from visual to logical order
3595
+ # Process line by line to handle mixed content properly
3596
+ processed_lines = []
3597
+ for line in text.split("\n"):
3598
+ if line.strip():
3599
+ # Determine base direction for this line
3600
+ base_dir = "R" if _contains_rtl(line) else "L"
3601
+ logical_line = get_display(line, base_dir=base_dir)
3602
+ # Apply bracket mirroring for correct logical order
3603
+ processed_lines.append(mirror_brackets(logical_line))
3604
+ else:
3605
+ processed_lines.append(line)
3606
+
3607
+ return "\n".join(processed_lines)
3608
+
3609
+ except (ImportError, Exception):
3610
+ # If bidi library is not available or fails, return original text
3611
+ return text
3612
+
3493
3613
  def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
3494
3614
  """
3495
3615
  Apply content filter to a text string.
@@ -73,6 +73,31 @@ class FlowElement:
73
73
  """Returns the physical page of the underlying element."""
74
74
  return getattr(self.physical_object, "page", None)
75
75
 
76
+ def __getattr__(self, name: str) -> Any:
77
+ """
78
+ Delegate unknown attribute access to the physical_object.
79
+
80
+ This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
81
+ from the physical element are accessible on the FlowElement wrapper.
82
+
83
+ Args:
84
+ name: The attribute name being accessed
85
+
86
+ Returns:
87
+ The attribute value from physical_object
88
+
89
+ Raises:
90
+ AttributeError: If the attribute doesn't exist on physical_object either
91
+ """
92
+ try:
93
+ return getattr(self.physical_object, name)
94
+ except AttributeError:
95
+ # Provide a helpful error message that mentions both FlowElement and physical_object
96
+ raise AttributeError(
97
+ f"'{type(self).__name__}' object has no attribute '{name}' "
98
+ f"(also not found on underlying {type(self.physical_object).__name__})"
99
+ )
100
+
76
101
  def _flow_direction(
77
102
  self,
78
103
  direction: str, # "above", "below", "left", "right"