PyPI - natural-pdf - Versions diffs - 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl - Mend

natural-pdf 0.1.30-py3-none-any.whl → 0.1.31-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
 import logging
 import re
+from contextlib import contextmanager
 from itertools import groupby
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
     "color_value_min": 0.4,        # HSV V >
 }
+@contextmanager
+def disable_text_sync():
+    """
+    Temporarily disable text synchronization for performance.
+    While the context is active, the ``TextElement.text`` setter is replaced
+    by one that only writes ``self._obj["text"]``.
+    This is used when bulk-updating text content where character-level
+    synchronization is not needed, such as during bidi processing.
+    Fixes exponential recursion issue with Arabic/RTL text processing.
+    The replacement is made on the ``TextElement`` class itself, so it applies
+    to every instance until the ``with`` block exits. The original setter is
+    put back in a ``finally`` clause, so it is restored even if the body raises.
+    NOTE(review): because the patch is class-wide, concurrent use from several
+    threads looks unsafe -- confirm against callers.
+    """
+    # Save original setter
+    original_setter = TextElement.text.fset
+    # Create a fast setter that skips sync
+    def fast_setter(self, value):
+        # Write straight to the backing dict; nothing else is updated here.
+        self._obj["text"] = value
+        # Skip character synchronization for performance
+    # Apply fast setter (same getter, new setter, installed on the class)
+    TextElement.text = property(TextElement.text.fget, fast_setter)
+    try:
+        yield
+    finally:
+        # Restore original setter
+        # NOTE(review): the property is rebuilt from fget + the saved fset only;
+        # a deleter or docstring on the original property, if any, is not
+        # carried over. Saving and restoring the property object itself would
+        # avoid that -- confirm whether it matters here.
+        TextElement.text = property(TextElement.text.fget, original_setter)
 class NaturalWordExtractor(WordExtractor):
     """
     Custom WordExtractor that splits words based on specified character attributes
@@ -208,8 +236,7 @@ class ElementManager:
         yt = pdf_config.get("y_tolerance", 3)
         use_flow = pdf_config.get("use_text_flow", False)
-        # Define which attributes to preserve on the merged word object
-        # Should include split attributes + any others needed for filtering (like color)
+        # List of attributes to preserve on word objects
         attributes_to_preserve = list(
             set(
                 self._word_split_attributes
@@ -223,7 +250,7 @@ class ElementManager:
             )
         )
-        # -------------------------------------------------------------
+        # ------------------------------------------------------------------
         # NEW: Detect direction (LTR vs RTL) per visual line and feed
         #       pdfplumber's WordExtractor with the correct settings.
         # -------------------------------------------------------------
@@ -271,7 +298,9 @@ class ElementManager:
             # Build a WordExtractor tailored for this line's direction
             if is_rtl_line:
                 line_dir = "ttb"  # horizontal lines stacked top→bottom
-                char_dir = "rtl"  # characters right→left within the line
+                # Feed characters in right→left x-order; extractor can then treat
+                # them as left-to-right so that resulting text stays logical.
+                char_dir = "ltr"
             else:
                 line_dir = "ttb"
                 char_dir = "ltr"
@@ -288,9 +317,8 @@ class ElementManager:
             )
             # Prepare character sequence for the extractor:
-            #  • For LTR lines -> left→right order (x0 ascending)
-            #  • For RTL lines -> feed **reversed** list so that neighbouring
-            #    characters appear adjacent when the extractor walks right→left.
+            # Always feed characters in spatial order (x0 ascending)
+            # PDF stores glyphs in visual order, so this gives us the visual sequence
             line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
             try:
@@ -324,15 +352,18 @@ class ElementManager:
                 # on the whole-line heuristic.
                 rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
                 if rtl_in_word:
+                    # Convert from visual order (from PDF) to logical order using bidi
                     try:
                         from bidi.algorithm import get_display  # type: ignore
                         from natural_pdf.utils.bidi_mirror import mirror_brackets
-                        word_element.text = mirror_brackets(
-                            get_display(word_element.text, base_dir="R")
-                        )
+                        with disable_text_sync():
+                            # word_element.text is currently in visual order (from PDF)
+                            # Convert to logical order using bidi with an explicit LTR base direction (base_dir='L')
+                            logical_text = get_display(word_element.text, base_dir='L')
+                            # Apply bracket mirroring for logical order
+                            word_element.text = mirror_brackets(logical_text)
                     except Exception:
-                        # Fallback: keep original text if python-bidi fails
                         pass
         # ------------------------------------------------------------------
@@ -415,19 +446,6 @@ class ElementManager:
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )
-        # --- Post-processing pass to ensure every word containing RTL characters is
-        #     stored in logical order and with mirrored brackets.  This is a
-        #     safeguard in case the per-line loop above missed some tokens.
-        try:
-            from bidi.algorithm import get_display  # type: ignore
-            from natural_pdf.utils.bidi_mirror import mirror_brackets
-            for w in generated_words:
-                if any(_is_rtl_char(ch) for ch in w.text):
-                    w.text = mirror_brackets(get_display(w.text, base_dir="R"))
-        except Exception:
-            pass  # graceful degradation – keep original text
         # 4. Load other elements (rects, lines)
         rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
         line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
@@ -463,6 +481,8 @@ class ElementManager:
         logger.debug(f"Page {self._page.number}: Element loading complete.")
+        # If per-word BiDi was skipped, generated_words already stay in logical order.
     def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
         """
         Prepares a list of character dictionaries from native PDF characters,

natural_pdf/elements/text.py CHANGED Viewed

@@ -468,3 +468,32 @@ class TextElement(Element):
                 info[f"raw_{prop}"] = self._obj[prop]
         return info
+    @property
+    def visual_text(self) -> str:
+        """Return the text converted to *visual* order using the Unicode BiDi algorithm.
+        This helper is intentionally side-effect–free: it does **not** mutate
+        ``self.text`` or the underlying character dictionaries.  It should be
+        used by UI / rendering code that needs human-readable RTL/LTR mixing.
+        Returns:
+            ``self.text`` unchanged when it is empty, when it contains no
+            character whose bidirectional class is "R", "AL" or "AN", or when
+            the bidi conversion raises; otherwise the result of
+            ``mirror_brackets(get_display(text, base_dir="R"))``.
+        """
+        logical = self.text
+        # Empty / falsy text is returned as-is (no conversion attempted).
+        if not logical:
+            return logical
+        # Quick check – bail out if no RTL chars to save import/CPU.
+        import unicodedata
+        # Bidirectional classes checked: "R" (right-to-left), "AL" (Arabic
+        # letter), "AN" (Arabic number).
+        if not any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in logical):
+            return logical
+        try:
+            # Imported lazily so the property still works (via the fallback
+            # below) when these modules cannot be imported.
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+            # Convert from logical order to visual order
+            visual = get_display(logical, base_dir="R")
+            return mirror_brackets(visual)
+        except Exception:
+            # If python-bidi is missing or errors, fall back to logical order
+            return logical

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.30
+Version: 0.1.31
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/RECORD RENAMED Viewed

@@ -28,7 +28,7 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
 natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
 natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
-natural_pdf/core/element_manager.py,sha256=96v_w3kXhSUqRsJlX5Bl6O6hJzpYRqDn4xoyRsdqZ7o,49260
+natural_pdf/core/element_manager.py,sha256=Mn4cYqPL_2LD_GK9lf2duExaJF1qhASCKsOdAZdQb00,49821
 natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
 natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
 natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
@@ -44,7 +44,7 @@ natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI
 natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
 natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
-natural_pdf/elements/text.py,sha256=2neapKplef0FsAMYWr4OdICt-TmrZ3z9z0YBrX8FrSk,17738
+natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
 natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
@@ -100,7 +100,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
 natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.30.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.31.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
 optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
 optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
@@ -115,8 +115,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
 tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
 tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
 tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
-natural_pdf-0.1.30.dist-info/METADATA,sha256=4Jg-iXXt6zGNE4gSYE_nMF395JDzv1Dierh93x1Lklo,6711
-natural_pdf-0.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.30.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.1.30.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
-natural_pdf-0.1.30.dist-info/RECORD,,
+natural_pdf-0.1.31.dist-info/METADATA,sha256=tqimu2ZReyYu5pS0PsbCo-Z9fIzkpMj1ljGPNbaOFss,6711
+natural_pdf-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.31.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.1.31.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
+natural_pdf-0.1.31.dist-info/RECORD,,

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl

natural-pdf 0.1.30-py3-none-any.whl → 0.1.31-py3-none-any.whl