PyPI - natural-pdf - Versions diffs - 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl - Mend

natural-pdf 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

natural_pdf/elements/element_collection.py CHANGED Viewed

@@ -633,9 +633,7 @@ class ElementCollection(
         pdfplumber's layout engine if layout=True is specified.
         Args:
-            separator: String to insert between text from different elements when
-                      using simple joining (layout=False). Default is a single space.
-                      Ignored when layout=True as the layout engine handles spacing.
+            separator: String to join text from elements. Default is a single space.
             preserve_whitespace: Deprecated. Use layout=False for simple joining.
             use_exclusions: Deprecated. Exclusions should be applied *before* creating
                           the collection or by filtering the collection itself.
@@ -652,15 +650,49 @@ class ElementCollection(
         Returns:
             Combined text from elements, potentially with layout-based spacing.
         """
-        # Filter to just TextElements that likely have _char_dicts
-        text_elements = [
+        # Check if we have any elements at all
+        if not self._elements:
+            return ""
+        # Check if all elements are TextElements with character data
+        text_elements_with_chars = [
             el
             for el in self._elements
-            if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
+            if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
         ]
-        if not text_elements:
-            return ""
+        # If we have a mixed collection (Regions, TextElements without chars, etc),
+        # use a simpler approach: call extract_text on each element
+        if len(text_elements_with_chars) < len(self._elements):
+            # Mixed collection - extract text from each element
+            element_texts = []
+            # Sort elements by position first
+            sorted_elements = sorted(
+                self._elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
+            )
+            for el in sorted_elements:
+                if hasattr(el, "extract_text"):
+                    # Call extract_text on the element (works for TextElement, Region, etc)
+                    text = el.extract_text(**kwargs)
+                    if text:
+                        element_texts.append(text)
+                elif hasattr(el, "text"):
+                    # Fallback to text property if available
+                    text = getattr(el, "text", "")
+                    if text:
+                        element_texts.append(text)
+            return separator.join(element_texts)
+        # All elements are TextElements with char data - use the original approach
+        text_elements = text_elements_with_chars
         # Collect all character dictionaries
         all_char_dicts = []
@@ -669,11 +701,20 @@ class ElementCollection(
         if not all_char_dicts:
             # Handle case where elements exist but have no char dicts
-            logger.warning(
+            logger.debug(
                 "ElementCollection.extract_text: No character dictionaries found in TextElements."
             )
+            # Sort elements by position before joining
+            sorted_text_elements = sorted(
+                text_elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
+            )
             return separator.join(
-                getattr(el, "text", "") for el in text_elements
+                getattr(el, "text", "") for el in sorted_text_elements
             )  # Fallback to simple join of word text
         # Apply content filtering if provided
@@ -737,33 +778,20 @@ class ElementCollection(
                 all_char_dicts.sort(
                     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
                 )
-                result = separator.join(c.get("text", "") for c in all_char_dicts)
+                result = " ".join(c.get("text", "") for c in all_char_dicts)
         else:
+            print("JOIN WITHOUT LAYOUT")
             # Default: Simple join without layout
             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
-            # Instead of joining all characters individually, we need to:
-            # 1. Extract text from each element
-            # 2. Join the element texts with the separator
-            # Sort elements by document order (page, top, x0)
-            sorted_elements = sorted(
-                text_elements,
-                key=lambda el: (
-                    el.page.index if hasattr(el, "page") else 0,
-                    el.top if hasattr(el, "top") else 0,
-                    el.x0 if hasattr(el, "x0") else 0,
-                ),
-            )
-            # Extract text from each element
-            element_texts = []
-            for el in sorted_elements:
-                if hasattr(el, "text") and el.text:
-                    element_texts.append(el.text)
-            result = separator.join(element_texts)
+            result = separator.join(el.extract_text() for el in text_elements)
+            # # Sort chars by document order (page, top, x0)
+            # all_char_dicts.sort(
+            #     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+            # )
+            # # Simple join of character text
+            # result = "".join(c.get("text", "") for c in all_char_dicts)
         # Determine final strip flag – same rule as global helper unless caller overrides
         strip_text = strip if strip is not None else (not use_layout)

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.8
+Version: 0.2.9
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/RECORD RENAMED Viewed

@@ -40,7 +40,7 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
 natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
 natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
 natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
-natural_pdf/elements/element_collection.py,sha256=-piFQGiDPiqmnl-Cpoi3PGPmGe4AYvpl0IqaJGxBsBc,129405
+natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
 natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
@@ -107,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
 natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.2.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.2.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
 optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
 optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -124,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.2.8.dist-info/METADATA,sha256=tuWXV-mY9zU0qsVsXhrrp3aGBfSxlklUxS_Dlllqmp4,6959
-natural_pdf-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.2.8.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.2.8.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
-natural_pdf-0.2.8.dist-info/RECORD,,
+natural_pdf-0.2.9.dist-info/METADATA,sha256=Uekld9I1IAGvJnMbiMCyrIB9iKQNcqScD9h6aMYBQAE,6959
+natural_pdf-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.2.9.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.2.9.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
+natural_pdf-0.2.9.dist-info/RECORD,,

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

natural-pdf 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl