natural-pdf 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -633,9 +633,7 @@ class ElementCollection(
633
633
  pdfplumber's layout engine if layout=True is specified.
634
634
 
635
635
  Args:
636
- separator: String to insert between text from different elements when
637
- using simple joining (layout=False). Default is a single space.
638
- Ignored when layout=True as the layout engine handles spacing.
636
+ separator: String to join text from elements. Default is a single space.
639
637
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
640
638
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
641
639
  the collection or by filtering the collection itself.
@@ -652,15 +650,49 @@ class ElementCollection(
652
650
  Returns:
653
651
  Combined text from elements, potentially with layout-based spacing.
654
652
  """
655
- # Filter to just TextElements that likely have _char_dicts
656
- text_elements = [
653
+ # Check if we have any elements at all
654
+ if not self._elements:
655
+ return ""
656
+
657
+ # Check if all elements are TextElements with character data
658
+ text_elements_with_chars = [
657
659
  el
658
660
  for el in self._elements
659
- if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
661
+ if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
660
662
  ]
661
663
 
662
- if not text_elements:
663
- return ""
664
+ # If we have a mixed collection (Regions, TextElements without chars, etc),
665
+ # use a simpler approach: call extract_text on each element
666
+ if len(text_elements_with_chars) < len(self._elements):
667
+ # Mixed collection - extract text from each element
668
+ element_texts = []
669
+
670
+ # Sort elements by position first
671
+ sorted_elements = sorted(
672
+ self._elements,
673
+ key=lambda el: (
674
+ el.page.index if hasattr(el, "page") else 0,
675
+ el.top if hasattr(el, "top") else 0,
676
+ el.x0 if hasattr(el, "x0") else 0,
677
+ ),
678
+ )
679
+
680
+ for el in sorted_elements:
681
+ if hasattr(el, "extract_text"):
682
+ # Call extract_text on the element (works for TextElement, Region, etc)
683
+ text = el.extract_text(**kwargs)
684
+ if text:
685
+ element_texts.append(text)
686
+ elif hasattr(el, "text"):
687
+ # Fallback to text property if available
688
+ text = getattr(el, "text", "")
689
+ if text:
690
+ element_texts.append(text)
691
+
692
+ return separator.join(element_texts)
693
+
694
+ # All elements are TextElements with char data - use the original approach
695
+ text_elements = text_elements_with_chars
664
696
 
665
697
  # Collect all character dictionaries
666
698
  all_char_dicts = []
@@ -669,11 +701,20 @@ class ElementCollection(
669
701
 
670
702
  if not all_char_dicts:
671
703
  # Handle case where elements exist but have no char dicts
672
- logger.warning(
704
+ logger.debug(
673
705
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
674
706
  )
707
+ # Sort elements by position before joining
708
+ sorted_text_elements = sorted(
709
+ text_elements,
710
+ key=lambda el: (
711
+ el.page.index if hasattr(el, "page") else 0,
712
+ el.top if hasattr(el, "top") else 0,
713
+ el.x0 if hasattr(el, "x0") else 0,
714
+ ),
715
+ )
675
716
  return separator.join(
676
- getattr(el, "text", "") for el in text_elements
717
+ getattr(el, "text", "") for el in sorted_text_elements
677
718
  ) # Fallback to simple join of word text
678
719
 
679
720
  # Apply content filtering if provided
@@ -737,33 +778,20 @@ class ElementCollection(
737
778
  all_char_dicts.sort(
738
779
  key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
739
780
  )
740
- result = separator.join(c.get("text", "") for c in all_char_dicts)
781
+ result = " ".join(c.get("text", "") for c in all_char_dicts)
741
782
 
742
783
  else:
784
+ print("JOIN WITHOUT LAYOUT")
743
785
  # Default: Simple join without layout
744
786
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
745
-
746
- # Instead of joining all characters individually, we need to:
747
- # 1. Extract text from each element
748
- # 2. Join the element texts with the separator
749
-
750
- # Sort elements by document order (page, top, x0)
751
- sorted_elements = sorted(
752
- text_elements,
753
- key=lambda el: (
754
- el.page.index if hasattr(el, "page") else 0,
755
- el.top if hasattr(el, "top") else 0,
756
- el.x0 if hasattr(el, "x0") else 0,
757
- ),
758
- )
759
-
760
- # Extract text from each element
761
- element_texts = []
762
- for el in sorted_elements:
763
- if hasattr(el, "text") and el.text:
764
- element_texts.append(el.text)
765
-
766
- result = separator.join(element_texts)
787
+ result = separator.join(el.extract_text() for el in text_elements)
788
+
789
+ # # Sort chars by document order (page, top, x0)
790
+ # all_char_dicts.sort(
791
+ # key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
792
+ # )
793
+ # # Simple join of character text
794
+ # result = "".join(c.get("text", "") for c in all_char_dicts)
767
795
 
768
796
  # Determine final strip flag – same rule as global helper unless caller overrides
769
797
  strip_text = strip if strip is not None else (not use_layout)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -40,7 +40,7 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
40
40
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
41
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
42
  natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
43
- natural_pdf/elements/element_collection.py,sha256=-piFQGiDPiqmnl-Cpoi3PGPmGe4AYvpl0IqaJGxBsBc,129405
43
+ natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
44
44
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
45
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
46
  natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
@@ -107,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
107
107
  natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
108
108
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
109
109
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
110
- natural_pdf-0.2.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
110
+ natural_pdf-0.2.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
111
111
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
112
112
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
113
113
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -124,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
124
124
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
125
125
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
126
126
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
127
- natural_pdf-0.2.8.dist-info/METADATA,sha256=tuWXV-mY9zU0qsVsXhrrp3aGBfSxlklUxS_Dlllqmp4,6959
128
- natural_pdf-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
- natural_pdf-0.2.8.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
130
- natural_pdf-0.2.8.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
131
- natural_pdf-0.2.8.dist-info/RECORD,,
127
+ natural_pdf-0.2.9.dist-info/METADATA,sha256=Uekld9I1IAGvJnMbiMCyrIB9iKQNcqScD9h6aMYBQAE,6959
128
+ natural_pdf-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
+ natural_pdf-0.2.9.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
130
+ natural_pdf-0.2.9.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
131
+ natural_pdf-0.2.9.dist-info/RECORD,,