natural-pdf 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/elements/element_collection.py +61 -33
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/RECORD +7 -7
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.8.dist-info → natural_pdf-0.2.9.dist-info}/top_level.txt +0 -0
@@ -633,9 +633,7 @@ class ElementCollection(
|
|
633
633
|
pdfplumber's layout engine if layout=True is specified.
|
634
634
|
|
635
635
|
Args:
|
636
|
-
separator: String to
|
637
|
-
using simple joining (layout=False). Default is a single space.
|
638
|
-
Ignored when layout=True as the layout engine handles spacing.
|
636
|
+
separator: String to join text from elements. Default is a single space.
|
639
637
|
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
640
638
|
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
641
639
|
the collection or by filtering the collection itself.
|
@@ -652,15 +650,49 @@ class ElementCollection(
|
|
652
650
|
Returns:
|
653
651
|
Combined text from elements, potentially with layout-based spacing.
|
654
652
|
"""
|
655
|
-
#
|
656
|
-
|
653
|
+
# Check if we have any elements at all
|
654
|
+
if not self._elements:
|
655
|
+
return ""
|
656
|
+
|
657
|
+
# Check if all elements are TextElements with character data
|
658
|
+
text_elements_with_chars = [
|
657
659
|
el
|
658
660
|
for el in self._elements
|
659
|
-
if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
|
661
|
+
if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
|
660
662
|
]
|
661
663
|
|
662
|
-
|
663
|
-
|
664
|
+
# If we have a mixed collection (Regions, TextElements without chars, etc),
|
665
|
+
# use a simpler approach: call extract_text on each element
|
666
|
+
if len(text_elements_with_chars) < len(self._elements):
|
667
|
+
# Mixed collection - extract text from each element
|
668
|
+
element_texts = []
|
669
|
+
|
670
|
+
# Sort elements by position first
|
671
|
+
sorted_elements = sorted(
|
672
|
+
self._elements,
|
673
|
+
key=lambda el: (
|
674
|
+
el.page.index if hasattr(el, "page") else 0,
|
675
|
+
el.top if hasattr(el, "top") else 0,
|
676
|
+
el.x0 if hasattr(el, "x0") else 0,
|
677
|
+
),
|
678
|
+
)
|
679
|
+
|
680
|
+
for el in sorted_elements:
|
681
|
+
if hasattr(el, "extract_text"):
|
682
|
+
# Call extract_text on the element (works for TextElement, Region, etc)
|
683
|
+
text = el.extract_text(**kwargs)
|
684
|
+
if text:
|
685
|
+
element_texts.append(text)
|
686
|
+
elif hasattr(el, "text"):
|
687
|
+
# Fallback to text property if available
|
688
|
+
text = getattr(el, "text", "")
|
689
|
+
if text:
|
690
|
+
element_texts.append(text)
|
691
|
+
|
692
|
+
return separator.join(element_texts)
|
693
|
+
|
694
|
+
# All elements are TextElements with char data - use the original approach
|
695
|
+
text_elements = text_elements_with_chars
|
664
696
|
|
665
697
|
# Collect all character dictionaries
|
666
698
|
all_char_dicts = []
|
@@ -669,11 +701,20 @@ class ElementCollection(
|
|
669
701
|
|
670
702
|
if not all_char_dicts:
|
671
703
|
# Handle case where elements exist but have no char dicts
|
672
|
-
logger.
|
704
|
+
logger.debug(
|
673
705
|
"ElementCollection.extract_text: No character dictionaries found in TextElements."
|
674
706
|
)
|
707
|
+
# Sort elements by position before joining
|
708
|
+
sorted_text_elements = sorted(
|
709
|
+
text_elements,
|
710
|
+
key=lambda el: (
|
711
|
+
el.page.index if hasattr(el, "page") else 0,
|
712
|
+
el.top if hasattr(el, "top") else 0,
|
713
|
+
el.x0 if hasattr(el, "x0") else 0,
|
714
|
+
),
|
715
|
+
)
|
675
716
|
return separator.join(
|
676
|
-
getattr(el, "text", "") for el in
|
717
|
+
getattr(el, "text", "") for el in sorted_text_elements
|
677
718
|
) # Fallback to simple join of word text
|
678
719
|
|
679
720
|
# Apply content filtering if provided
|
@@ -737,33 +778,20 @@ class ElementCollection(
|
|
737
778
|
all_char_dicts.sort(
|
738
779
|
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
739
780
|
)
|
740
|
-
result =
|
781
|
+
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
741
782
|
|
742
783
|
else:
|
784
|
+
print("JOIN WITHOUT LAYOUT")
|
743
785
|
# Default: Simple join without layout
|
744
786
|
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
745
|
-
|
746
|
-
|
747
|
-
#
|
748
|
-
#
|
749
|
-
|
750
|
-
#
|
751
|
-
|
752
|
-
|
753
|
-
key=lambda el: (
|
754
|
-
el.page.index if hasattr(el, "page") else 0,
|
755
|
-
el.top if hasattr(el, "top") else 0,
|
756
|
-
el.x0 if hasattr(el, "x0") else 0,
|
757
|
-
),
|
758
|
-
)
|
759
|
-
|
760
|
-
# Extract text from each element
|
761
|
-
element_texts = []
|
762
|
-
for el in sorted_elements:
|
763
|
-
if hasattr(el, "text") and el.text:
|
764
|
-
element_texts.append(el.text)
|
765
|
-
|
766
|
-
result = separator.join(element_texts)
|
787
|
+
result = separator.join(el.extract_text() for el in text_elements)
|
788
|
+
|
789
|
+
# # Sort chars by document order (page, top, x0)
|
790
|
+
# all_char_dicts.sort(
|
791
|
+
# key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
792
|
+
# )
|
793
|
+
# # Simple join of character text
|
794
|
+
# result = "".join(c.get("text", "") for c in all_char_dicts)
|
767
795
|
|
768
796
|
# Determine final strip flag – same rule as global helper unless caller overrides
|
769
797
|
strip_text = strip if strip is not None else (not use_layout)
|
@@ -40,7 +40,7 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
|
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
42
|
natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
|
43
|
-
natural_pdf/elements/element_collection.py,sha256
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=idM_BUWEfbCJ5Sq0Ae_KfbVHy8TdkNfzs7iWkFe_j2I,130707
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
@@ -107,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
|
|
107
107
|
natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
|
108
108
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
109
109
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
110
|
-
natural_pdf-0.2.
|
110
|
+
natural_pdf-0.2.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
111
111
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
112
112
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
113
113
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -124,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
124
124
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
125
125
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
126
126
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
127
|
-
natural_pdf-0.2.
|
128
|
-
natural_pdf-0.2.
|
129
|
-
natural_pdf-0.2.
|
130
|
-
natural_pdf-0.2.
|
131
|
-
natural_pdf-0.2.
|
127
|
+
natural_pdf-0.2.9.dist-info/METADATA,sha256=Uekld9I1IAGvJnMbiMCyrIB9iKQNcqScD9h6aMYBQAE,6959
|
128
|
+
natural_pdf-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
129
|
+
natural_pdf-0.2.9.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
130
|
+
natural_pdf-0.2.9.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
131
|
+
natural_pdf-0.2.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|