natural-pdf 0.1.37__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/page.py +69 -1
- natural_pdf/core/pdf.py +106 -35
- natural_pdf/elements/collections.py +38 -7
- natural_pdf/elements/region.py +86 -16
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +10 -10
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1655,7 +1655,27 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             table_settings.setdefault("join_x_tolerance", join)
             table_settings.setdefault("join_y_tolerance", join)

-
+            raw_tables = self._page.extract_tables(table_settings)
+
+            # Apply RTL text processing to all extracted tables
+            if raw_tables:
+                processed_tables = []
+                for table in raw_tables:
+                    processed_table = []
+                    for row in table:
+                        processed_row = []
+                        for cell in row:
+                            if cell is not None:
+                                # Apply RTL text processing to each cell
+                                rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                                processed_row.append(rtl_processed_cell)
+                            else:
+                                processed_row.append(cell)
+                        processed_table.append(processed_row)
+                    processed_tables.append(processed_table)
+                return processed_tables
+
+            return raw_tables
         else:
             raise ValueError(
                 f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
@@ -3280,6 +3300,54 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         )
         return self

+    def _apply_rtl_processing_to_text(self, text: str) -> str:
+        """
+        Apply RTL (Right-to-Left) text processing to a string.
+
+        This converts visual order text (as stored in PDFs) to logical order
+        for proper display of Arabic, Hebrew, and other RTL scripts.
+
+        Args:
+            text: Input text string in visual order
+
+        Returns:
+            Text string in logical order
+        """
+        if not text or not text.strip():
+            return text
+
+        # Quick check for RTL characters - if none found, return as-is
+        import unicodedata
+
+        def _contains_rtl(s):
+            return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+
+        if not _contains_rtl(text):
+            return text
+
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+
+            # Apply BiDi algorithm to convert from visual to logical order
+            # Process line by line to handle mixed content properly
+            processed_lines = []
+            for line in text.split("\n"):
+                if line.strip():
+                    # Determine base direction for this line
+                    base_dir = "R" if _contains_rtl(line) else "L"
+                    logical_line = get_display(line, base_dir=base_dir)
+                    # Apply bracket mirroring for correct logical order
+                    processed_lines.append(mirror_brackets(logical_line))
+                else:
+                    processed_lines.append(line)
+
+            return "\n".join(processed_lines)
+
+        except (ImportError, Exception):
+            # If bidi library is not available or fails, return original text
+            return text
+
     @property
     def lines(self) -> List[Any]:
         """Get all line elements on this page."""
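
The `_apply_rtl_processing_to_text` helper added above leans on the optional `python-bidi` package (plus natural-pdf's `bidi_mirror` utility for bracket mirroring, omitted here). A minimal standalone sketch of the same visual-to-logical conversion; the sample cell value is illustrative only:

```python
# Minimal sketch of the visual-to-logical conversion performed above.
# Requires the optional python-bidi package; bracket mirroring via
# natural_pdf.utils.bidi_mirror is left out. The sample text is illustrative.
import unicodedata

from bidi.algorithm import get_display


def contains_rtl(s: str) -> bool:
    # True if any character has a right-to-left bidirectional class
    return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)


def to_logical(text: str) -> str:
    # Reorder each line independently, mirroring the per-line handling above
    out = []
    for line in text.split("\n"):
        if line.strip() and contains_rtl(line):
            out.append(get_display(line, base_dir="R"))
        else:
            out.append(line)
    return "\n".join(out)


print(to_logical("مثال visual-order cell"))
```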
natural_pdf/core/pdf.py
CHANGED
@@ -103,6 +103,7 @@ except ImportError:
 from collections.abc import Sequence


+
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.

@@ -121,6 +122,7 @@ class _LazyPageList(Sequence):
         _font_attrs: Font attributes to use when creating pages.
         _cache: List of cached Page objects (None until accessed).
         _load_text: Whether to load text layer when creating pages.
+        _indices: Optional range of indices this list represents (for slices).

     Example:
         ```python
@@ -130,7 +132,7 @@ class _LazyPageList(Sequence):
         last_page = pdf.pages[-1] # Creates another Page object

         # Slicing works too
-        first_three = pdf.pages[0:3] #
+        first_three = pdf.pages[0:3] # Returns another lazy list

         # Iteration creates all pages
         for page in pdf.pages: # Each page created as needed
@@ -139,30 +141,71 @@ class _LazyPageList(Sequence):
     """

     def __init__(
-        self,
+        self,
+        parent_pdf: "PDF",
+        plumber_pdf: "pdfplumber.PDF",
+        font_attrs=None,
+        load_text=True,
+        indices: Optional[List[int]] = None
     ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
-        # One slot per pdfplumber page – initially all None
-        self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
         self._load_text = load_text
+
+        # If indices is provided, this is a sliced view
+        if indices is not None:
+            self._indices = indices
+            self._cache = [None] * len(indices)
+        else:
+            # Full PDF - one slot per pdfplumber page
+            self._indices = list(range(len(plumber_pdf.pages)))
+            self._cache = [None] * len(plumber_pdf.pages)

     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
+        """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page

-
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+            plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
                 parent=self._parent_pdf,
-                index=
+                index=actual_page_index,
                 font_attrs=self._font_attrs,
                 load_text=self._load_text,
             )
+
+            # Apply any stored exclusions to the newly created page
+            if hasattr(self._parent_pdf, '_exclusions'):
+                for exclusion_data in self._parent_pdf._exclusions:
+                    exclusion_func, label = exclusion_data
+                    try:
+                        cached.add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
+
+            # Apply any stored regions to the newly created page
+            if hasattr(self._parent_pdf, '_regions'):
+                for region_data in self._parent_pdf._regions:
+                    region_func, name = region_data
+                    try:
+                        region_instance = region_func(cached)
+                        if region_instance and hasattr(region_instance, '__class__'):
+                            # Check if it's a Region-like object (avoid importing Region here)
+                            cached.add_region(region_instance, name=name, source="named")
+                        elif region_instance is not None:
+                            logger.warning(
+                                f"Region function did not return a valid Region for page {cached.number}"
+                            )
+                    except Exception as e:
+                        logger.warning(f"Failed to apply region to page {cached.number}: {e}")
+
         self._cache[index] = cached
         return cached

@@ -172,9 +215,18 @@ class _LazyPageList(Sequence):

     def __getitem__(self, key):
         if isinstance(key, slice):
-            #
-
-
+            # Get the slice of our current indices
+            slice_indices = range(*key.indices(len(self)))
+            # Extract the actual page indices for this slice
+            actual_indices = [self._indices[i] for i in slice_indices]
+            # Return a new lazy list for the slice
+            return _LazyPageList(
+                self._parent_pdf,
+                self._plumber_pdf,
+                font_attrs=self._font_attrs,
+                load_text=self._load_text,
+                indices=actual_indices
+            )
         elif isinstance(key, int):
             if key < 0:
                 key += len(self)
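
With the `indices` parameter in place, slicing stays lazy end to end: a slice is just another `_LazyPageList` view over a subset of page indices. A hypothetical usage sketch (the file path is illustrative):

```python
# Hypothetical usage sketch of lazy slicing; the file path is illustrative.
from natural_pdf import PDF

pdf = PDF("report.pdf")

first_three = pdf.pages[0:3]   # another lazy view; no Page objects created yet
last_page = pdf.pages[-1]      # materializes exactly one Page

for page in first_three:       # each Page is built on first access
    print(page.number)
```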
@@ -556,8 +608,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")

         self._exclusions = []
-
-
+
+        # Clear exclusions only from already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only clear from existing pages
+                try:
+                    self._pages._cache[i].clear_exclusions()
+                except Exception as e:
+                    logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
         return self

     def add_exclusion(
@@ -608,25 +666,35 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise AttributeError("PDF pages not yet initialized.")

         # ------------------------------------------------------------------
-        #
-        #
-        # now knows how to interpret these inputs.
+        # Support selector strings and ElementCollection objects directly.
+        # Store exclusion and apply only to already-created pages.
         # ------------------------------------------------------------------
         from natural_pdf.elements.collections import ElementCollection  # local import

         if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
-            # Store for bookkeeping
+            # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-
-
+
+            # Apply only to already-created (cached) pages to avoid forcing page creation
+            for i in range(len(self._pages)):
+                if self._pages._cache[i] is not None:  # Only apply to existing pages
+                    try:
+                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                    except Exception as e:
+                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
             return self

         # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)

-
-
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                try:
+                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
+                except Exception as e:
+                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")

         return self

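
Combined with the replay logic in `_create_page`, exclusions are now recorded once on the PDF, pushed to pages that already exist, and picked up automatically by pages created later. A hedged sketch; the selector string and path are illustrative:

```python
# Hedged sketch: the exclusion is stored immediately but applied to a page
# only when that page exists.  Selector string and path are illustrative.
from natural_pdf import PDF

pdf = PDF("report.pdf")
pdf.add_exclusion('text:contains("CONFIDENTIAL")', label="stamp")  # no pages built yet

page = pdf.pages[0]   # the Page is created here, with the exclusion applied to it
```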
@@ -868,7 +936,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Add a region function to the PDF.

         Args:
-            region_func: A function that takes a Page and returns a Region, or None
             region_func: A function that takes a Page and returns a Region, or None
             name: Optional name for the region

@@ -881,17 +948,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         region_data = (region_func, name)
         self._regions.append(region_data)

-
-
-
-
-
-
-
-
-
-
-
+        # Apply only to already-created (cached) pages to avoid forcing page creation
+        for i in range(len(self._pages)):
+            if self._pages._cache[i] is not None:  # Only apply to existing pages
+                page = self._pages._cache[i]
+                try:
+                    region_instance = region_func(page)
+                    if region_instance and isinstance(region_instance, Region):
+                        page.add_region(region_instance, name=name, source="named")
+                    elif region_instance is not None:
+                        logger.warning(
+                            f"Region function did not return a valid Region for page {page.number}"
+                        )
+                except Exception as e:
+                    logger.error(f"Error adding region for page {page.number}: {e}")

         return self

@@ -1712,10 +1782,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):

         if isinstance(key, slice):
             from natural_pdf.elements.collections import PageCollection
-
-
-
-
+            # Use the lazy page list's slicing which returns another _LazyPageList
+            lazy_slice = self._pages[key]
+            # Wrap in PageCollection for compatibility
+            return PageCollection(lazy_slice)
+        elif isinstance(key, int):
             if 0 <= key < len(self._pages):
                 return self._pages[key]
             else:
natural_pdf/elements/collections.py
CHANGED
@@ -1,6 +1,6 @@
 import hashlib
 import logging
-from collections.abc import MutableSequence
+from collections.abc import MutableSequence, Sequence
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -2051,14 +2051,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
     Provides methods for batch operations on these pages.
     """

-    def __init__(self, pages: List[P]):
+    def __init__(self, pages: Union[List[P], Sequence[P]]):
         """
         Initialize a page collection.

         Args:
-            pages: List of Page objects
+            pages: List or sequence of Page objects (can be lazy)
         """
-
+        # Store the sequence as-is to preserve lazy behavior
+        # Only convert to list if we need list-specific operations
+        if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
+            self.pages = pages
+        else:
+            # Fallback for non-sequence types
+            self.pages = list(pages)

     def __len__(self) -> int:
         """Return the number of pages in the collection."""
@@ -2078,6 +2084,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"

+    def _get_items_for_apply(self) -> Iterator[P]:
+        """
+        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
+
+        Returns an iterator that yields pages on-demand rather than materializing
+        all pages at once, maintaining the lazy loading behavior.
+        """
+        return iter(self.pages)
+
+    def _get_page_indices(self) -> List[int]:
+        """
+        Get page indices without forcing materialization of pages.
+
+        Returns:
+            List of page indices for the pages in this collection.
+        """
+        # Handle different types of page sequences efficiently
+        if hasattr(self.pages, '_indices'):
+            # If it's a _LazyPageList (or slice), get indices directly
+            return list(self.pages._indices)
+        else:
+            # Fallback: if pages are already materialized, get indices normally
+            # This will force materialization but only if pages aren't lazy
+            return [p.index for p in self.pages]
+
     def extract_text(
         self,
         keep_blank_chars: bool = True,
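
Because the collection now keeps whatever sequence it was given (often a `_LazyPageList` slice), delegating to the parent PDF by index no longer forces page creation. A hypothetical sketch (path illustrative):

```python
# Hypothetical sketch: page indices come straight from the lazy slice,
# so no Page objects are built.  The path is illustrative.
from natural_pdf import PDF

pdf = PDF("report.pdf")
subset = pdf[10:20]                   # PageCollection over a lazy slice
print(subset._get_page_indices())     # [10, 11, ..., 19] without materializing pages
```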
@@ -2172,7 +2203,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")

         # Get the 0-based indices of the pages in this collection
-        page_indices =
+        page_indices = self._get_page_indices()

         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

@@ -2374,7 +2405,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
             )

-        page_indices =
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
         )
@@ -2800,7 +2831,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         )

         # Get the 0-based indices of the pages in this collection
-        page_indices =
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
         )
natural_pdf/elements/region.py
CHANGED
@@ -1616,8 +1616,26 @@ class Region(
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)

-        #
-
+        # Apply RTL text processing to all tables
+        if tables:
+            processed_tables = []
+            for table in tables:
+                processed_table = []
+                for row in table:
+                    processed_row = []
+                    for cell in row:
+                        if cell is not None:
+                            # Apply RTL text processing to each cell
+                            rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+                            processed_row.append(rtl_processed_cell)
+                        else:
+                            processed_row.append(cell)
+                    processed_table.append(processed_row)
+                processed_tables.append(processed_table)
+            return processed_tables
+
+        # Return empty list if no tables found
+        return []

     def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
         """
@@ -1662,21 +1680,25 @@ class Region(

         # Return the table or an empty list if none found
         if table:
-            # Apply content filtering if provided
-
-
-
-
-
-
-
-
-
+            # Apply RTL text processing and content filtering if provided
+            processed_table = []
+            for row in table:
+                processed_row = []
+                for cell in row:
+                    if cell is not None:
+                        # Apply RTL text processing first
+                        rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
+
+                        # Then apply content filter if provided
+                        if content_filter is not None:
+                            filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
+                            processed_row.append(filtered_cell)
                         else:
-
-
-
-
+                            processed_row.append(rtl_processed_cell)
+                    else:
+                        processed_row.append(cell)
+                processed_table.append(processed_row)
+            return processed_table
         return []

     def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
@@ -3490,6 +3512,54 @@ class Region(

         return table_grid

+    def _apply_rtl_processing_to_text(self, text: str) -> str:
+        """
+        Apply RTL (Right-to-Left) text processing to a string.
+
+        This converts visual order text (as stored in PDFs) to logical order
+        for proper display of Arabic, Hebrew, and other RTL scripts.
+
+        Args:
+            text: Input text string in visual order
+
+        Returns:
+            Text string in logical order
+        """
+        if not text or not text.strip():
+            return text
+
+        # Quick check for RTL characters - if none found, return as-is
+        import unicodedata
+
+        def _contains_rtl(s):
+            return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+
+        if not _contains_rtl(text):
+            return text
+
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+
+            # Apply BiDi algorithm to convert from visual to logical order
+            # Process line by line to handle mixed content properly
+            processed_lines = []
+            for line in text.split("\n"):
+                if line.strip():
+                    # Determine base direction for this line
+                    base_dir = "R" if _contains_rtl(line) else "L"
+                    logical_line = get_display(line, base_dir=base_dir)
+                    # Apply bracket mirroring for correct logical order
+                    processed_lines.append(mirror_brackets(logical_line))
+                else:
+                    processed_lines.append(line)
+
+            return "\n".join(processed_lines)
+
+        except (ImportError, Exception):
+            # If bidi library is not available or fails, return original text
+            return text
+
     def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
         """
         Apply content filter to a text string.
{natural_pdf-0.1.37.dist-info → natural_pdf-0.1.38.dist-info}/RECORD
CHANGED
@@ -27,8 +27,8 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
 natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
-natural_pdf/core/page.py,sha256=
-natural_pdf/core/pdf.py,sha256=
+natural_pdf/core/page.py,sha256=iWokHLuSrQ71kxB_tTWkCp_O-i72urR4iGFUIzKoH8k,145351
+natural_pdf/core/pdf.py,sha256=5M1gB9psqwJCgE0w7PQ_G1XVa_XCmyNNmluZO7pIyZ4,97112
 natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
 natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
 natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
@@ -36,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
 natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
-natural_pdf/elements/collections.py,sha256=
+natural_pdf/elements/collections.py,sha256=qtHEaLPxZ6i3zPQsbSOw_KMAr9oDMWR1516ilSMSDeY,133189
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
 natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
-natural_pdf/elements/region.py,sha256=
+natural_pdf/elements/region.py,sha256=s3iFTq6QNiEgSAEV9ywt-3oQW5_swTvB6FNMgANpvmA,151055
 natural_pdf/elements/text.py,sha256=giPJQaXuOBCviQ7QKVx_ZMrKFVpgQAsaCS2-kn-8mp0,20530
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -98,7 +98,7 @@ natural_pdf/utils/text_extraction.py,sha256=HYWlYGPfafwzsuMyfL5oQhvcD4NobbvC_aCp
 natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.1.
+natural_pdf-0.1.38.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
 optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
 optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
@@ -115,8 +115,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
+natural_pdf-0.1.38.dist-info/METADATA,sha256=7a2BfP1oBRbUDUm_9t-3jCsw9BGjIiGyoFwGQyDvcVo,6739
+natural_pdf-0.1.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.38.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.1.38.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
+natural_pdf-0.1.38.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes