natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +90 -22
- natural_pdf/core/pdf.py +183 -59
- natural_pdf/elements/collections.py +202 -47
- natural_pdf/elements/region.py +176 -56
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
|
|
76
76
|
from natural_pdf.core.pdf import PDF
|
77
77
|
from natural_pdf.elements.collections import ElementCollection
|
78
78
|
from natural_pdf.elements.region import Region
|
79
|
+
from natural_pdf.flows.flow import Flow
|
80
|
+
from natural_pdf.flows.region import FlowRegion
|
81
|
+
from natural_pdf.analyzers.guides import Guides
|
79
82
|
|
80
83
|
ElementCollection = None
|
81
84
|
|
@@ -116,6 +119,9 @@ __all__ = [
|
|
116
119
|
"Page",
|
117
120
|
"Region",
|
118
121
|
"ElementCollection",
|
122
|
+
"Flow",
|
123
|
+
"FlowRegion",
|
124
|
+
"Guides",
|
119
125
|
"TextSearchOptions",
|
120
126
|
"MultiModalSearchOptions",
|
121
127
|
"BaseSearchOptions",
|
natural_pdf/core/page.py
CHANGED
@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
|
|
64
64
|
from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
|
65
65
|
from natural_pdf.elements.base import Element # Import base element
|
66
66
|
from natural_pdf.elements.text import TextElement
|
67
|
-
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
68
67
|
from natural_pdf.ocr import OCRManager, OCROptions
|
69
68
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
70
69
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
|
|
76
75
|
|
77
76
|
# --- End Classification Imports --- #
|
78
77
|
|
79
|
-
|
80
|
-
|
78
|
+
# --- Text update mixin import --- #
|
79
|
+
from natural_pdf.text_mixin import TextMixin
|
80
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
81
81
|
|
82
82
|
|
83
83
|
try:
|
@@ -92,7 +92,7 @@ except ImportError:
|
|
92
92
|
logger = logging.getLogger(__name__)
|
93
93
|
|
94
94
|
|
95
|
-
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
95
|
+
class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
96
96
|
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
97
97
|
|
98
98
|
This class provides a fluent interface for working with PDF pages,
|
@@ -1655,7 +1655,27 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1655
1655
|
table_settings.setdefault("join_x_tolerance", join)
|
1656
1656
|
table_settings.setdefault("join_y_tolerance", join)
|
1657
1657
|
|
1658
|
-
|
1658
|
+
raw_tables = self._page.extract_tables(table_settings)
|
1659
|
+
|
1660
|
+
# Apply RTL text processing to all extracted tables
|
1661
|
+
if raw_tables:
|
1662
|
+
processed_tables = []
|
1663
|
+
for table in raw_tables:
|
1664
|
+
processed_table = []
|
1665
|
+
for row in table:
|
1666
|
+
processed_row = []
|
1667
|
+
for cell in row:
|
1668
|
+
if cell is not None:
|
1669
|
+
# Apply RTL text processing to each cell
|
1670
|
+
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1671
|
+
processed_row.append(rtl_processed_cell)
|
1672
|
+
else:
|
1673
|
+
processed_row.append(cell)
|
1674
|
+
processed_table.append(processed_row)
|
1675
|
+
processed_tables.append(processed_table)
|
1676
|
+
return processed_tables
|
1677
|
+
|
1678
|
+
return raw_tables
|
1659
1679
|
else:
|
1660
1680
|
raise ValueError(
|
1661
1681
|
f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
|
@@ -2866,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2866
2886
|
logger.info(f"Searchable PDF saved to: {output_path_str}")
|
2867
2887
|
|
2868
2888
|
# --- Added correct_ocr method ---
|
2869
|
-
def
|
2889
|
+
def update_text(
|
2870
2890
|
self,
|
2871
|
-
|
2872
|
-
selector:
|
2891
|
+
transform: Callable[[Any], Optional[str]],
|
2892
|
+
selector: str = "text",
|
2873
2893
|
max_workers: Optional[int] = None,
|
2874
2894
|
progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
|
2875
2895
|
) -> "Page": # Return self for chaining
|
2876
2896
|
"""
|
2877
|
-
Applies corrections to
|
2897
|
+
Applies corrections to text elements on this page
|
2878
2898
|
using a user-provided callback function, potentially in parallel.
|
2879
2899
|
|
2880
|
-
Finds text elements on this page
|
2881
|
-
|
2882
|
-
|
2883
|
-
a new string.
|
2900
|
+
Finds text elements on this page matching the *selector* argument and
|
2901
|
+
calls the ``transform`` for each, passing the element itself.
|
2902
|
+
Updates the element's text if the callback returns a new string.
|
2884
2903
|
|
2885
2904
|
Args:
|
2886
|
-
|
2887
|
-
|
2905
|
+
transform: A function accepting an element and returning
|
2906
|
+
`Optional[str]` (new text or None).
|
2907
|
+
selector: CSS-like selector string to match text elements.
|
2888
2908
|
max_workers: The maximum number of threads to use for parallel execution.
|
2889
2909
|
If None or 0 or 1, runs sequentially.
|
2890
2910
|
progress_callback: Optional callback function to call after processing each element.
|
@@ -2893,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2893
2913
|
Self for method chaining.
|
2894
2914
|
"""
|
2895
2915
|
logger.info(
|
2896
|
-
f"Page {self.number}: Starting
|
2916
|
+
f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
|
2897
2917
|
)
|
2898
2918
|
|
2899
2919
|
target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
|
2900
2920
|
target_elements = target_elements_collection.elements # Get the list
|
2901
2921
|
|
2902
2922
|
if not target_elements:
|
2903
|
-
logger.info(f"Page {self.number}: No
|
2923
|
+
logger.info(f"Page {self.number}: No text elements found to update.")
|
2904
2924
|
return self
|
2905
2925
|
|
2906
2926
|
element_pbar = None
|
2907
2927
|
try:
|
2908
2928
|
element_pbar = tqdm(
|
2909
2929
|
total=len(target_elements),
|
2910
|
-
desc=f"
|
2930
|
+
desc=f"Updating text Page {self.number}",
|
2911
2931
|
unit="element",
|
2912
2932
|
leave=False,
|
2913
2933
|
)
|
@@ -2921,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2921
2941
|
try:
|
2922
2942
|
current_text = getattr(element, "text", None)
|
2923
2943
|
# Call the user-provided callback
|
2924
|
-
corrected_text =
|
2944
|
+
corrected_text = transform(element)
|
2925
2945
|
|
2926
2946
|
# Validate result type
|
2927
2947
|
if corrected_text is not None and not isinstance(corrected_text, str):
|
@@ -2956,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2956
2976
|
if max_workers is not None and max_workers > 1:
|
2957
2977
|
# --- Parallel execution --- #
|
2958
2978
|
logger.info(
|
2959
|
-
f"Page {self.number}: Running
|
2979
|
+
f"Page {self.number}: Running text update in parallel with {max_workers} workers."
|
2960
2980
|
)
|
2961
2981
|
futures = []
|
2962
2982
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
@@ -2992,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2992
3012
|
|
2993
3013
|
else:
|
2994
3014
|
# --- Sequential execution --- #
|
2995
|
-
logger.info(f"Page {self.number}: Running
|
3015
|
+
logger.info(f"Page {self.number}: Running text update sequentially.")
|
2996
3016
|
for element in target_elements:
|
2997
3017
|
# Call the task function directly (it handles progress_callback)
|
2998
3018
|
processed_count += 1
|
@@ -3007,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
3007
3027
|
updated_count += 1
|
3008
3028
|
|
3009
3029
|
logger.info(
|
3010
|
-
f"Page {self.number}:
|
3030
|
+
f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
|
3011
3031
|
)
|
3012
3032
|
|
3013
3033
|
return self # Return self for chaining
|
@@ -3280,6 +3300,54 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
3280
3300
|
)
|
3281
3301
|
return self
|
3282
3302
|
|
3303
|
+
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3304
|
+
"""
|
3305
|
+
Apply RTL (Right-to-Left) text processing to a string.
|
3306
|
+
|
3307
|
+
This converts visual order text (as stored in PDFs) to logical order
|
3308
|
+
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3309
|
+
|
3310
|
+
Args:
|
3311
|
+
text: Input text string in visual order
|
3312
|
+
|
3313
|
+
Returns:
|
3314
|
+
Text string in logical order
|
3315
|
+
"""
|
3316
|
+
if not text or not text.strip():
|
3317
|
+
return text
|
3318
|
+
|
3319
|
+
# Quick check for RTL characters - if none found, return as-is
|
3320
|
+
import unicodedata
|
3321
|
+
|
3322
|
+
def _contains_rtl(s):
|
3323
|
+
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3324
|
+
|
3325
|
+
if not _contains_rtl(text):
|
3326
|
+
return text
|
3327
|
+
|
3328
|
+
try:
|
3329
|
+
from bidi.algorithm import get_display # type: ignore
|
3330
|
+
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3331
|
+
|
3332
|
+
# Apply BiDi algorithm to convert from visual to logical order
|
3333
|
+
# Process line by line to handle mixed content properly
|
3334
|
+
processed_lines = []
|
3335
|
+
for line in text.split("\n"):
|
3336
|
+
if line.strip():
|
3337
|
+
# Determine base direction for this line
|
3338
|
+
base_dir = "R" if _contains_rtl(line) else "L"
|
3339
|
+
logical_line = get_display(line, base_dir=base_dir)
|
3340
|
+
# Apply bracket mirroring for correct logical order
|
3341
|
+
processed_lines.append(mirror_brackets(logical_line))
|
3342
|
+
else:
|
3343
|
+
processed_lines.append(line)
|
3344
|
+
|
3345
|
+
return "\n".join(processed_lines)
|
3346
|
+
|
3347
|
+
except (ImportError, Exception):
|
3348
|
+
# If bidi library is not available or fails, return original text
|
3349
|
+
return text
|
3350
|
+
|
3283
3351
|
@property
|
3284
3352
|
def lines(self) -> List[Any]:
|
3285
3353
|
"""Get all line elements on this page."""
|
natural_pdf/core/pdf.py
CHANGED
@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
|
|
39
39
|
from natural_pdf.ocr import OCRManager, OCROptions
|
40
40
|
from natural_pdf.selectors.parser import parse_selector
|
41
41
|
from natural_pdf.utils.locks import pdf_render_lock
|
42
|
+
from natural_pdf.text_mixin import TextMixin
|
43
|
+
|
44
|
+
if TYPE_CHECKING:
|
45
|
+
from natural_pdf.elements.collections import ElementCollection
|
42
46
|
|
43
47
|
try:
|
44
48
|
from typing import Any as TypingAny
|
@@ -103,6 +107,7 @@ except ImportError:
|
|
103
107
|
from collections.abc import Sequence
|
104
108
|
|
105
109
|
|
110
|
+
|
106
111
|
class _LazyPageList(Sequence):
|
107
112
|
"""A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
|
108
113
|
|
@@ -121,6 +126,7 @@ class _LazyPageList(Sequence):
|
|
121
126
|
_font_attrs: Font attributes to use when creating pages.
|
122
127
|
_cache: List of cached Page objects (None until accessed).
|
123
128
|
_load_text: Whether to load text layer when creating pages.
|
129
|
+
_indices: Optional range of indices this list represents (for slices).
|
124
130
|
|
125
131
|
Example:
|
126
132
|
```python
|
@@ -130,7 +136,7 @@ class _LazyPageList(Sequence):
|
|
130
136
|
last_page = pdf.pages[-1] # Creates another Page object
|
131
137
|
|
132
138
|
# Slicing works too
|
133
|
-
first_three = pdf.pages[0:3] #
|
139
|
+
first_three = pdf.pages[0:3] # Returns another lazy list
|
134
140
|
|
135
141
|
# Iteration creates all pages
|
136
142
|
for page in pdf.pages: # Each page created as needed
|
@@ -139,30 +145,71 @@ class _LazyPageList(Sequence):
|
|
139
145
|
"""
|
140
146
|
|
141
147
|
def __init__(
|
142
|
-
self,
|
148
|
+
self,
|
149
|
+
parent_pdf: "PDF",
|
150
|
+
plumber_pdf: "pdfplumber.PDF",
|
151
|
+
font_attrs=None,
|
152
|
+
load_text=True,
|
153
|
+
indices: Optional[List[int]] = None
|
143
154
|
):
|
144
155
|
self._parent_pdf = parent_pdf
|
145
156
|
self._plumber_pdf = plumber_pdf
|
146
157
|
self._font_attrs = font_attrs
|
147
|
-
# One slot per pdfplumber page – initially all None
|
148
|
-
self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
|
149
158
|
self._load_text = load_text
|
159
|
+
|
160
|
+
# If indices is provided, this is a sliced view
|
161
|
+
if indices is not None:
|
162
|
+
self._indices = indices
|
163
|
+
self._cache = [None] * len(indices)
|
164
|
+
else:
|
165
|
+
# Full PDF - one slot per pdfplumber page
|
166
|
+
self._indices = list(range(len(plumber_pdf.pages)))
|
167
|
+
self._cache = [None] * len(plumber_pdf.pages)
|
150
168
|
|
151
169
|
# Internal helper -----------------------------------------------------
|
152
170
|
def _create_page(self, index: int) -> "Page":
|
171
|
+
"""Create and cache a page at the given index within this list."""
|
153
172
|
cached = self._cache[index]
|
154
173
|
if cached is None:
|
155
174
|
# Import here to avoid circular import problems
|
156
175
|
from natural_pdf.core.page import Page
|
157
176
|
|
158
|
-
|
177
|
+
# Get the actual page index in the full PDF
|
178
|
+
actual_page_index = self._indices[index]
|
179
|
+
plumber_page = self._plumber_pdf.pages[actual_page_index]
|
159
180
|
cached = Page(
|
160
181
|
plumber_page,
|
161
182
|
parent=self._parent_pdf,
|
162
|
-
index=
|
183
|
+
index=actual_page_index,
|
163
184
|
font_attrs=self._font_attrs,
|
164
185
|
load_text=self._load_text,
|
165
186
|
)
|
187
|
+
|
188
|
+
# Apply any stored exclusions to the newly created page
|
189
|
+
if hasattr(self._parent_pdf, '_exclusions'):
|
190
|
+
for exclusion_data in self._parent_pdf._exclusions:
|
191
|
+
exclusion_func, label = exclusion_data
|
192
|
+
try:
|
193
|
+
cached.add_exclusion(exclusion_func, label=label)
|
194
|
+
except Exception as e:
|
195
|
+
logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
|
196
|
+
|
197
|
+
# Apply any stored regions to the newly created page
|
198
|
+
if hasattr(self._parent_pdf, '_regions'):
|
199
|
+
for region_data in self._parent_pdf._regions:
|
200
|
+
region_func, name = region_data
|
201
|
+
try:
|
202
|
+
region_instance = region_func(cached)
|
203
|
+
if region_instance and hasattr(region_instance, '__class__'):
|
204
|
+
# Check if it's a Region-like object (avoid importing Region here)
|
205
|
+
cached.add_region(region_instance, name=name, source="named")
|
206
|
+
elif region_instance is not None:
|
207
|
+
logger.warning(
|
208
|
+
f"Region function did not return a valid Region for page {cached.number}"
|
209
|
+
)
|
210
|
+
except Exception as e:
|
211
|
+
logger.warning(f"Failed to apply region to page {cached.number}: {e}")
|
212
|
+
|
166
213
|
self._cache[index] = cached
|
167
214
|
return cached
|
168
215
|
|
@@ -172,9 +219,18 @@ class _LazyPageList(Sequence):
|
|
172
219
|
|
173
220
|
def __getitem__(self, key):
|
174
221
|
if isinstance(key, slice):
|
175
|
-
#
|
176
|
-
|
177
|
-
|
222
|
+
# Get the slice of our current indices
|
223
|
+
slice_indices = range(*key.indices(len(self)))
|
224
|
+
# Extract the actual page indices for this slice
|
225
|
+
actual_indices = [self._indices[i] for i in slice_indices]
|
226
|
+
# Return a new lazy list for the slice
|
227
|
+
return _LazyPageList(
|
228
|
+
self._parent_pdf,
|
229
|
+
self._plumber_pdf,
|
230
|
+
font_attrs=self._font_attrs,
|
231
|
+
load_text=self._load_text,
|
232
|
+
indices=actual_indices
|
233
|
+
)
|
178
234
|
elif isinstance(key, int):
|
179
235
|
if key < 0:
|
180
236
|
key += len(self)
|
@@ -195,7 +251,7 @@ class _LazyPageList(Sequence):
|
|
195
251
|
# --- End Lazy Page List Helper --- #
|
196
252
|
|
197
253
|
|
198
|
-
class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
254
|
+
class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
|
199
255
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
200
256
|
|
201
257
|
This class provides a fluent interface for working with PDF documents,
|
@@ -556,8 +612,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
556
612
|
raise AttributeError("PDF pages not yet initialized.")
|
557
613
|
|
558
614
|
self._exclusions = []
|
559
|
-
|
560
|
-
|
615
|
+
|
616
|
+
# Clear exclusions only from already-created (cached) pages to avoid forcing page creation
|
617
|
+
for i in range(len(self._pages)):
|
618
|
+
if self._pages._cache[i] is not None: # Only clear from existing pages
|
619
|
+
try:
|
620
|
+
self._pages._cache[i].clear_exclusions()
|
621
|
+
except Exception as e:
|
622
|
+
logger.warning(f"Failed to clear exclusions from existing page {i}: {e}")
|
561
623
|
return self
|
562
624
|
|
563
625
|
def add_exclusion(
|
@@ -608,25 +670,35 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
608
670
|
raise AttributeError("PDF pages not yet initialized.")
|
609
671
|
|
610
672
|
# ------------------------------------------------------------------
|
611
|
-
#
|
612
|
-
#
|
613
|
-
# now knows how to interpret these inputs.
|
673
|
+
# Support selector strings and ElementCollection objects directly.
|
674
|
+
# Store exclusion and apply only to already-created pages.
|
614
675
|
# ------------------------------------------------------------------
|
615
676
|
from natural_pdf.elements.collections import ElementCollection # local import
|
616
677
|
|
617
678
|
if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
|
618
|
-
# Store for bookkeeping
|
679
|
+
# Store for bookkeeping and lazy application
|
619
680
|
self._exclusions.append((exclusion_func, label))
|
620
|
-
|
621
|
-
|
681
|
+
|
682
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
683
|
+
for i in range(len(self._pages)):
|
684
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
685
|
+
try:
|
686
|
+
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
687
|
+
except Exception as e:
|
688
|
+
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
622
689
|
return self
|
623
690
|
|
624
691
|
# Fallback to original callable / Region behaviour ------------------
|
625
692
|
exclusion_data = (exclusion_func, label)
|
626
693
|
self._exclusions.append(exclusion_data)
|
627
694
|
|
628
|
-
|
629
|
-
|
695
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
696
|
+
for i in range(len(self._pages)):
|
697
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
698
|
+
try:
|
699
|
+
self._pages._cache[i].add_exclusion(exclusion_func, label=label)
|
700
|
+
except Exception as e:
|
701
|
+
logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
|
630
702
|
|
631
703
|
return self
|
632
704
|
|
@@ -868,7 +940,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
868
940
|
Add a region function to the PDF.
|
869
941
|
|
870
942
|
Args:
|
871
|
-
region_func: A function that takes a Page and returns a Region, or None
|
872
943
|
region_func: A function that takes a Page and returns a Region, or None
|
873
944
|
name: Optional name for the region
|
874
945
|
|
@@ -881,17 +952,20 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
881
952
|
region_data = (region_func, name)
|
882
953
|
self._regions.append(region_data)
|
883
954
|
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
955
|
+
# Apply only to already-created (cached) pages to avoid forcing page creation
|
956
|
+
for i in range(len(self._pages)):
|
957
|
+
if self._pages._cache[i] is not None: # Only apply to existing pages
|
958
|
+
page = self._pages._cache[i]
|
959
|
+
try:
|
960
|
+
region_instance = region_func(page)
|
961
|
+
if region_instance and isinstance(region_instance, Region):
|
962
|
+
page.add_region(region_instance, name=name, source="named")
|
963
|
+
elif region_instance is not None:
|
964
|
+
logger.warning(
|
965
|
+
f"Region function did not return a valid Region for page {page.number}"
|
966
|
+
)
|
967
|
+
except Exception as e:
|
968
|
+
logger.error(f"Error adding region for page {page.number}: {e}")
|
895
969
|
|
896
970
|
return self
|
897
971
|
|
@@ -1159,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1159
1233
|
|
1160
1234
|
return all_tables
|
1161
1235
|
|
1236
|
+
def get_sections(
|
1237
|
+
self,
|
1238
|
+
start_elements=None,
|
1239
|
+
end_elements=None,
|
1240
|
+
new_section_on_page_break=False,
|
1241
|
+
boundary_inclusion="both",
|
1242
|
+
) -> "ElementCollection":
|
1243
|
+
"""
|
1244
|
+
Extract sections from the entire PDF based on start/end elements.
|
1245
|
+
|
1246
|
+
This method delegates to the PageCollection.get_sections() method,
|
1247
|
+
providing a convenient way to extract document sections across all pages.
|
1248
|
+
|
1249
|
+
Args:
|
1250
|
+
start_elements: Elements or selector string that mark the start of sections (optional)
|
1251
|
+
end_elements: Elements or selector string that mark the end of sections (optional)
|
1252
|
+
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
1253
|
+
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
1254
|
+
|
1255
|
+
Returns:
|
1256
|
+
ElementCollection of Region objects representing the extracted sections
|
1257
|
+
|
1258
|
+
Example:
|
1259
|
+
Extract sections between headers:
|
1260
|
+
```python
|
1261
|
+
pdf = npdf.PDF("document.pdf")
|
1262
|
+
|
1263
|
+
# Get sections between headers
|
1264
|
+
sections = pdf.get_sections(
|
1265
|
+
start_elements='text[size>14]:bold',
|
1266
|
+
end_elements='text[size>14]:bold'
|
1267
|
+
)
|
1268
|
+
|
1269
|
+
# Get sections that break at page boundaries
|
1270
|
+
sections = pdf.get_sections(
|
1271
|
+
start_elements='text:contains("Chapter")',
|
1272
|
+
new_section_on_page_break=True
|
1273
|
+
)
|
1274
|
+
```
|
1275
|
+
|
1276
|
+
Note:
|
1277
|
+
You can provide only start_elements, only end_elements, or both.
|
1278
|
+
- With only start_elements: sections go from each start to the next start (or end of document)
|
1279
|
+
- With only end_elements: sections go from beginning of document to each end
|
1280
|
+
- With both: sections go from each start to the corresponding end
|
1281
|
+
"""
|
1282
|
+
if not hasattr(self, "_pages"):
|
1283
|
+
raise AttributeError("PDF pages not yet initialized.")
|
1284
|
+
|
1285
|
+
return self.pages.get_sections(
|
1286
|
+
start_elements=start_elements,
|
1287
|
+
end_elements=end_elements,
|
1288
|
+
new_section_on_page_break=new_section_on_page_break,
|
1289
|
+
boundary_inclusion=boundary_inclusion,
|
1290
|
+
)
|
1291
|
+
|
1162
1292
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
1163
1293
|
"""
|
1164
1294
|
DEPRECATED: Use save_pdf(..., ocr=True) instead.
|
@@ -1633,32 +1763,28 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1633
1763
|
logger.error(f"Failed to export correction task: {e}")
|
1634
1764
|
raise
|
1635
1765
|
|
1636
|
-
def
|
1766
|
+
def update_text(
|
1637
1767
|
self,
|
1638
|
-
|
1768
|
+
transform: Callable[[Any], Optional[str]],
|
1639
1769
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1770
|
+
selector: str = "text",
|
1640
1771
|
max_workers: Optional[int] = None,
|
1641
1772
|
progress_callback: Optional[Callable[[], None]] = None,
|
1642
1773
|
) -> "PDF":
|
1643
1774
|
"""
|
1644
|
-
Applies corrections to
|
1645
|
-
Applies corrections to OCR text elements using a callback function.
|
1775
|
+
Applies corrections to text elements using a callback function.
|
1646
1776
|
|
1647
1777
|
Args:
|
1648
|
-
correction_callback: Function that takes an element and returns corrected text or None
|
1649
1778
|
correction_callback: Function that takes an element and returns corrected text or None
|
1650
1779
|
pages: Optional page indices/slice to limit the scope of correction
|
1651
|
-
|
1652
|
-
progress_callback: Optional callback function for progress updates
|
1780
|
+
selector: Selector to apply corrections to (default: "text")
|
1653
1781
|
max_workers: Maximum number of threads to use for parallel execution
|
1654
1782
|
progress_callback: Optional callback function for progress updates
|
1655
1783
|
|
1656
1784
|
Returns:
|
1657
1785
|
Self for method chaining
|
1658
|
-
Self for method chaining
|
1659
1786
|
"""
|
1660
1787
|
target_page_indices = []
|
1661
|
-
target_page_indices = []
|
1662
1788
|
if pages is None:
|
1663
1789
|
target_page_indices = list(range(len(self._pages)))
|
1664
1790
|
elif isinstance(pages, slice):
|
@@ -1671,32 +1797,29 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1671
1797
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
1672
1798
|
except (IndexError, TypeError, ValueError) as e:
|
1673
1799
|
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1674
|
-
raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
|
1675
1800
|
else:
|
1676
1801
|
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1677
|
-
raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
|
1678
1802
|
|
1679
1803
|
if not target_page_indices:
|
1680
|
-
logger.warning("No pages selected for
|
1804
|
+
logger.warning("No pages selected for text update.")
|
1681
1805
|
return self
|
1682
1806
|
|
1683
|
-
logger.info(f"Starting
|
1684
|
-
logger.info(f"Starting OCR correction for pages: {target_page_indices}")
|
1807
|
+
logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
|
1685
1808
|
|
1686
1809
|
for page_idx in target_page_indices:
|
1687
1810
|
page = self._pages[page_idx]
|
1688
1811
|
try:
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
1812
|
+
page.update_text(
|
1813
|
+
transform=transform,
|
1814
|
+
selector=selector,
|
1815
|
+
max_workers=max_workers,
|
1816
|
+
progress_callback=progress_callback,
|
1817
|
+
)
|
1694
1818
|
except Exception as e:
|
1695
|
-
logger.error(f"Error during
|
1696
|
-
logger.error(f"Error during
|
1819
|
+
logger.error(f"Error during text update on page {page_idx}: {e}")
|
1820
|
+
logger.error(f"Error during text update on page {page_idx}: {e}")
|
1697
1821
|
|
1698
|
-
logger.info("
|
1699
|
-
logger.info("OCR correction process finished.")
|
1822
|
+
logger.info("Text update process finished.")
|
1700
1823
|
return self
|
1701
1824
|
|
1702
1825
|
def __len__(self) -> int:
|
@@ -1712,10 +1835,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1712
1835
|
|
1713
1836
|
if isinstance(key, slice):
|
1714
1837
|
from natural_pdf.elements.collections import PageCollection
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1718
|
-
|
1838
|
+
# Use the lazy page list's slicing which returns another _LazyPageList
|
1839
|
+
lazy_slice = self._pages[key]
|
1840
|
+
# Wrap in PageCollection for compatibility
|
1841
|
+
return PageCollection(lazy_slice)
|
1842
|
+
elif isinstance(key, int):
|
1719
1843
|
if 0 <= key < len(self._pages):
|
1720
1844
|
return self._pages[key]
|
1721
1845
|
else:
|