natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -11,17 +11,22 @@ from typing import (
|
|
11
11
|
Tuple,
|
12
12
|
TypeVar,
|
13
13
|
Union,
|
14
|
+
Iterable,
|
14
15
|
)
|
15
16
|
|
16
17
|
from pdfplumber.utils.geometry import objects_to_bbox
|
18
|
+
from tqdm.auto import tqdm
|
17
19
|
|
18
20
|
# New Imports
|
19
21
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
20
22
|
|
21
|
-
from natural_pdf.elements.text import TextElement
|
23
|
+
from natural_pdf.elements.text import TextElement
|
22
24
|
from natural_pdf.ocr import OCROptions
|
23
25
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
-
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
26
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
27
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
28
|
+
from natural_pdf.classification.manager import ClassificationManager
|
29
|
+
from natural_pdf.collections.mixins import ApplyMixin
|
25
30
|
|
26
31
|
logger = logging.getLogger(__name__)
|
27
32
|
|
@@ -33,7 +38,7 @@ T = TypeVar("T")
|
|
33
38
|
P = TypeVar("P", bound="Page")
|
34
39
|
|
35
40
|
|
36
|
-
class ElementCollection(Generic[T]):
|
41
|
+
class ElementCollection(Generic[T], ApplyMixin):
|
37
42
|
"""
|
38
43
|
Collection of PDF elements with batch operations.
|
39
44
|
"""
|
@@ -83,12 +88,55 @@ class ElementCollection(Generic[T]):
|
|
83
88
|
"""Get the last element in the collection."""
|
84
89
|
return self._elements[-1] if self._elements else None
|
85
90
|
|
91
|
+
def _are_on_multiple_pages(self) -> bool:
|
92
|
+
"""
|
93
|
+
Check if elements in this collection span multiple pages.
|
94
|
+
|
95
|
+
Returns:
|
96
|
+
True if elements are on different pages, False otherwise
|
97
|
+
"""
|
98
|
+
if not self._elements:
|
99
|
+
return False
|
100
|
+
|
101
|
+
# Get the page index of the first element
|
102
|
+
if not hasattr(self._elements[0], "page"):
|
103
|
+
return False
|
104
|
+
|
105
|
+
first_page_idx = self._elements[0].page.index
|
106
|
+
|
107
|
+
# Check if any element is on a different page
|
108
|
+
return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
|
109
|
+
|
110
|
+
def _are_on_multiple_pdfs(self) -> bool:
|
111
|
+
"""
|
112
|
+
Check if elements in this collection span multiple PDFs.
|
113
|
+
|
114
|
+
Returns:
|
115
|
+
True if elements are from different PDFs, False otherwise
|
116
|
+
"""
|
117
|
+
if not self._elements:
|
118
|
+
return False
|
119
|
+
|
120
|
+
# Get the PDF of the first element
|
121
|
+
if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
|
122
|
+
return False
|
123
|
+
|
124
|
+
first_pdf = self._elements[0].page.pdf
|
125
|
+
|
126
|
+
# Check if any element is from a different PDF
|
127
|
+
return any(
|
128
|
+
hasattr(e, "page") and
|
129
|
+
hasattr(e.page, "pdf") and
|
130
|
+
e.page.pdf is not first_pdf
|
131
|
+
for e in self._elements
|
132
|
+
)
|
133
|
+
|
86
134
|
def highest(self) -> Optional["Element"]:
|
87
135
|
"""
|
88
136
|
Get element with the smallest top y-coordinate (highest on page).
|
89
137
|
|
90
138
|
Raises:
|
91
|
-
ValueError: If elements are on multiple pages
|
139
|
+
ValueError: If elements are on multiple pages or multiple PDFs
|
92
140
|
|
93
141
|
Returns:
|
94
142
|
Element with smallest top value or None if empty
|
@@ -96,7 +144,9 @@ class ElementCollection(Generic[T]):
|
|
96
144
|
if not self._elements:
|
97
145
|
return None
|
98
146
|
|
99
|
-
# Check if elements are on multiple pages
|
147
|
+
# Check if elements are on multiple pages or PDFs
|
148
|
+
if self._are_on_multiple_pdfs():
|
149
|
+
raise ValueError("Cannot determine highest element across multiple PDFs")
|
100
150
|
if self._are_on_multiple_pages():
|
101
151
|
raise ValueError("Cannot determine highest element across multiple pages")
|
102
152
|
|
@@ -107,7 +157,7 @@ class ElementCollection(Generic[T]):
|
|
107
157
|
Get element with the largest bottom y-coordinate (lowest on page).
|
108
158
|
|
109
159
|
Raises:
|
110
|
-
ValueError: If elements are on multiple pages
|
160
|
+
ValueError: If elements are on multiple pages or multiple PDFs
|
111
161
|
|
112
162
|
Returns:
|
113
163
|
Element with largest bottom value or None if empty
|
@@ -115,7 +165,9 @@ class ElementCollection(Generic[T]):
|
|
115
165
|
if not self._elements:
|
116
166
|
return None
|
117
167
|
|
118
|
-
# Check if elements are on multiple pages
|
168
|
+
# Check if elements are on multiple pages or PDFs
|
169
|
+
if self._are_on_multiple_pdfs():
|
170
|
+
raise ValueError("Cannot determine lowest element across multiple PDFs")
|
119
171
|
if self._are_on_multiple_pages():
|
120
172
|
raise ValueError("Cannot determine lowest element across multiple pages")
|
121
173
|
|
@@ -126,7 +178,7 @@ class ElementCollection(Generic[T]):
|
|
126
178
|
Get element with the smallest x0 coordinate (leftmost on page).
|
127
179
|
|
128
180
|
Raises:
|
129
|
-
ValueError: If elements are on multiple pages
|
181
|
+
ValueError: If elements are on multiple pages or multiple PDFs
|
130
182
|
|
131
183
|
Returns:
|
132
184
|
Element with smallest x0 value or None if empty
|
@@ -134,7 +186,9 @@ class ElementCollection(Generic[T]):
|
|
134
186
|
if not self._elements:
|
135
187
|
return None
|
136
188
|
|
137
|
-
# Check if elements are on multiple pages
|
189
|
+
# Check if elements are on multiple pages or PDFs
|
190
|
+
if self._are_on_multiple_pdfs():
|
191
|
+
raise ValueError("Cannot determine leftmost element across multiple PDFs")
|
138
192
|
if self._are_on_multiple_pages():
|
139
193
|
raise ValueError("Cannot determine leftmost element across multiple pages")
|
140
194
|
|
@@ -145,7 +199,7 @@ class ElementCollection(Generic[T]):
|
|
145
199
|
Get element with the largest x1 coordinate (rightmost on page).
|
146
200
|
|
147
201
|
Raises:
|
148
|
-
ValueError: If elements are on multiple pages
|
202
|
+
ValueError: If elements are on multiple pages or multiple PDFs
|
149
203
|
|
150
204
|
Returns:
|
151
205
|
Element with largest x1 value or None if empty
|
@@ -153,31 +207,14 @@ class ElementCollection(Generic[T]):
|
|
153
207
|
if not self._elements:
|
154
208
|
return None
|
155
209
|
|
156
|
-
# Check if elements are on multiple pages
|
210
|
+
# Check if elements are on multiple pages or PDFs
|
211
|
+
if self._are_on_multiple_pdfs():
|
212
|
+
raise ValueError("Cannot determine rightmost element across multiple PDFs")
|
157
213
|
if self._are_on_multiple_pages():
|
158
214
|
raise ValueError("Cannot determine rightmost element across multiple pages")
|
159
215
|
|
160
216
|
return max(self._elements, key=lambda e: e.x1)
|
161
217
|
|
162
|
-
def _are_on_multiple_pages(self) -> bool:
|
163
|
-
"""
|
164
|
-
Check if elements in this collection span multiple pages.
|
165
|
-
|
166
|
-
Returns:
|
167
|
-
True if elements are on different pages, False otherwise
|
168
|
-
"""
|
169
|
-
if not self._elements:
|
170
|
-
return False
|
171
|
-
|
172
|
-
# Get the page index of the first element
|
173
|
-
if not hasattr(self._elements[0], "page"):
|
174
|
-
return False
|
175
|
-
|
176
|
-
first_page_idx = self._elements[0].page.index
|
177
|
-
|
178
|
-
# Check if any element is on a different page
|
179
|
-
return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
|
180
|
-
|
181
218
|
def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
|
182
219
|
"""
|
183
220
|
Remove elements that are within any of the specified regions.
|
@@ -359,6 +396,9 @@ class ElementCollection(Generic[T]):
|
|
359
396
|
|
360
397
|
Uses grouping logic based on parameters (defaulting to grouping by type).
|
361
398
|
|
399
|
+
Note: Elements must be from the same PDF for this operation to work properly,
|
400
|
+
as each PDF has its own highlighting service.
|
401
|
+
|
362
402
|
Args:
|
363
403
|
label: Optional explicit label for the entire collection. If provided,
|
364
404
|
all elements are highlighted as a single group with this label,
|
@@ -389,8 +429,12 @@ class ElementCollection(Generic[T]):
|
|
389
429
|
AttributeError: If 'group_by' is provided but the attribute doesn't exist
|
390
430
|
on some elements.
|
391
431
|
ValueError: If 'label_format' is provided but contains invalid keys for
|
392
|
-
element attributes.
|
432
|
+
element attributes, or if elements span multiple PDFs.
|
393
433
|
"""
|
434
|
+
# Check if elements span multiple PDFs
|
435
|
+
if self._are_on_multiple_pdfs():
|
436
|
+
raise ValueError("highlight() does not support elements from multiple PDFs")
|
437
|
+
|
394
438
|
# 1. Prepare the highlight data based on parameters
|
395
439
|
highlight_data_list = self._prepare_highlight_data(
|
396
440
|
distinct=distinct,
|
@@ -761,7 +805,8 @@ class ElementCollection(Generic[T]):
|
|
761
805
|
Generates a temporary preview image highlighting elements in this collection
|
762
806
|
on their page, ignoring any persistent highlights.
|
763
807
|
|
764
|
-
Currently only supports collections where all elements are on the same page
|
808
|
+
Currently only supports collections where all elements are on the same page
|
809
|
+
of the same PDF.
|
765
810
|
|
766
811
|
Allows grouping and coloring elements based on attributes, similar to the
|
767
812
|
persistent `highlight()` method, but only for this temporary view.
|
@@ -780,14 +825,20 @@ class ElementCollection(Generic[T]):
|
|
780
825
|
|
781
826
|
Returns:
|
782
827
|
PIL Image object of the temporary preview, or None if rendering fails or
|
783
|
-
elements span multiple pages.
|
828
|
+
elements span multiple pages/PDFs.
|
784
829
|
|
785
830
|
Raises:
|
786
|
-
ValueError: If the collection is empty or elements are on different pages.
|
831
|
+
ValueError: If the collection is empty or elements are on different pages/PDFs.
|
787
832
|
"""
|
788
833
|
if not self._elements:
|
789
834
|
raise ValueError("Cannot show an empty collection.")
|
790
835
|
|
836
|
+
# Check if elements are on multiple PDFs
|
837
|
+
if self._are_on_multiple_pdfs():
|
838
|
+
raise ValueError(
|
839
|
+
"show() currently only supports collections where all elements are from the same PDF."
|
840
|
+
)
|
841
|
+
|
791
842
|
# Check if elements are on multiple pages
|
792
843
|
if self._are_on_multiple_pages():
|
793
844
|
raise ValueError(
|
@@ -1122,10 +1173,12 @@ class ElementCollection(Generic[T]):
|
|
1122
1173
|
def correct_ocr(
|
1123
1174
|
self,
|
1124
1175
|
correction_callback: Callable[[Any], Optional[str]],
|
1176
|
+
max_workers: Optional[int] = None,
|
1125
1177
|
) -> "ElementCollection":
|
1126
1178
|
"""
|
1127
1179
|
Applies corrections to OCR-generated text elements within this collection
|
1128
|
-
using a user-provided callback function
|
1180
|
+
using a user-provided callback function, executed
|
1181
|
+
in parallel if `max_workers` is specified.
|
1129
1182
|
|
1130
1183
|
Iterates through elements currently in the collection. If an element's
|
1131
1184
|
'source' attribute starts with 'ocr', it calls the `correction_callback`
|
@@ -1143,6 +1196,8 @@ class ElementCollection(Generic[T]):
|
|
1143
1196
|
Args:
|
1144
1197
|
correction_callback: A function accepting an element and returning
|
1145
1198
|
`Optional[str]` (new text or None).
|
1199
|
+
max_workers: The maximum number of worker threads to use for parallel
|
1200
|
+
correction on each page. If None, defaults are used.
|
1146
1201
|
|
1147
1202
|
Returns:
|
1148
1203
|
Self for method chaining.
|
@@ -1152,11 +1207,169 @@ class ElementCollection(Generic[T]):
|
|
1152
1207
|
elements=self._elements,
|
1153
1208
|
correction_callback=correction_callback,
|
1154
1209
|
caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
|
1210
|
+
max_workers=max_workers,
|
1155
1211
|
)
|
1156
1212
|
return self # Return self for chaining
|
1157
1213
|
|
1214
|
+
def remove(self) -> int:
|
1215
|
+
"""
|
1216
|
+
Remove all elements in this collection from their respective pages.
|
1217
|
+
|
1218
|
+
This method removes elements from the page's _element_mgr storage.
|
1219
|
+
It's particularly useful for removing OCR elements before applying new OCR.
|
1220
|
+
|
1221
|
+
Returns:
|
1222
|
+
int: Number of elements successfully removed
|
1223
|
+
"""
|
1224
|
+
if not self._elements:
|
1225
|
+
return 0
|
1226
|
+
|
1227
|
+
removed_count = 0
|
1228
|
+
|
1229
|
+
for element in self._elements:
|
1230
|
+
# Each element should have a reference to its page
|
1231
|
+
if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
|
1232
|
+
element_mgr = element.page._element_mgr
|
1233
|
+
|
1234
|
+
# Determine element type
|
1235
|
+
element_type = getattr(element, "object_type", None)
|
1236
|
+
if element_type:
|
1237
|
+
# Convert to plural form expected by element_mgr
|
1238
|
+
if element_type == "word":
|
1239
|
+
element_type = "words"
|
1240
|
+
elif element_type == "char":
|
1241
|
+
element_type = "chars"
|
1242
|
+
elif element_type == "rect":
|
1243
|
+
element_type = "rects"
|
1244
|
+
elif element_type == "line":
|
1245
|
+
element_type = "lines"
|
1246
|
+
|
1247
|
+
# Try to remove from the element manager
|
1248
|
+
if hasattr(element_mgr, "remove_element"):
|
1249
|
+
success = element_mgr.remove_element(element, element_type)
|
1250
|
+
if success:
|
1251
|
+
removed_count += 1
|
1252
|
+
else:
|
1253
|
+
logger.warning("ElementManager does not have remove_element method")
|
1254
|
+
else:
|
1255
|
+
logger.warning(f"Element has no page or page has no _element_mgr: {element}")
|
1256
|
+
|
1257
|
+
return removed_count
|
1258
|
+
|
1259
|
+
# --- Classification Method --- #
|
1260
|
+
def classify_all(
|
1261
|
+
self,
|
1262
|
+
categories: List[str],
|
1263
|
+
model: Optional[str] = None,
|
1264
|
+
using: Optional[str] = None,
|
1265
|
+
min_confidence: float = 0.0,
|
1266
|
+
analysis_key: str = 'classification',
|
1267
|
+
multi_label: bool = False,
|
1268
|
+
batch_size: int = 8,
|
1269
|
+
max_workers: Optional[int] = None,
|
1270
|
+
progress_bar: bool = True,
|
1271
|
+
**kwargs
|
1272
|
+
):
|
1273
|
+
"""Classifies all elements in the collection in batch.
|
1274
|
+
|
1275
|
+
Args:
|
1276
|
+
categories: List of category labels.
|
1277
|
+
model: Model ID (or alias 'text', 'vision').
|
1278
|
+
using: Optional processing mode ('text' or 'vision'). Inferred if None.
|
1279
|
+
min_confidence: Minimum confidence threshold.
|
1280
|
+
analysis_key: Key for storing results in element.analyses.
|
1281
|
+
multi_label: Allow multiple labels per item.
|
1282
|
+
batch_size: Size of batches passed to the inference pipeline.
|
1283
|
+
max_workers: (Not currently used for classification batching which is
|
1284
|
+
handled by the underlying pipeline).
|
1285
|
+
progress_bar: Display a progress bar.
|
1286
|
+
**kwargs: Additional arguments for the ClassificationManager.
|
1287
|
+
"""
|
1288
|
+
if not self.elements:
|
1289
|
+
logger.info("ElementCollection is empty, skipping classification.")
|
1290
|
+
return self
|
1291
|
+
|
1292
|
+
# Requires access to the PDF's manager. Assume first element has it.
|
1293
|
+
first_element = self.elements[0]
|
1294
|
+
manager_source = None
|
1295
|
+
if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
|
1296
|
+
manager_source = first_element.page.pdf
|
1297
|
+
elif hasattr(first_element, 'pdf'): # Maybe it's a PageCollection?
|
1298
|
+
manager_source = first_element.pdf
|
1299
|
+
|
1300
|
+
if not manager_source or not hasattr(manager_source, 'get_manager'):
|
1301
|
+
raise RuntimeError("Cannot access ClassificationManager via elements.")
|
1302
|
+
|
1303
|
+
try:
|
1304
|
+
manager = manager_source.get_manager('classification')
|
1305
|
+
except Exception as e:
|
1306
|
+
raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
|
1307
|
+
|
1308
|
+
if not manager or not manager.is_available():
|
1309
|
+
raise RuntimeError("ClassificationManager is not available.")
|
1310
|
+
|
1311
|
+
# Determine engine type early for content gathering
|
1312
|
+
inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
|
1313
|
+
|
1314
|
+
# Gather content from all elements
|
1315
|
+
items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
|
1316
|
+
original_elements: List[Any] = []
|
1317
|
+
logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
|
1318
|
+
for element in self.elements:
|
1319
|
+
if not isinstance(element, ClassificationMixin):
|
1320
|
+
logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
|
1321
|
+
continue
|
1322
|
+
try:
|
1323
|
+
# Delegate content fetching to the element itself
|
1324
|
+
content = element._get_classification_content(model_type=inferred_using, **kwargs)
|
1325
|
+
items_to_classify.append(content)
|
1326
|
+
original_elements.append(element)
|
1327
|
+
except (ValueError, NotImplementedError) as e:
|
1328
|
+
logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
|
1329
|
+
except Exception as e:
|
1330
|
+
logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
|
1331
|
+
|
1332
|
+
if not items_to_classify:
|
1333
|
+
logger.warning("No content could be gathered from elements for batch classification.")
|
1334
|
+
return self
|
1335
|
+
|
1336
|
+
logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
|
1337
|
+
|
1338
|
+
# Call manager's batch classify
|
1339
|
+
batch_results: List[ClassificationResult] = manager.classify_batch(
|
1340
|
+
item_contents=items_to_classify,
|
1341
|
+
categories=categories,
|
1342
|
+
model_id=model,
|
1343
|
+
using=inferred_using,
|
1344
|
+
min_confidence=min_confidence,
|
1345
|
+
multi_label=multi_label,
|
1346
|
+
batch_size=batch_size,
|
1347
|
+
progress_bar=progress_bar,
|
1348
|
+
**kwargs
|
1349
|
+
)
|
1350
|
+
|
1351
|
+
# Assign results back to elements
|
1352
|
+
if len(batch_results) != len(original_elements):
|
1353
|
+
logger.error(
|
1354
|
+
f"Batch classification result count ({len(batch_results)}) mismatch "
|
1355
|
+
f"with elements processed ({len(original_elements)}). Cannot assign results."
|
1356
|
+
)
|
1357
|
+
# Decide how to handle mismatch - maybe store errors?
|
1358
|
+
else:
|
1359
|
+
logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
|
1360
|
+
for element, result_obj in zip(original_elements, batch_results):
|
1361
|
+
try:
|
1362
|
+
if not hasattr(element, 'analyses') or element.analyses is None:
|
1363
|
+
element.analyses = {}
|
1364
|
+
element.analyses[analysis_key] = result_obj
|
1365
|
+
except Exception as e:
|
1366
|
+
logger.warning(f"Failed to store classification result for {element!r}: {e}")
|
1367
|
+
|
1368
|
+
return self
|
1369
|
+
# --- End Classification Method --- #
|
1370
|
+
|
1158
1371
|
|
1159
|
-
class PageCollection(Generic[P]):
|
1372
|
+
class PageCollection(Generic[P], ApplyMixin):
|
1160
1373
|
"""
|
1161
1374
|
A collection of PDF pages with cross-page operations.
|
1162
1375
|
|
@@ -1221,6 +1434,7 @@ class PageCollection(Generic[P]):
|
|
1221
1434
|
device: Optional[str] = None,
|
1222
1435
|
resolution: Optional[int] = None, # DPI for rendering
|
1223
1436
|
apply_exclusions: bool = True, # New parameter
|
1437
|
+
replace: bool = True, # Whether to replace existing OCR elements
|
1224
1438
|
# --- Engine-Specific Options ---
|
1225
1439
|
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
1226
1440
|
) -> "PageCollection[P]":
|
@@ -1240,6 +1454,8 @@ class PageCollection(Generic[P]):
|
|
1240
1454
|
apply_exclusions: If True (default), render page images for OCR with
|
1241
1455
|
excluded areas masked (whited out). If False, OCR
|
1242
1456
|
the raw page images without masking exclusions.
|
1457
|
+
replace: If True (default), remove any existing OCR elements before
|
1458
|
+
adding new ones. If False, add new OCR elements to existing ones.
|
1243
1459
|
options: An engine-specific options object (e.g., EasyOCROptions) or dict.
|
1244
1460
|
|
1245
1461
|
Returns:
|
@@ -1277,6 +1493,7 @@ class PageCollection(Generic[P]):
|
|
1277
1493
|
device=device,
|
1278
1494
|
resolution=resolution,
|
1279
1495
|
apply_exclusions=apply_exclusions, # Pass down
|
1496
|
+
replace=replace, # Pass the replace parameter
|
1280
1497
|
options=options,
|
1281
1498
|
)
|
1282
1499
|
# The PDF method modifies the Page objects directly by adding elements.
|
@@ -1324,10 +1541,12 @@ class PageCollection(Generic[P]):
|
|
1324
1541
|
def correct_ocr(
|
1325
1542
|
self,
|
1326
1543
|
correction_callback: Callable[[Any], Optional[str]],
|
1544
|
+
max_workers: Optional[int] = None,
|
1327
1545
|
) -> "PageCollection[P]":
|
1328
1546
|
"""
|
1329
1547
|
Applies corrections to OCR-generated text elements across all pages
|
1330
|
-
in this collection using a user-provided callback function
|
1548
|
+
in this collection using a user-provided callback function, executed
|
1549
|
+
in parallel if `max_workers` is specified.
|
1331
1550
|
|
1332
1551
|
This method delegates to the parent PDF's `correct_ocr` method,
|
1333
1552
|
targeting all pages within this collection.
|
@@ -1335,10 +1554,11 @@ class PageCollection(Generic[P]):
|
|
1335
1554
|
Args:
|
1336
1555
|
correction_callback: A function that accepts a single argument (an element
|
1337
1556
|
object) and returns `Optional[str]` (new text or None).
|
1557
|
+
max_workers: The maximum number of worker threads to use for parallel
|
1558
|
+
correction on each page. If None, defaults are used.
|
1338
1559
|
|
1339
1560
|
Returns:
|
1340
|
-
|
1341
|
-
{'elements_checked': total_checked, 'corrections_applied': total_applied}
|
1561
|
+
Self for method chaining.
|
1342
1562
|
|
1343
1563
|
Raises:
|
1344
1564
|
RuntimeError: If the collection is empty, pages lack a parent PDF reference,
|
@@ -1346,17 +1566,28 @@ class PageCollection(Generic[P]):
|
|
1346
1566
|
"""
|
1347
1567
|
if not self.pages:
|
1348
1568
|
logger.warning("Cannot correct OCR for an empty PageCollection.")
|
1569
|
+
# Return self even if empty to maintain chaining consistency
|
1570
|
+
return self
|
1349
1571
|
|
1350
1572
|
# Assume all pages share the same parent PDF object
|
1351
1573
|
parent_pdf = self.pages[0]._parent
|
1574
|
+
if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
|
1575
|
+
raise RuntimeError(
|
1576
|
+
"Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
|
1577
|
+
)
|
1352
1578
|
|
1353
1579
|
page_indices = [p.index for p in self.pages]
|
1354
1580
|
logger.info(
|
1355
|
-
f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
|
1581
|
+
f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
|
1356
1582
|
)
|
1357
1583
|
|
1358
1584
|
# Delegate the call to the parent PDF object for the relevant pages
|
1359
|
-
|
1585
|
+
# Pass the max_workers parameter down
|
1586
|
+
parent_pdf.correct_ocr(
|
1587
|
+
correction_callback=correction_callback,
|
1588
|
+
pages=page_indices,
|
1589
|
+
max_workers=max_workers # Pass it here
|
1590
|
+
)
|
1360
1591
|
|
1361
1592
|
return self
|
1362
1593
|
|