natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -11,17 +11,22 @@ from typing import (
11
11
  Tuple,
12
12
  TypeVar,
13
13
  Union,
14
+ Iterable,
14
15
  )
15
16
 
16
17
  from pdfplumber.utils.geometry import objects_to_bbox
18
+ from tqdm.auto import tqdm
17
19
 
18
20
  # New Imports
19
21
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
20
22
 
21
- from natural_pdf.elements.text import TextElement # Needed for isinstance check
23
+ from natural_pdf.elements.text import TextElement
22
24
  from natural_pdf.ocr import OCROptions
23
25
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
26
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
27
+ from natural_pdf.classification.mixin import ClassificationMixin
28
+ from natural_pdf.classification.manager import ClassificationManager
29
+ from natural_pdf.collections.mixins import ApplyMixin
25
30
 
26
31
  logger = logging.getLogger(__name__)
27
32
 
@@ -33,7 +38,7 @@ T = TypeVar("T")
33
38
  P = TypeVar("P", bound="Page")
34
39
 
35
40
 
36
- class ElementCollection(Generic[T]):
41
+ class ElementCollection(Generic[T], ApplyMixin):
37
42
  """
38
43
  Collection of PDF elements with batch operations.
39
44
  """
@@ -83,12 +88,55 @@ class ElementCollection(Generic[T]):
83
88
  """Get the last element in the collection."""
84
89
  return self._elements[-1] if self._elements else None
85
90
 
91
+ def _are_on_multiple_pages(self) -> bool:
92
+ """
93
+ Check if elements in this collection span multiple pages.
94
+
95
+ Returns:
96
+ True if elements are on different pages, False otherwise
97
+ """
98
+ if not self._elements:
99
+ return False
100
+
101
+ # Get the page index of the first element
102
+ if not hasattr(self._elements[0], "page"):
103
+ return False
104
+
105
+ first_page_idx = self._elements[0].page.index
106
+
107
+ # Check if any element is on a different page
108
+ return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
109
+
110
+ def _are_on_multiple_pdfs(self) -> bool:
111
+ """
112
+ Check if elements in this collection span multiple PDFs.
113
+
114
+ Returns:
115
+ True if elements are from different PDFs, False otherwise
116
+ """
117
+ if not self._elements:
118
+ return False
119
+
120
+ # Get the PDF of the first element
121
+ if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
122
+ return False
123
+
124
+ first_pdf = self._elements[0].page.pdf
125
+
126
+ # Check if any element is from a different PDF
127
+ return any(
128
+ hasattr(e, "page") and
129
+ hasattr(e.page, "pdf") and
130
+ e.page.pdf is not first_pdf
131
+ for e in self._elements
132
+ )
133
+
86
134
  def highest(self) -> Optional["Element"]:
87
135
  """
88
136
  Get element with the smallest top y-coordinate (highest on page).
89
137
 
90
138
  Raises:
91
- ValueError: If elements are on multiple pages
139
+ ValueError: If elements are on multiple pages or multiple PDFs
92
140
 
93
141
  Returns:
94
142
  Element with smallest top value or None if empty
@@ -96,7 +144,9 @@ class ElementCollection(Generic[T]):
96
144
  if not self._elements:
97
145
  return None
98
146
 
99
- # Check if elements are on multiple pages
147
+ # Check if elements are on multiple pages or PDFs
148
+ if self._are_on_multiple_pdfs():
149
+ raise ValueError("Cannot determine highest element across multiple PDFs")
100
150
  if self._are_on_multiple_pages():
101
151
  raise ValueError("Cannot determine highest element across multiple pages")
102
152
 
@@ -107,7 +157,7 @@ class ElementCollection(Generic[T]):
107
157
  Get element with the largest bottom y-coordinate (lowest on page).
108
158
 
109
159
  Raises:
110
- ValueError: If elements are on multiple pages
160
+ ValueError: If elements are on multiple pages or multiple PDFs
111
161
 
112
162
  Returns:
113
163
  Element with largest bottom value or None if empty
@@ -115,7 +165,9 @@ class ElementCollection(Generic[T]):
115
165
  if not self._elements:
116
166
  return None
117
167
 
118
- # Check if elements are on multiple pages
168
+ # Check if elements are on multiple pages or PDFs
169
+ if self._are_on_multiple_pdfs():
170
+ raise ValueError("Cannot determine lowest element across multiple PDFs")
119
171
  if self._are_on_multiple_pages():
120
172
  raise ValueError("Cannot determine lowest element across multiple pages")
121
173
 
@@ -126,7 +178,7 @@ class ElementCollection(Generic[T]):
126
178
  Get element with the smallest x0 coordinate (leftmost on page).
127
179
 
128
180
  Raises:
129
- ValueError: If elements are on multiple pages
181
+ ValueError: If elements are on multiple pages or multiple PDFs
130
182
 
131
183
  Returns:
132
184
  Element with smallest x0 value or None if empty
@@ -134,7 +186,9 @@ class ElementCollection(Generic[T]):
134
186
  if not self._elements:
135
187
  return None
136
188
 
137
- # Check if elements are on multiple pages
189
+ # Check if elements are on multiple pages or PDFs
190
+ if self._are_on_multiple_pdfs():
191
+ raise ValueError("Cannot determine leftmost element across multiple PDFs")
138
192
  if self._are_on_multiple_pages():
139
193
  raise ValueError("Cannot determine leftmost element across multiple pages")
140
194
 
@@ -145,7 +199,7 @@ class ElementCollection(Generic[T]):
145
199
  Get element with the largest x1 coordinate (rightmost on page).
146
200
 
147
201
  Raises:
148
- ValueError: If elements are on multiple pages
202
+ ValueError: If elements are on multiple pages or multiple PDFs
149
203
 
150
204
  Returns:
151
205
  Element with largest x1 value or None if empty
@@ -153,31 +207,14 @@ class ElementCollection(Generic[T]):
153
207
  if not self._elements:
154
208
  return None
155
209
 
156
- # Check if elements are on multiple pages
210
+ # Check if elements are on multiple pages or PDFs
211
+ if self._are_on_multiple_pdfs():
212
+ raise ValueError("Cannot determine rightmost element across multiple PDFs")
157
213
  if self._are_on_multiple_pages():
158
214
  raise ValueError("Cannot determine rightmost element across multiple pages")
159
215
 
160
216
  return max(self._elements, key=lambda e: e.x1)
161
217
 
162
- def _are_on_multiple_pages(self) -> bool:
163
- """
164
- Check if elements in this collection span multiple pages.
165
-
166
- Returns:
167
- True if elements are on different pages, False otherwise
168
- """
169
- if not self._elements:
170
- return False
171
-
172
- # Get the page index of the first element
173
- if not hasattr(self._elements[0], "page"):
174
- return False
175
-
176
- first_page_idx = self._elements[0].page.index
177
-
178
- # Check if any element is on a different page
179
- return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
180
-
181
218
  def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
182
219
  """
183
220
  Remove elements that are within any of the specified regions.
@@ -359,6 +396,9 @@ class ElementCollection(Generic[T]):
359
396
 
360
397
  Uses grouping logic based on parameters (defaulting to grouping by type).
361
398
 
399
+ Note: Elements must be from the same PDF for this operation to work properly,
400
+ as each PDF has its own highlighting service.
401
+
362
402
  Args:
363
403
  label: Optional explicit label for the entire collection. If provided,
364
404
  all elements are highlighted as a single group with this label,
@@ -389,8 +429,12 @@ class ElementCollection(Generic[T]):
389
429
  AttributeError: If 'group_by' is provided but the attribute doesn't exist
390
430
  on some elements.
391
431
  ValueError: If 'label_format' is provided but contains invalid keys for
392
- element attributes.
432
+ element attributes, or if elements span multiple PDFs.
393
433
  """
434
+ # Check if elements span multiple PDFs
435
+ if self._are_on_multiple_pdfs():
436
+ raise ValueError("highlight() does not support elements from multiple PDFs")
437
+
394
438
  # 1. Prepare the highlight data based on parameters
395
439
  highlight_data_list = self._prepare_highlight_data(
396
440
  distinct=distinct,
@@ -761,7 +805,8 @@ class ElementCollection(Generic[T]):
761
805
  Generates a temporary preview image highlighting elements in this collection
762
806
  on their page, ignoring any persistent highlights.
763
807
 
764
- Currently only supports collections where all elements are on the same page.
808
+ Currently only supports collections where all elements are on the same page
809
+ of the same PDF.
765
810
 
766
811
  Allows grouping and coloring elements based on attributes, similar to the
767
812
  persistent `highlight()` method, but only for this temporary view.
@@ -780,14 +825,20 @@ class ElementCollection(Generic[T]):
780
825
 
781
826
  Returns:
782
827
  PIL Image object of the temporary preview, or None if rendering fails or
783
- elements span multiple pages.
828
+ elements span multiple pages/PDFs.
784
829
 
785
830
  Raises:
786
- ValueError: If the collection is empty or elements are on different pages.
831
+ ValueError: If the collection is empty or elements are on different pages/PDFs.
787
832
  """
788
833
  if not self._elements:
789
834
  raise ValueError("Cannot show an empty collection.")
790
835
 
836
+ # Check if elements are on multiple PDFs
837
+ if self._are_on_multiple_pdfs():
838
+ raise ValueError(
839
+ "show() currently only supports collections where all elements are from the same PDF."
840
+ )
841
+
791
842
  # Check if elements are on multiple pages
792
843
  if self._are_on_multiple_pages():
793
844
  raise ValueError(
@@ -1122,10 +1173,12 @@ class ElementCollection(Generic[T]):
1122
1173
  def correct_ocr(
1123
1174
  self,
1124
1175
  correction_callback: Callable[[Any], Optional[str]],
1176
+ max_workers: Optional[int] = None,
1125
1177
  ) -> "ElementCollection":
1126
1178
  """
1127
1179
  Applies corrections to OCR-generated text elements within this collection
1128
- using a user-provided callback function.
1180
+ using a user-provided callback function, executed
1181
+ in parallel if `max_workers` is specified.
1129
1182
 
1130
1183
  Iterates through elements currently in the collection. If an element's
1131
1184
  'source' attribute starts with 'ocr', it calls the `correction_callback`
@@ -1143,6 +1196,8 @@ class ElementCollection(Generic[T]):
1143
1196
  Args:
1144
1197
  correction_callback: A function accepting an element and returning
1145
1198
  `Optional[str]` (new text or None).
1199
+ max_workers: The maximum number of worker threads to use for parallel
1200
+ correction on each page. If None, defaults are used.
1146
1201
 
1147
1202
  Returns:
1148
1203
  Self for method chaining.
@@ -1152,11 +1207,169 @@ class ElementCollection(Generic[T]):
1152
1207
  elements=self._elements,
1153
1208
  correction_callback=correction_callback,
1154
1209
  caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1210
+ max_workers=max_workers,
1155
1211
  )
1156
1212
  return self # Return self for chaining
1157
1213
 
1214
+ def remove(self) -> int:
1215
+ """
1216
+ Remove all elements in this collection from their respective pages.
1217
+
1218
+ This method removes elements from the page's _element_mgr storage.
1219
+ It's particularly useful for removing OCR elements before applying new OCR.
1220
+
1221
+ Returns:
1222
+ int: Number of elements successfully removed
1223
+ """
1224
+ if not self._elements:
1225
+ return 0
1226
+
1227
+ removed_count = 0
1228
+
1229
+ for element in self._elements:
1230
+ # Each element should have a reference to its page
1231
+ if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
1232
+ element_mgr = element.page._element_mgr
1233
+
1234
+ # Determine element type
1235
+ element_type = getattr(element, "object_type", None)
1236
+ if element_type:
1237
+ # Convert to plural form expected by element_mgr
1238
+ if element_type == "word":
1239
+ element_type = "words"
1240
+ elif element_type == "char":
1241
+ element_type = "chars"
1242
+ elif element_type == "rect":
1243
+ element_type = "rects"
1244
+ elif element_type == "line":
1245
+ element_type = "lines"
1246
+
1247
+ # Try to remove from the element manager
1248
+ if hasattr(element_mgr, "remove_element"):
1249
+ success = element_mgr.remove_element(element, element_type)
1250
+ if success:
1251
+ removed_count += 1
1252
+ else:
1253
+ logger.warning("ElementManager does not have remove_element method")
1254
+ else:
1255
+ logger.warning(f"Element has no page or page has no _element_mgr: {element}")
1256
+
1257
+ return removed_count
1258
+
1259
+ # --- Classification Method --- #
1260
+ def classify_all(
1261
+ self,
1262
+ categories: List[str],
1263
+ model: Optional[str] = None,
1264
+ using: Optional[str] = None,
1265
+ min_confidence: float = 0.0,
1266
+ analysis_key: str = 'classification',
1267
+ multi_label: bool = False,
1268
+ batch_size: int = 8,
1269
+ max_workers: Optional[int] = None,
1270
+ progress_bar: bool = True,
1271
+ **kwargs
1272
+ ):
1273
+ """Classifies all elements in the collection in batch.
1274
+
1275
+ Args:
1276
+ categories: List of category labels.
1277
+ model: Model ID (or alias 'text', 'vision').
1278
+ using: Optional processing mode ('text' or 'vision'). Inferred if None.
1279
+ min_confidence: Minimum confidence threshold.
1280
+ analysis_key: Key for storing results in element.analyses.
1281
+ multi_label: Allow multiple labels per item.
1282
+ batch_size: Size of batches passed to the inference pipeline.
1283
+ max_workers: (Not currently used for classification batching which is
1284
+ handled by the underlying pipeline).
1285
+ progress_bar: Display a progress bar.
1286
+ **kwargs: Additional arguments for the ClassificationManager.
1287
+ """
1288
+ if not self.elements:
1289
+ logger.info("ElementCollection is empty, skipping classification.")
1290
+ return self
1291
+
1292
+ # Requires access to the PDF's manager. Assume first element has it.
1293
+ first_element = self.elements[0]
1294
+ manager_source = None
1295
+ if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
1296
+ manager_source = first_element.page.pdf
1297
+ elif hasattr(first_element, 'pdf'): # Maybe it's a PageCollection?
1298
+ manager_source = first_element.pdf
1299
+
1300
+ if not manager_source or not hasattr(manager_source, 'get_manager'):
1301
+ raise RuntimeError("Cannot access ClassificationManager via elements.")
1302
+
1303
+ try:
1304
+ manager = manager_source.get_manager('classification')
1305
+ except Exception as e:
1306
+ raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
1307
+
1308
+ if not manager or not manager.is_available():
1309
+ raise RuntimeError("ClassificationManager is not available.")
1310
+
1311
+ # Determine engine type early for content gathering
1312
+ inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
1313
+
1314
+ # Gather content from all elements
1315
+ items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
1316
+ original_elements: List[Any] = []
1317
+ logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
1318
+ for element in self.elements:
1319
+ if not isinstance(element, ClassificationMixin):
1320
+ logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
1321
+ continue
1322
+ try:
1323
+ # Delegate content fetching to the element itself
1324
+ content = element._get_classification_content(model_type=inferred_using, **kwargs)
1325
+ items_to_classify.append(content)
1326
+ original_elements.append(element)
1327
+ except (ValueError, NotImplementedError) as e:
1328
+ logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
1329
+ except Exception as e:
1330
+ logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
1331
+
1332
+ if not items_to_classify:
1333
+ logger.warning("No content could be gathered from elements for batch classification.")
1334
+ return self
1335
+
1336
+ logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
1337
+
1338
+ # Call manager's batch classify
1339
+ batch_results: List[ClassificationResult] = manager.classify_batch(
1340
+ item_contents=items_to_classify,
1341
+ categories=categories,
1342
+ model_id=model,
1343
+ using=inferred_using,
1344
+ min_confidence=min_confidence,
1345
+ multi_label=multi_label,
1346
+ batch_size=batch_size,
1347
+ progress_bar=progress_bar,
1348
+ **kwargs
1349
+ )
1350
+
1351
+ # Assign results back to elements
1352
+ if len(batch_results) != len(original_elements):
1353
+ logger.error(
1354
+ f"Batch classification result count ({len(batch_results)}) mismatch "
1355
+ f"with elements processed ({len(original_elements)}). Cannot assign results."
1356
+ )
1357
+ # Decide how to handle mismatch - maybe store errors?
1358
+ else:
1359
+ logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
1360
+ for element, result_obj in zip(original_elements, batch_results):
1361
+ try:
1362
+ if not hasattr(element, 'analyses') or element.analyses is None:
1363
+ element.analyses = {}
1364
+ element.analyses[analysis_key] = result_obj
1365
+ except Exception as e:
1366
+ logger.warning(f"Failed to store classification result for {element!r}: {e}")
1367
+
1368
+ return self
1369
+ # --- End Classification Method --- #
1370
+
1158
1371
 
1159
- class PageCollection(Generic[P]):
1372
+ class PageCollection(Generic[P], ApplyMixin):
1160
1373
  """
1161
1374
  A collection of PDF pages with cross-page operations.
1162
1375
 
@@ -1221,6 +1434,7 @@ class PageCollection(Generic[P]):
1221
1434
  device: Optional[str] = None,
1222
1435
  resolution: Optional[int] = None, # DPI for rendering
1223
1436
  apply_exclusions: bool = True, # New parameter
1437
+ replace: bool = True, # Whether to replace existing OCR elements
1224
1438
  # --- Engine-Specific Options ---
1225
1439
  options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1226
1440
  ) -> "PageCollection[P]":
@@ -1240,6 +1454,8 @@ class PageCollection(Generic[P]):
1240
1454
  apply_exclusions: If True (default), render page images for OCR with
1241
1455
  excluded areas masked (whited out). If False, OCR
1242
1456
  the raw page images without masking exclusions.
1457
+ replace: If True (default), remove any existing OCR elements before
1458
+ adding new ones. If False, add new OCR elements to existing ones.
1243
1459
  options: An engine-specific options object (e.g., EasyOCROptions) or dict.
1244
1460
 
1245
1461
  Returns:
@@ -1277,6 +1493,7 @@ class PageCollection(Generic[P]):
1277
1493
  device=device,
1278
1494
  resolution=resolution,
1279
1495
  apply_exclusions=apply_exclusions, # Pass down
1496
+ replace=replace, # Pass the replace parameter
1280
1497
  options=options,
1281
1498
  )
1282
1499
  # The PDF method modifies the Page objects directly by adding elements.
@@ -1324,10 +1541,12 @@ class PageCollection(Generic[P]):
1324
1541
  def correct_ocr(
1325
1542
  self,
1326
1543
  correction_callback: Callable[[Any], Optional[str]],
1544
+ max_workers: Optional[int] = None,
1327
1545
  ) -> "PageCollection[P]":
1328
1546
  """
1329
1547
  Applies corrections to OCR-generated text elements across all pages
1330
- in this collection using a user-provided callback function.
1548
+ in this collection using a user-provided callback function, executed
1549
+ in parallel if `max_workers` is specified.
1331
1550
 
1332
1551
  This method delegates to the parent PDF's `correct_ocr` method,
1333
1552
  targeting all pages within this collection.
@@ -1335,10 +1554,11 @@ class PageCollection(Generic[P]):
1335
1554
  Args:
1336
1555
  correction_callback: A function that accepts a single argument (an element
1337
1556
  object) and returns `Optional[str]` (new text or None).
1557
+ max_workers: The maximum number of worker threads to use for parallel
1558
+ correction on each page. If None, defaults are used.
1338
1559
 
1339
1560
  Returns:
1340
- A dictionary containing aggregate statistics for the process across all pages:
1341
- {'elements_checked': total_checked, 'corrections_applied': total_applied}
1561
+ Self for method chaining.
1342
1562
 
1343
1563
  Raises:
1344
1564
  RuntimeError: If the collection is empty, pages lack a parent PDF reference,
@@ -1346,17 +1566,28 @@ class PageCollection(Generic[P]):
1346
1566
  """
1347
1567
  if not self.pages:
1348
1568
  logger.warning("Cannot correct OCR for an empty PageCollection.")
1569
+ # Return self even if empty to maintain chaining consistency
1570
+ return self
1349
1571
 
1350
1572
  # Assume all pages share the same parent PDF object
1351
1573
  parent_pdf = self.pages[0]._parent
1574
+ if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
1575
+ raise RuntimeError(
1576
+ "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
1577
+ )
1352
1578
 
1353
1579
  page_indices = [p.index for p in self.pages]
1354
1580
  logger.info(
1355
- f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
1581
+ f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
1356
1582
  )
1357
1583
 
1358
1584
  # Delegate the call to the parent PDF object for the relevant pages
1359
- parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
1585
+ # Pass the max_workers parameter down
1586
+ parent_pdf.correct_ocr(
1587
+ correction_callback=correction_callback,
1588
+ pages=page_indices,
1589
+ max_workers=max_workers # Pass it here
1590
+ )
1360
1591
 
1361
1592
  return self
1362
1593