natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,41 @@
1
1
  import logging
2
+ from collections.abc import MutableSequence
3
+ from pathlib import Path
2
4
  from typing import (
3
5
  TYPE_CHECKING,
4
6
  Any,
5
7
  Callable,
6
8
  Dict,
7
9
  Generic,
10
+ Iterable,
8
11
  Iterator,
9
12
  List,
10
13
  Optional,
14
+ Sequence,
11
15
  Tuple,
16
+ Type,
12
17
  TypeVar,
13
18
  Union,
19
+ overload,
14
20
  )
15
21
 
16
22
  from pdfplumber.utils.geometry import objects_to_bbox
17
23
 
18
24
  # New Imports
19
25
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
20
-
21
- from natural_pdf.elements.text import TextElement # Needed for isinstance check
26
+ from tqdm.auto import tqdm
27
+
28
+ from natural_pdf.classification.manager import ClassificationManager
29
+ from natural_pdf.classification.mixin import ClassificationMixin
30
+ from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
31
+ from natural_pdf.core.pdf import PDF
32
+ from natural_pdf.elements.base import Element
33
+ from natural_pdf.elements.region import Region
34
+ from natural_pdf.elements.text import TextElement
35
+ from natural_pdf.export.mixin import ExportMixin
22
36
  from natural_pdf.ocr import OCROptions
37
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
23
38
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
25
39
 
26
40
  logger = logging.getLogger(__name__)
27
41
 
@@ -33,7 +47,9 @@ T = TypeVar("T")
33
47
  P = TypeVar("P", bound="Page")
34
48
 
35
49
 
36
- class ElementCollection(Generic[T]):
50
+ class ElementCollection(
51
+ Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
52
+ ):
37
53
  """
38
54
  Collection of PDF elements with batch operations.
39
55
  """
@@ -55,10 +71,6 @@ class ElementCollection(Generic[T]):
55
71
  """Get an element by index."""
56
72
  return self._elements[index]
57
73
 
58
- def __iter__(self):
59
- """Iterate over elements."""
60
- return iter(self._elements)
61
-
62
74
  def __repr__(self) -> str:
63
75
  """Return a string representation showing the element count."""
64
76
  element_type = "Mixed"
@@ -68,6 +80,20 @@ class ElementCollection(Generic[T]):
68
80
  element_type = types.pop()
69
81
  return f"<ElementCollection[{element_type}](count={len(self)})>"
70
82
 
83
+ def __add__(self, other: "ElementCollection") -> "ElementCollection":
84
+ if not isinstance(other, ElementCollection):
85
+ return NotImplemented
86
+ return ElementCollection(self._elements + other._elements)
87
+
88
+ def __setitem__(self, index, value):
89
+ self._elements[index] = value
90
+
91
+ def __delitem__(self, index):
92
+ del self._elements[index]
93
+
94
+ def insert(self, index, value):
95
+ self._elements.insert(index, value)
96
+
71
97
  @property
72
98
  def elements(self) -> List["Element"]:
73
99
  """Get the elements in this collection."""
@@ -83,12 +109,53 @@ class ElementCollection(Generic[T]):
83
109
  """Get the last element in the collection."""
84
110
  return self._elements[-1] if self._elements else None
85
111
 
112
+ def _are_on_multiple_pages(self) -> bool:
113
+ """
114
+ Check if elements in this collection span multiple pages.
115
+
116
+ Returns:
117
+ True if elements are on different pages, False otherwise
118
+ """
119
+ if not self._elements:
120
+ return False
121
+
122
+ # Get the page index of the first element
123
+ if not hasattr(self._elements[0], "page"):
124
+ return False
125
+
126
+ first_page_idx = self._elements[0].page.index
127
+
128
+ # Check if any element is on a different page
129
+ return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
130
+
131
+ def _are_on_multiple_pdfs(self) -> bool:
132
+ """
133
+ Check if elements in this collection span multiple PDFs.
134
+
135
+ Returns:
136
+ True if elements are from different PDFs, False otherwise
137
+ """
138
+ if not self._elements:
139
+ return False
140
+
141
+ # Get the PDF of the first element
142
+ if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
143
+ return False
144
+
145
+ first_pdf = self._elements[0].page.pdf
146
+
147
+ # Check if any element is from a different PDF
148
+ return any(
149
+ hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
150
+ for e in self._elements
151
+ )
152
+
86
153
  def highest(self) -> Optional["Element"]:
87
154
  """
88
155
  Get element with the smallest top y-coordinate (highest on page).
89
156
 
90
157
  Raises:
91
- ValueError: If elements are on multiple pages
158
+ ValueError: If elements are on multiple pages or multiple PDFs
92
159
 
93
160
  Returns:
94
161
  Element with smallest top value or None if empty
@@ -96,7 +163,9 @@ class ElementCollection(Generic[T]):
96
163
  if not self._elements:
97
164
  return None
98
165
 
99
- # Check if elements are on multiple pages
166
+ # Check if elements are on multiple pages or PDFs
167
+ if self._are_on_multiple_pdfs():
168
+ raise ValueError("Cannot determine highest element across multiple PDFs")
100
169
  if self._are_on_multiple_pages():
101
170
  raise ValueError("Cannot determine highest element across multiple pages")
102
171
 
@@ -107,7 +176,7 @@ class ElementCollection(Generic[T]):
107
176
  Get element with the largest bottom y-coordinate (lowest on page).
108
177
 
109
178
  Raises:
110
- ValueError: If elements are on multiple pages
179
+ ValueError: If elements are on multiple pages or multiple PDFs
111
180
 
112
181
  Returns:
113
182
  Element with largest bottom value or None if empty
@@ -115,7 +184,9 @@ class ElementCollection(Generic[T]):
115
184
  if not self._elements:
116
185
  return None
117
186
 
118
- # Check if elements are on multiple pages
187
+ # Check if elements are on multiple pages or PDFs
188
+ if self._are_on_multiple_pdfs():
189
+ raise ValueError("Cannot determine lowest element across multiple PDFs")
119
190
  if self._are_on_multiple_pages():
120
191
  raise ValueError("Cannot determine lowest element across multiple pages")
121
192
 
@@ -126,7 +197,7 @@ class ElementCollection(Generic[T]):
126
197
  Get element with the smallest x0 coordinate (leftmost on page).
127
198
 
128
199
  Raises:
129
- ValueError: If elements are on multiple pages
200
+ ValueError: If elements are on multiple pages or multiple PDFs
130
201
 
131
202
  Returns:
132
203
  Element with smallest x0 value or None if empty
@@ -134,7 +205,9 @@ class ElementCollection(Generic[T]):
134
205
  if not self._elements:
135
206
  return None
136
207
 
137
- # Check if elements are on multiple pages
208
+ # Check if elements are on multiple pages or PDFs
209
+ if self._are_on_multiple_pdfs():
210
+ raise ValueError("Cannot determine leftmost element across multiple PDFs")
138
211
  if self._are_on_multiple_pages():
139
212
  raise ValueError("Cannot determine leftmost element across multiple pages")
140
213
 
@@ -145,7 +218,7 @@ class ElementCollection(Generic[T]):
145
218
  Get element with the largest x1 coordinate (rightmost on page).
146
219
 
147
220
  Raises:
148
- ValueError: If elements are on multiple pages
221
+ ValueError: If elements are on multiple pages or multiple PDFs
149
222
 
150
223
  Returns:
151
224
  Element with largest x1 value or None if empty
@@ -153,31 +226,14 @@ class ElementCollection(Generic[T]):
153
226
  if not self._elements:
154
227
  return None
155
228
 
156
- # Check if elements are on multiple pages
229
+ # Check if elements are on multiple pages or PDFs
230
+ if self._are_on_multiple_pdfs():
231
+ raise ValueError("Cannot determine rightmost element across multiple PDFs")
157
232
  if self._are_on_multiple_pages():
158
233
  raise ValueError("Cannot determine rightmost element across multiple pages")
159
234
 
160
235
  return max(self._elements, key=lambda e: e.x1)
161
236
 
162
- def _are_on_multiple_pages(self) -> bool:
163
- """
164
- Check if elements in this collection span multiple pages.
165
-
166
- Returns:
167
- True if elements are on different pages, False otherwise
168
- """
169
- if not self._elements:
170
- return False
171
-
172
- # Get the page index of the first element
173
- if not hasattr(self._elements[0], "page"):
174
- return False
175
-
176
- first_page_idx = self._elements[0].page.index
177
-
178
- # Check if any element is on a different page
179
- return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
180
-
181
237
  def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
182
238
  """
183
239
  Remove elements that are within any of the specified regions.
@@ -359,6 +415,9 @@ class ElementCollection(Generic[T]):
359
415
 
360
416
  Uses grouping logic based on parameters (defaulting to grouping by type).
361
417
 
418
+ Note: Elements must be from the same PDF for this operation to work properly,
419
+ as each PDF has its own highlighting service.
420
+
362
421
  Args:
363
422
  label: Optional explicit label for the entire collection. If provided,
364
423
  all elements are highlighted as a single group with this label,
@@ -389,8 +448,12 @@ class ElementCollection(Generic[T]):
389
448
  AttributeError: If 'group_by' is provided but the attribute doesn't exist
390
449
  on some elements.
391
450
  ValueError: If 'label_format' is provided but contains invalid keys for
392
- element attributes.
451
+ element attributes, or if elements span multiple PDFs.
393
452
  """
453
+ # Check if elements span multiple PDFs
454
+ if self._are_on_multiple_pdfs():
455
+ raise ValueError("highlight() does not support elements from multiple PDFs")
456
+
394
457
  # 1. Prepare the highlight data based on parameters
395
458
  highlight_data_list = self._prepare_highlight_data(
396
459
  distinct=distinct,
@@ -761,7 +824,8 @@ class ElementCollection(Generic[T]):
761
824
  Generates a temporary preview image highlighting elements in this collection
762
825
  on their page, ignoring any persistent highlights.
763
826
 
764
- Currently only supports collections where all elements are on the same page.
827
+ Currently only supports collections where all elements are on the same page
828
+ of the same PDF.
765
829
 
766
830
  Allows grouping and coloring elements based on attributes, similar to the
767
831
  persistent `highlight()` method, but only for this temporary view.
@@ -780,14 +844,20 @@ class ElementCollection(Generic[T]):
780
844
 
781
845
  Returns:
782
846
  PIL Image object of the temporary preview, or None if rendering fails or
783
- elements span multiple pages.
847
+ elements span multiple pages/PDFs.
784
848
 
785
849
  Raises:
786
- ValueError: If the collection is empty or elements are on different pages.
850
+ ValueError: If the collection is empty or elements are on different pages/PDFs.
787
851
  """
788
852
  if not self._elements:
789
853
  raise ValueError("Cannot show an empty collection.")
790
854
 
855
+ # Check if elements are on multiple PDFs
856
+ if self._are_on_multiple_pdfs():
857
+ raise ValueError(
858
+ "show() currently only supports collections where all elements are from the same PDF."
859
+ )
860
+
791
861
  # Check if elements are on multiple pages
792
862
  if self._are_on_multiple_pages():
793
863
  raise ValueError(
@@ -1062,70 +1132,33 @@ class ElementCollection(Generic[T]):
1062
1132
  logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
1063
1133
  return None
1064
1134
 
1065
- def find_all(
1066
- self, selector: str, regex: bool = False, case: bool = True, **kwargs
1067
- ) -> "ElementCollection[T]":
1135
+ def find(self, selector: str, **kwargs) -> "ElementCollection":
1068
1136
  """
1069
- Filter elements within this collection matching the selector.
1137
+ Find elements in this collection matching the selector.
1070
1138
 
1071
1139
  Args:
1072
- selector: CSS-like selector string.
1073
- regex: Whether to use regex for text search in :contains (default: False).
1074
- case: Whether to do case-sensitive text search (default: True).
1075
- **kwargs: Additional filter parameters passed to the selector function.
1076
-
1077
- Returns:
1078
- A new ElementCollection containing only the matching elements from this collection.
1140
+ selector: CSS-like selector string
1141
+ **kwargs: Extra keyword arguments (such as apply_exclusions) forwarded to each element's find()
1079
1142
  """
1080
- if not self._elements:
1081
- return ElementCollection([])
1082
-
1083
- try:
1084
- selector_obj = parse_selector(selector)
1085
- except Exception as e:
1086
- logger.error(f"Error parsing selector '{selector}': {e}")
1087
- return ElementCollection([]) # Return empty on parse error
1088
-
1089
- # Pass regex and case flags to selector function generator
1090
- kwargs["regex"] = regex
1091
- kwargs["case"] = case
1092
-
1093
- try:
1094
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1095
- except Exception as e:
1096
- logger.error(f"Error creating filter function for selector '{selector}': {e}")
1097
- return ElementCollection([]) # Return empty on filter creation error
1098
-
1099
- matching_elements = [element for element in self._elements if filter_func(element)]
1143
+ return self.apply(lambda element: element.find(selector, **kwargs))
1100
1144
 
1101
- # Note: Unlike Page.find_all, this doesn't re-sort.
1102
- # Sorting should be done explicitly on the collection if needed.
1103
-
1104
- return ElementCollection(matching_elements)
1105
-
1106
- def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
1145
+ def extract_each_text(self, **kwargs) -> List[str]:
1107
1146
  """
1108
- Find the first element within this collection matching the selector.
1109
-
1110
- Args:
1111
- selector: CSS-like selector string.
1112
- regex: Whether to use regex for text search in :contains (default: False).
1113
- case: Whether to do case-sensitive text search (default: True).
1114
- **kwargs: Additional filter parameters passed to the selector function.
1115
-
1116
- Returns:
1117
- The first matching element or None.
1147
+ Extract text from each element in this collection.
1118
1148
  """
1119
- results = self.find_all(selector, regex=regex, case=case, **kwargs)
1120
- return results.first
1149
+ return self.apply(
1150
+ lambda element: element.extract_text(**kwargs) if element is not None else None
1151
+ )
1121
1152
 
1122
1153
  def correct_ocr(
1123
1154
  self,
1124
1155
  correction_callback: Callable[[Any], Optional[str]],
1156
+ max_workers: Optional[int] = None,
1125
1157
  ) -> "ElementCollection":
1126
1158
  """
1127
1159
  Applies corrections to OCR-generated text elements within this collection
1128
- using a user-provided callback function.
1160
+ using a user-provided callback function, executed
1161
+ in parallel if `max_workers` is specified.
1129
1162
 
1130
1163
  Iterates through elements currently in the collection. If an element's
1131
1164
  'source' attribute starts with 'ocr', it calls the `correction_callback`
@@ -1143,6 +1176,8 @@ class ElementCollection(Generic[T]):
1143
1176
  Args:
1144
1177
  correction_callback: A function accepting an element and returning
1145
1178
  `Optional[str]` (new text or None).
1179
+ max_workers: The maximum number of worker threads to use for parallel
1180
+ correction on each page. If None, defaults are used.
1146
1181
 
1147
1182
  Returns:
1148
1183
  Self for method chaining.
@@ -1152,11 +1187,296 @@ class ElementCollection(Generic[T]):
1152
1187
  elements=self._elements,
1153
1188
  correction_callback=correction_callback,
1154
1189
  caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1190
+ max_workers=max_workers,
1155
1191
  )
1156
1192
  return self # Return self for chaining
1157
1193
 
1194
+ def remove(self) -> int:
1195
+ """
1196
+ Remove all elements in this collection from their respective pages.
1197
+
1198
+ This method removes elements from the page's _element_mgr storage.
1199
+ It's particularly useful for removing OCR elements before applying new OCR.
1200
+
1201
+ Returns:
1202
+ int: Number of elements successfully removed
1203
+ """
1204
+ if not self._elements:
1205
+ return 0
1206
+
1207
+ removed_count = 0
1208
+
1209
+ for element in self._elements:
1210
+ # Each element should have a reference to its page
1211
+ if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
1212
+ element_mgr = element.page._element_mgr
1213
+
1214
+ # Determine element type
1215
+ element_type = getattr(element, "object_type", None)
1216
+ if element_type:
1217
+ # Convert to plural form expected by element_mgr
1218
+ if element_type == "word":
1219
+ element_type = "words"
1220
+ elif element_type == "char":
1221
+ element_type = "chars"
1222
+ elif element_type == "rect":
1223
+ element_type = "rects"
1224
+ elif element_type == "line":
1225
+ element_type = "lines"
1226
+
1227
+ # Try to remove from the element manager
1228
+ if hasattr(element_mgr, "remove_element"):
1229
+ success = element_mgr.remove_element(element, element_type)
1230
+ if success:
1231
+ removed_count += 1
1232
+ else:
1233
+ logger.warning("ElementManager does not have remove_element method")
1234
+ else:
1235
+ logger.warning(f"Element has no page or page has no _element_mgr: {element}")
1236
+
1237
+ return removed_count
1238
+
1239
+ # --- Classification Method --- #
1240
+ def classify_all(
1241
+ self,
1242
+ categories: List[str],
1243
+ model: Optional[str] = None,
1244
+ using: Optional[str] = None,
1245
+ min_confidence: float = 0.0,
1246
+ analysis_key: str = "classification",
1247
+ multi_label: bool = False,
1248
+ batch_size: int = 8,
1249
+ max_workers: Optional[int] = None,
1250
+ progress_bar: bool = True,
1251
+ **kwargs,
1252
+ ):
1253
+ """Classifies all elements in the collection in batch.
1254
+
1255
+ Args:
1256
+ categories: List of category labels.
1257
+ model: Model ID (or alias 'text', 'vision').
1258
+ using: Optional processing mode ('text' or 'vision'). Inferred if None.
1259
+ min_confidence: Minimum confidence threshold.
1260
+ analysis_key: Key for storing results in element.analyses.
1261
+ multi_label: Allow multiple labels per item.
1262
+ batch_size: Size of batches passed to the inference pipeline.
1263
+ max_workers: (Not currently used for classification batching which is
1264
+ handled by the underlying pipeline).
1265
+ progress_bar: Display a progress bar.
1266
+ **kwargs: Additional arguments for the ClassificationManager.
1267
+ """
1268
+ if not self.elements:
1269
+ logger.info("ElementCollection is empty, skipping classification.")
1270
+ return self
1271
+
1272
+ # Requires access to the PDF's manager. Assume first element has it.
1273
+ first_element = self.elements[0]
1274
+ manager_source = None
1275
+ if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
1276
+ manager_source = first_element.page.pdf
1277
+ elif hasattr(first_element, "pdf"): # Maybe it's a PageCollection?
1278
+ manager_source = first_element.pdf
1279
+
1280
+ if not manager_source or not hasattr(manager_source, "get_manager"):
1281
+ raise RuntimeError("Cannot access ClassificationManager via elements.")
1282
+
1283
+ try:
1284
+ manager = manager_source.get_manager("classification")
1285
+ except Exception as e:
1286
+ raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
1287
+
1288
+ if not manager or not manager.is_available():
1289
+ raise RuntimeError("ClassificationManager is not available.")
1290
+
1291
+ # Determine engine type early for content gathering
1292
+ inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
1293
+
1294
+ # Gather content from all elements
1295
+ items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
1296
+ original_elements: List[Any] = []
1297
+ logger.info(
1298
+ f"Gathering content for {len(self.elements)} elements for batch classification..."
1299
+ )
1300
+ for element in self.elements:
1301
+ if not isinstance(element, ClassificationMixin):
1302
+ logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
1303
+ continue
1304
+ try:
1305
+ # Delegate content fetching to the element itself
1306
+ content = element._get_classification_content(model_type=inferred_using, **kwargs)
1307
+ items_to_classify.append(content)
1308
+ original_elements.append(element)
1309
+ except (ValueError, NotImplementedError) as e:
1310
+ logger.warning(
1311
+ f"Skipping element {element!r}: Cannot get content for classification - {e}"
1312
+ )
1313
+ except Exception as e:
1314
+ logger.warning(
1315
+ f"Skipping element {element!r}: Error getting classification content - {e}"
1316
+ )
1317
+
1318
+ if not items_to_classify:
1319
+ logger.warning("No content could be gathered from elements for batch classification.")
1320
+ return self
1321
+
1322
+ logger.info(
1323
+ f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
1324
+ )
1325
+
1326
+ # Call manager's batch classify
1327
+ batch_results: List[ClassificationResult] = manager.classify_batch(
1328
+ item_contents=items_to_classify,
1329
+ categories=categories,
1330
+ model_id=model,
1331
+ using=inferred_using,
1332
+ min_confidence=min_confidence,
1333
+ multi_label=multi_label,
1334
+ batch_size=batch_size,
1335
+ progress_bar=progress_bar,
1336
+ **kwargs,
1337
+ )
1338
+
1339
+ # Assign results back to elements
1340
+ if len(batch_results) != len(original_elements):
1341
+ logger.error(
1342
+ f"Batch classification result count ({len(batch_results)}) mismatch "
1343
+ f"with elements processed ({len(original_elements)}). Cannot assign results."
1344
+ )
1345
+ # Decide how to handle mismatch - maybe store errors?
1346
+ else:
1347
+ logger.info(
1348
+ f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
1349
+ )
1350
+ for element, result_obj in zip(original_elements, batch_results):
1351
+ try:
1352
+ if not hasattr(element, "analyses") or element.analyses is None:
1353
+ element.analyses = {}
1354
+ element.analyses[analysis_key] = result_obj
1355
+ except Exception as e:
1356
+ logger.warning(f"Failed to store classification result for {element!r}: {e}")
1357
+
1358
+ return self
1359
+
1360
+ # --- End Classification Method --- #
1361
+
1362
+ def _gather_analysis_data(
1363
+ self,
1364
+ analysis_keys: List[str],
1365
+ include_content: bool,
1366
+ include_images: bool,
1367
+ image_dir: Optional[Path],
1368
+ image_format: str,
1369
+ image_resolution: int,
1370
+ ) -> List[Dict[str, Any]]:
1371
+ """
1372
+ Gather analysis data from all elements in the collection.
1373
+
1374
+ Args:
1375
+ analysis_keys: Keys in the analyses dictionary to export
1376
+ include_content: Whether to include extracted text
1377
+ include_images: Whether to export images
1378
+ image_dir: Directory to save images
1379
+ image_format: Format to save images
1380
+ image_resolution: Resolution for exported images
1381
+
1382
+ Returns:
1383
+ List of dictionaries containing analysis data
1384
+ """
1385
+ if not self.elements:
1386
+ logger.warning("No elements found in collection")
1387
+ return []
1388
+
1389
+ all_data = []
1390
+
1391
+ for i, element in enumerate(self.elements):
1392
+ # Base element information
1393
+ element_data = {
1394
+ "element_index": i,
1395
+ "element_type": getattr(element, "type", type(element).__name__),
1396
+ }
1397
+
1398
+ # Add geometry if available
1399
+ for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
1400
+ if hasattr(element, attr):
1401
+ element_data[attr] = getattr(element, attr)
1402
+
1403
+ # Add page information if available
1404
+ if hasattr(element, "page"):
1405
+ page = element.page
1406
+ if page:
1407
+ element_data["page_number"] = getattr(page, "number", None)
1408
+ element_data["pdf_path"] = (
1409
+ getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
1410
+ )
1411
+
1412
+ # Include extracted text if requested
1413
+ if include_content and hasattr(element, "extract_text"):
1414
+ try:
1415
+ element_data["content"] = element.extract_text(preserve_whitespace=True)
1416
+ except Exception as e:
1417
+ logger.error(f"Error extracting text from element {i}: {e}")
1418
+ element_data["content"] = ""
1158
1419
 
1159
- class PageCollection(Generic[P]):
1420
+ # Save image if requested
1421
+ if include_images and hasattr(element, "to_image"):
1422
+ try:
1423
+ # Create identifier for the element
1424
+ pdf_name = "unknown"
1425
+ page_num = "unknown"
1426
+
1427
+ if hasattr(element, "page") and element.page:
1428
+ page_num = element.page.number
1429
+ if hasattr(element.page, "pdf") and element.page.pdf:
1430
+ pdf_name = Path(element.page.pdf.path).stem
1431
+
1432
+ # Create image filename
1433
+ element_type = element_data.get("element_type", "element").lower()
1434
+ image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
1435
+ image_path = image_dir / image_filename
1436
+
1437
+ # Save image
1438
+ element.to_image(
1439
+ path=str(image_path), resolution=image_resolution, include_highlights=True
1440
+ )
1441
+
1442
+ # Add relative path to data
1443
+ element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
1444
+ except Exception as e:
1445
+ logger.error(f"Error saving image for element {i}: {e}")
1446
+ element_data["image_path"] = None
1447
+
1448
+ # Add analyses data
1449
+ if hasattr(element, "analyses"):
1450
+ for key in analysis_keys:
1451
+ if key not in element.analyses:
1452
+ # Skip this key if it doesn't exist - elements might have different analyses
1453
+ logger.warning(f"Analysis key '{key}' not found in element {i}")
1454
+ continue
1455
+
1456
+ # Get the analysis result
1457
+ analysis_result = element.analyses[key]
1458
+
1459
+ # If the result has a to_dict method, use it
1460
+ if hasattr(analysis_result, "to_dict"):
1461
+ analysis_data = analysis_result.to_dict()
1462
+ else:
1463
+ # Otherwise, use the result directly if it's dict-like
1464
+ try:
1465
+ analysis_data = dict(analysis_result)
1466
+ except (TypeError, ValueError):
1467
+ # Last resort: convert to string
1468
+ analysis_data = {"raw_result": str(analysis_result)}
1469
+
1470
+ # Add analysis data to element data with the key as prefix
1471
+ for k, v in analysis_data.items():
1472
+ element_data[f"{key}.{k}"] = v
1473
+
1474
+ all_data.append(element_data)
1475
+
1476
+ return all_data
1477
+
1478
+
1479
+ class PageCollection(Generic[P], ApplyMixin):
1160
1480
  """
1161
1481
  A collection of PDF pages with cross-page operations.
1162
1482
 
@@ -1221,6 +1541,7 @@ class PageCollection(Generic[P]):
1221
1541
  device: Optional[str] = None,
1222
1542
  resolution: Optional[int] = None, # DPI for rendering
1223
1543
  apply_exclusions: bool = True, # New parameter
1544
+ replace: bool = True, # Whether to replace existing OCR elements
1224
1545
  # --- Engine-Specific Options ---
1225
1546
  options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1226
1547
  ) -> "PageCollection[P]":
@@ -1240,6 +1561,8 @@ class PageCollection(Generic[P]):
1240
1561
  apply_exclusions: If True (default), render page images for OCR with
1241
1562
  excluded areas masked (whited out). If False, OCR
1242
1563
  the raw page images without masking exclusions.
1564
+ replace: If True (default), remove any existing OCR elements before
1565
+ adding new ones. If False, add new OCR elements to existing ones.
1243
1566
  options: An engine-specific options object (e.g., EasyOCROptions) or dict.
1244
1567
 
1245
1568
  Returns:
@@ -1277,45 +1600,134 @@ class PageCollection(Generic[P]):
1277
1600
  device=device,
1278
1601
  resolution=resolution,
1279
1602
  apply_exclusions=apply_exclusions, # Pass down
1603
+ replace=replace, # Pass the replace parameter
1280
1604
  options=options,
1281
1605
  )
1282
1606
  # The PDF method modifies the Page objects directly by adding elements.
1283
1607
 
1284
1608
  return self # Return self for chaining
1285
1609
 
1286
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
1610
@overload
def find(
    self,
    *,
    text: str,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional[T]: ...

@overload
def find(
    self,
    selector: str,
    *,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional[T]: ...

def find(
    self,
    selector: Optional[str] = None,
    *,
    text: Optional[str] = None,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional[T]:
    """
    Find the first element matching the selector OR text across all pages in the collection.

    Provide EITHER `selector` OR `text`, but not both. This method does not
    validate that itself: every argument is forwarded unchanged to each
    page's `find`, which is expected to perform the validation.

    Pages are queried in collection order and the search stops at the first
    page that yields a match, so later pages are not queried.

    Args:
        selector: CSS-like selector string.
        text: Text content to search for (equivalent to 'text:contains(...)').
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
        regex: Whether to use regex for text search (`selector` or `text`) (default: False).
        case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
        **kwargs: Additional filter parameters, passed through to `page.find`.

    Returns:
        First matching element, or None if no page produced a match
        (including when the collection is empty).
    """
    for page in self.pages:
        element = page.find(
            selector=selector,
            text=text,
            apply_exclusions=apply_exclusions,
            regex=regex,
            case=case,
            **kwargs,
        )
        # Compare against None explicitly: "no match" is signalled by None,
        # and a genuine match must not be discarded merely because the
        # returned object happens to evaluate as falsy.
        if element is not None:
            return element
    return None
1303
1671
 
1304
- def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
1672
+ @overload
1673
+ def find_all(
1674
+ self,
1675
+ *,
1676
+ text: str,
1677
+ apply_exclusions: bool = True,
1678
+ regex: bool = False,
1679
+ case: bool = True,
1680
+ **kwargs,
1681
+ ) -> "ElementCollection": ...
1682
+
1683
+ @overload
1684
+ def find_all(
1685
+ self,
1686
+ selector: str,
1687
+ *,
1688
+ apply_exclusions: bool = True,
1689
+ regex: bool = False,
1690
+ case: bool = True,
1691
+ **kwargs,
1692
+ ) -> "ElementCollection": ...
1693
+
1694
+ def find_all(
1695
+ self,
1696
+ selector: Optional[str] = None,
1697
+ *,
1698
+ text: Optional[str] = None,
1699
+ apply_exclusions: bool = True,
1700
+ regex: bool = False,
1701
+ case: bool = True,
1702
+ **kwargs,
1703
+ ) -> "ElementCollection":
1305
1704
  """
1306
- Find all elements matching the selector across all pages.
1705
+ Find all elements matching the selector OR text across all pages in the collection.
1706
+
1707
+ Provide EITHER `selector` OR `text`, but not both.
1307
1708
 
1308
1709
  Args:
1309
- selector: CSS-like selector string
1310
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
1311
- **kwargs: Additional filter parameters
1710
+ selector: CSS-like selector string.
1711
+ text: Text content to search for (equivalent to 'text:contains(...)').
1712
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1713
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1714
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1715
+ **kwargs: Additional filter parameters.
1312
1716
 
1313
1717
  Returns:
1314
- ElementCollection with matching elements from all pages
1718
+ ElementCollection with matching elements from all pages.
1315
1719
  """
1316
1720
  all_elements = []
1721
+ # Input validation happens within page.find_all
1317
1722
  for page in self.pages:
1318
- elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1723
+ elements = page.find_all(
1724
+ selector=selector,
1725
+ text=text,
1726
+ apply_exclusions=apply_exclusions,
1727
+ regex=regex,
1728
+ case=case,
1729
+ **kwargs,
1730
+ )
1319
1731
  if elements:
1320
1732
  all_elements.extend(elements.elements)
1321
1733
 
@@ -1324,10 +1736,12 @@ class PageCollection(Generic[P]):
1324
1736
  def correct_ocr(
1325
1737
  self,
1326
1738
  correction_callback: Callable[[Any], Optional[str]],
1739
+ max_workers: Optional[int] = None,
1327
1740
  ) -> "PageCollection[P]":
1328
1741
  """
1329
1742
  Applies corrections to OCR-generated text elements across all pages
1330
- in this collection using a user-provided callback function.
1743
+ in this collection using a user-provided callback function, executed
1744
+ in parallel if `max_workers` is specified.
1331
1745
 
1332
1746
  This method delegates to the parent PDF's `correct_ocr` method,
1333
1747
  targeting all pages within this collection.
@@ -1335,10 +1749,11 @@ class PageCollection(Generic[P]):
1335
1749
  Args:
1336
1750
  correction_callback: A function that accepts a single argument (an element
1337
1751
  object) and returns `Optional[str]` (new text or None).
1752
+ max_workers: The maximum number of worker threads to use for parallel
1753
+ correction on each page. If None, defaults are used.
1338
1754
 
1339
1755
  Returns:
1340
- A dictionary containing aggregate statistics for the process across all pages:
1341
- {'elements_checked': total_checked, 'corrections_applied': total_applied}
1756
+ Self for method chaining.
1342
1757
 
1343
1758
  Raises:
1344
1759
  RuntimeError: If the collection is empty, pages lack a parent PDF reference,
@@ -1346,17 +1761,32 @@ class PageCollection(Generic[P]):
1346
1761
  """
1347
1762
  if not self.pages:
1348
1763
  logger.warning("Cannot correct OCR for an empty PageCollection.")
1764
+ # Return self even if empty to maintain chaining consistency
1765
+ return self
1349
1766
 
1350
1767
  # Assume all pages share the same parent PDF object
1351
1768
  parent_pdf = self.pages[0]._parent
1769
+ if (
1770
+ not parent_pdf
1771
+ or not hasattr(parent_pdf, "correct_ocr")
1772
+ or not callable(parent_pdf.correct_ocr)
1773
+ ):
1774
+ raise RuntimeError(
1775
+ "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
1776
+ )
1352
1777
 
1353
1778
  page_indices = [p.index for p in self.pages]
1354
1779
  logger.info(
1355
- f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
1780
+ f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
1356
1781
  )
1357
1782
 
1358
1783
  # Delegate the call to the parent PDF object for the relevant pages
1359
- parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
1784
+ # Pass the max_workers parameter down
1785
+ parent_pdf.correct_ocr(
1786
+ correction_callback=correction_callback,
1787
+ pages=page_indices,
1788
+ max_workers=max_workers, # Pass it here
1789
+ )
1360
1790
 
1361
1791
  return self
1362
1792
 
@@ -1660,3 +2090,176 @@ class PageCollection(Generic[P]):
1660
2090
  sections.append(region)
1661
2091
 
1662
2092
  return sections
2093
+
2094
def _gather_analysis_data(
    self,
    analysis_keys: List[str],
    include_content: bool,
    include_images: bool,
    image_dir: Optional[Path],
    image_format: str,
    image_resolution: int,
) -> List[Dict[str, Any]]:
    """
    Gather analysis data from all pages in the collection.

    One dictionary is produced per page. It always holds the page number,
    index, width and height; PDF path/filename, extracted text, a saved
    image path and flattened analysis results are added when available or
    requested.

    Args:
        analysis_keys: Keys in each page's `analyses` dictionary to export.
        include_content: Whether to include extracted text under "content".
        include_images: Whether to save a page image and record its path.
        image_dir: Directory to save images into. Only used when
            `include_images` is True.
        image_format: File extension used for saved images.
        image_resolution: Resolution passed to `page.save_image`.

    Returns:
        List of dictionaries containing analysis data, in page order.
        Empty if the collection has no pages.

    Raises:
        KeyError: If a page has a non-empty `analyses` mapping that lacks
            one of `analysis_keys`. Pages with no (or empty) `analyses`
            are exported without analysis columns instead of raising.
    """
    # The pages of a PageCollection live on `self.pages` (as used by every
    # other method of this class), not on an `elements` attribute.
    pages = self.pages
    if not pages:
        logger.warning("No pages found in collection")
        return []

    all_data: List[Dict[str, Any]] = []

    for page in pages:
        # Basic page information
        page_data: Dict[str, Any] = {
            "page_number": page.number,
            "page_index": page.index,
            "width": page.width,
            "height": page.height,
        }

        # Add PDF information if available
        if hasattr(page, "pdf") and page.pdf:
            page_data["pdf_path"] = page.pdf.path
            page_data["pdf_filename"] = Path(page.pdf.path).name

        # Include extracted text if requested (best effort: a failure is
        # logged and the content falls back to an empty string)
        if include_content:
            try:
                page_data["content"] = page.extract_text(preserve_whitespace=True)
            except Exception as e:
                logger.error(f"Error extracting text from page {page.number}: {e}")
                page_data["content"] = ""

        # Save image if requested (best effort: any failure, including a
        # missing `image_dir`, is logged and recorded as image_path=None)
        if include_images:
            try:
                pdf_name = "unknown"
                if hasattr(page, "pdf") and page.pdf:
                    pdf_name = Path(page.pdf.path).stem

                image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
                image_path = image_dir / image_filename

                page.save_image(
                    str(image_path), resolution=image_resolution, include_highlights=True
                )

                # Store the path relative to the image directory's parent
                page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
            except Exception as e:
                logger.error(f"Error saving image for page {page.number}: {e}")
                page_data["image_path"] = None

        # Add analyses data
        if hasattr(page, "analyses") and page.analyses:
            for key in analysis_keys:
                if key not in page.analyses:
                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")

                analysis_result = page.analyses[key]

                if hasattr(analysis_result, "to_dict"):
                    # Prefer the result's own serialisation
                    analysis_data = analysis_result.to_dict()
                else:
                    try:
                        # Accept anything dict() can consume directly
                        analysis_data = dict(analysis_result)
                    except (TypeError, ValueError):
                        # Last resort: keep a string rendering of the result
                        analysis_data = {"raw_result": str(analysis_result)}

                # Flatten into the page row using "<key>.<field>" column names
                for k, v in analysis_data.items():
                    page_data[f"{key}.{k}"] = v

        all_data.append(page_data)

    return all_data
2194
+
2195
+ # --- Deskew Method --- #
2196
+
2197
def deskew(
    self,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """
    Creates a new, in-memory PDF object containing deskewed versions of the pages
    in this collection.

    This method delegates the actual processing to the parent PDF object's
    `deskew` method, passing the indices of the pages in this collection.

    Important: The returned PDF is image-based. Any existing text, OCR results,
    annotations, or other elements from the original pages will *not* be carried over.

    Args:
        resolution: DPI resolution for rendering the output deskewed pages.
        detection_resolution: DPI resolution used for skew detection if angles are not
                              already cached on the page objects.
        force_overwrite: If False (default), raises a ValueError if any target page
                         already contains processed elements (text, OCR, regions) to
                         prevent accidental data loss. Set to True to proceed anyway.
        **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
                         during automatic detection (e.g., `max_angle`, `num_peaks`).

    Returns:
        A new PDF object representing the deskewed document (whatever the
        parent's `deskew` returns).

    Raises:
        ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
        ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
                    or if the collection is empty.
        RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
    """
    if not self.pages:
        logger.warning("Cannot deskew an empty PageCollection.")
        raise ValueError("Cannot deskew an empty PageCollection.")

    # Assume all pages share the same parent PDF object, so the first page's
    # parent is used for the whole collection. `getattr` with a default keeps
    # the documented contract: a page without a `_parent` attribute leads to
    # RuntimeError below rather than a bare AttributeError. The string
    # annotation is only a hint for type checkers and is not evaluated.
    parent_pdf: "natural_pdf.core.pdf.PDF" = getattr(self.pages[0], "_parent", None)

    if not parent_pdf or not callable(getattr(parent_pdf, "deskew", None)):
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
        )

    # Get the 0-based indices of the pages in this collection
    page_indices = [p.index for p in self.pages]
    logger.info(
        f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
    )

    # Delegate the call to the parent PDF object for the relevant pages
    return parent_pdf.deskew(
        pages=page_indices,
        resolution=resolution,
        detection_resolution=detection_resolution,
        force_overwrite=force_overwrite,
        **deskew_kwargs,
    )
2264
+
2265
+ # --- End Deskew Method --- #