natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/collections.py
@@ -1,32 +1,41 @@
  import logging
+ from collections.abc import MutableSequence
+ from pathlib import Path
  from typing import (
      TYPE_CHECKING,
      Any,
      Callable,
      Dict,
      Generic,
+     Iterable,
      Iterator,
      List,
      Optional,
+     Sequence,
      Tuple,
+     Type,
      TypeVar,
      Union,
-     Iterable,
+     overload,
  )

  from pdfplumber.utils.geometry import objects_to_bbox
- from tqdm.auto import tqdm

  # New Imports
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+ from tqdm.auto import tqdm

+ from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.classification.mixin import ClassificationMixin
+ from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+ from natural_pdf.core.pdf import PDF
+ from natural_pdf.elements.base import Element
+ from natural_pdf.elements.region import Region
  from natural_pdf.elements.text import TextElement
+ from natural_pdf.export.mixin import ExportMixin
  from natural_pdf.ocr import OCROptions
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
- from natural_pdf.classification.mixin import ClassificationMixin
- from natural_pdf.classification.manager import ClassificationManager
- from natural_pdf.collections.mixins import ApplyMixin
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

  logger = logging.getLogger(__name__)

@@ -38,7 +47,9 @@ T = TypeVar("T")
  P = TypeVar("P", bound="Page")


- class ElementCollection(Generic[T], ApplyMixin):
+ class ElementCollection(
+     Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+ ):
      """
      Collection of PDF elements with batch operations.
      """
@@ -60,10 +71,6 @@ class ElementCollection(Generic[T], ApplyMixin):
          """Get an element by index."""
          return self._elements[index]

-     def __iter__(self):
-         """Iterate over elements."""
-         return iter(self._elements)
-
      def __repr__(self) -> str:
          """Return a string representation showing the element count."""
          element_type = "Mixed"
@@ -73,6 +80,20 @@ class ElementCollection(Generic[T], ApplyMixin):
              element_type = types.pop()
          return f"<ElementCollection[{element_type}](count={len(self)})>"

+     def __add__(self, other: "ElementCollection") -> "ElementCollection":
+         if not isinstance(other, ElementCollection):
+             return NotImplemented
+         return ElementCollection(self._elements + other._elements)
+
+     def __setitem__(self, index, value):
+         self._elements[index] = value
+
+     def __delitem__(self, index):
+         del self._elements[index]
+
+     def insert(self, index, value):
+         self._elements.insert(index, value)
+
      @property
      def elements(self) -> List["Element"]:
          """Get the elements in this collection."""
@@ -125,9 +146,7 @@ class ElementCollection(Generic[T], ApplyMixin):

          # Check if any element is from a different PDF
          return any(
-             hasattr(e, "page") and
-             hasattr(e.page, "pdf") and
-             e.page.pdf is not first_pdf
+             hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
              for e in self._elements
          )

@@ -1113,62 +1132,23 @@ class ElementCollection(Generic[T], ApplyMixin):
              logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
              return None

-     def find_all(
-         self, selector: str, regex: bool = False, case: bool = True, **kwargs
-     ) -> "ElementCollection[T]":
+     def find(self, selector: str, **kwargs) -> "ElementCollection":
          """
-         Filter elements within this collection matching the selector.
+         Find elements in this collection matching the selector.

          Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             A new ElementCollection containing only the matching elements from this collection.
+             selector: CSS-like selector string
+             apply_exclusions: Whether to exclude elements in exclusion regions
          """
-         if not self._elements:
-             return ElementCollection([])
+         return self.apply(lambda element: element.find(selector, **kwargs))

-         try:
-             selector_obj = parse_selector(selector)
-         except Exception as e:
-             logger.error(f"Error parsing selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on parse error
-
-         # Pass regex and case flags to selector function generator
-         kwargs["regex"] = regex
-         kwargs["case"] = case
-
-         try:
-             filter_func = selector_to_filter_func(selector_obj, **kwargs)
-         except Exception as e:
-             logger.error(f"Error creating filter function for selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on filter creation error
-
-         matching_elements = [element for element in self._elements if filter_func(element)]
-
-         # Note: Unlike Page.find_all, this doesn't re-sort.
-         # Sorting should be done explicitly on the collection if needed.
-
-         return ElementCollection(matching_elements)
-
-     def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+     def extract_each_text(self, **kwargs) -> List[str]:
          """
-         Find the first element within this collection matching the selector.
-
-         Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             The first matching element or None.
+         Extract text from each element in this region.
          """
-         results = self.find_all(selector, regex=regex, case=case, **kwargs)
-         return results.first
+         return self.apply(
+             lambda element: element.extract_text(**kwargs) if element is not None else None
+         )

      def correct_ocr(
          self,
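
Review note: this hunk is a behavioral change, not just formatting. The 0.1.8 collection-level `find_all` (an in-collection filter) and `find` (first match) are removed; the new `find` maps `element.find` across members via `ApplyMixin.apply`, returning one result per member, and `extract_each_text` follows the same map pattern. A hedged sketch of the call shapes (selectors are illustrative examples of natural-pdf's CSS-like dialect, and `page` is assumed to be a natural-pdf Page):

```python
# Illustrative: layout regions detected on a page
regions = page.find_all("region[type=table]")

# One extracted string per region in the collection
texts = regions.extract_each_text()

# Per-member delegation: element.find(...) runs on each region and the
# per-member results are collected (non-matches can come back as None)
headers = regions.find("text:bold")
```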
@@ -1214,23 +1194,23 @@ class ElementCollection(Generic[T], ApplyMixin):
      def remove(self) -> int:
          """
          Remove all elements in this collection from their respective pages.
-
+
          This method removes elements from the page's _element_mgr storage.
          It's particularly useful for removing OCR elements before applying new OCR.
-
+
          Returns:
              int: Number of elements successfully removed
          """
          if not self._elements:
              return 0
-
+
          removed_count = 0
-
+
          for element in self._elements:
              # Each element should have a reference to its page
              if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
                  element_mgr = element.page._element_mgr
-
+
                  # Determine element type
                  element_type = getattr(element, "object_type", None)
                  if element_type:
@@ -1243,7 +1223,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                          element_type = "rects"
                      elif element_type == "line":
                          element_type = "lines"
-
+
                  # Try to remove from the element manager
                  if hasattr(element_mgr, "remove_element"):
                      success = element_mgr.remove_element(element, element_type)
@@ -1253,7 +1233,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                      logger.warning("ElementManager does not have remove_element method")
              else:
                  logger.warning(f"Element has no page or page has no _element_mgr: {element}")
-
+
          return removed_count

      # --- Classification Method --- #
@@ -1263,12 +1243,12 @@ class ElementCollection(Generic[T], ApplyMixin):
          model: Optional[str] = None,
          using: Optional[str] = None,
          min_confidence: float = 0.0,
-         analysis_key: str = 'classification',
+         analysis_key: str = "classification",
          multi_label: bool = False,
          batch_size: int = 8,
          max_workers: Optional[int] = None,
          progress_bar: bool = True,
-         **kwargs
+         **kwargs,
      ):
          """Classifies all elements in the collection in batch.

@@ -1292,21 +1272,21 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Requires access to the PDF's manager. Assume first element has it.
          first_element = self.elements[0]
          manager_source = None
-         if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
-             manager_source = first_element.page.pdf
-         elif hasattr(first_element, 'pdf'):  # Maybe it's a PageCollection?
-             manager_source = first_element.pdf
-
-         if not manager_source or not hasattr(manager_source, 'get_manager'):
-             raise RuntimeError("Cannot access ClassificationManager via elements.")
+         if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+             manager_source = first_element.page.pdf
+         elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+             manager_source = first_element.pdf
+
+         if not manager_source or not hasattr(manager_source, "get_manager"):
+             raise RuntimeError("Cannot access ClassificationManager via elements.")

          try:
-             manager = manager_source.get_manager('classification')
+             manager = manager_source.get_manager("classification")
          except Exception as e:
-             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e

          if not manager or not manager.is_available():
-             raise RuntimeError("ClassificationManager is not available.")
+             raise RuntimeError("ClassificationManager is not available.")

          # Determine engine type early for content gathering
          inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
@@ -1314,26 +1294,34 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Gather content from all elements
          items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
          original_elements: List[Any] = []
-         logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
+         logger.info(
+             f"Gathering content for {len(self.elements)} elements for batch classification..."
+         )
          for element in self.elements:
-             if not isinstance(element, ClassificationMixin):
-                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
-                 continue
-             try:
-                 # Delegate content fetching to the element itself
-                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
-                 items_to_classify.append(content)
-                 original_elements.append(element)
-             except (ValueError, NotImplementedError) as e:
-                 logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
-             except Exception as e:
-                 logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
+             if not isinstance(element, ClassificationMixin):
+                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                 continue
+             try:
+                 # Delegate content fetching to the element itself
+                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                 items_to_classify.append(content)
+                 original_elements.append(element)
+             except (ValueError, NotImplementedError) as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Error getting classification content - {e}"
+                 )

          if not items_to_classify:
-             logger.warning("No content could be gathered from elements for batch classification.")
-             return self
+             logger.warning("No content could be gathered from elements for batch classification.")
+             return self

-         logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
+         logger.info(
+             f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+         )

          # Call manager's batch classify
          batch_results: List[ClassificationResult] = manager.classify_batch(
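
Review note: the flow above is gather-then-batch — resolve the ClassificationManager through the first element's parent PDF, ask each element for its own classification content (text or image, depending on the inferred engine), make a single `classify_batch` call, then fan results back out into `element.analyses[analysis_key]` (next hunk). A hedged usage sketch; the leading labels argument is not visible in these hunks, so it is passed positionally here, and the file name, selector, and labels are illustrative:

```python
from natural_pdf import PDF  # assumes the top-level export

pdf = PDF("report.pdf")  # illustrative file
regions = pdf.pages[0].find_all("region")  # illustrative selector

regions.classify_all(
    ["table", "chart", "narrative"],  # candidate labels (assumed first parameter)
    analysis_key="classification",
    batch_size=8,
    progress_bar=False,
)

for region in regions:
    # Each element now carries a ClassificationResult under the analysis key
    print(region.analyses.get("classification"))
```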
@@ -1345,29 +1333,148 @@ class ElementCollection(Generic[T], ApplyMixin):
              multi_label=multi_label,
              batch_size=batch_size,
              progress_bar=progress_bar,
-             **kwargs
+             **kwargs,
          )

          # Assign results back to elements
          if len(batch_results) != len(original_elements):
-             logger.error(
-                 f"Batch classification result count ({len(batch_results)}) mismatch "
-                 f"with elements processed ({len(original_elements)}). Cannot assign results."
-             )
-             # Decide how to handle mismatch - maybe store errors?
+             logger.error(
+                 f"Batch classification result count ({len(batch_results)}) mismatch "
+                 f"with elements processed ({len(original_elements)}). Cannot assign results."
+             )
+             # Decide how to handle mismatch - maybe store errors?
          else:
-             logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
-             for element, result_obj in zip(original_elements, batch_results):
-                 try:
-                     if not hasattr(element, 'analyses') or element.analyses is None:
-                         element.analyses = {}
-                     element.analyses[analysis_key] = result_obj
-                 except Exception as e:
-                     logger.warning(f"Failed to store classification result for {element!r}: {e}")
+             logger.info(
+                 f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+             )
+             for element, result_obj in zip(original_elements, batch_results):
+                 try:
+                     if not hasattr(element, "analyses") or element.analyses is None:
+                         element.analyses = {}
+                     element.analyses[analysis_key] = result_obj
+                 except Exception as e:
+                     logger.warning(f"Failed to store classification result for {element!r}: {e}")

          return self
+
      # --- End Classification Method --- #

+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all elements in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No elements found in collection")
+             return []
+
+         all_data = []
+
+         for i, element in enumerate(self.elements):
+             # Base element information
+             element_data = {
+                 "element_index": i,
+                 "element_type": getattr(element, "type", type(element).__name__),
+             }
+
+             # Add geometry if available
+             for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                 if hasattr(element, attr):
+                     element_data[attr] = getattr(element, attr)
+
+             # Add page information if available
+             if hasattr(element, "page"):
+                 page = element.page
+                 if page:
+                     element_data["page_number"] = getattr(page, "number", None)
+                     element_data["pdf_path"] = (
+                         getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                     )
+
+             # Include extracted text if requested
+             if include_content and hasattr(element, "extract_text"):
+                 try:
+                     element_data["content"] = element.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from element {i}: {e}")
+                     element_data["content"] = ""
+
+             # Save image if requested
+             if include_images and hasattr(element, "to_image"):
+                 try:
+                     # Create identifier for the element
+                     pdf_name = "unknown"
+                     page_num = "unknown"
+
+                     if hasattr(element, "page") and element.page:
+                         page_num = element.page.number
+                         if hasattr(element.page, "pdf") and element.page.pdf:
+                             pdf_name = Path(element.page.pdf.path).stem
+
+                     # Create image filename
+                     element_type = element_data.get("element_type", "element").lower()
+                     image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     element.to_image(
+                         path=str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for element {i}: {e}")
+                     element_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(element, "analyses"):
+                 for key in analysis_keys:
+                     if key not in element.analyses:
+                         # Skip this key if it doesn't exist - elements might have different analyses
+                         logger.warning(f"Analysis key '{key}' not found in element {i}")
+                         continue
+
+                     # Get the analysis result
+                     analysis_result = element.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to element data with the key as prefix
+                     for k, v in analysis_data.items():
+                         element_data[f"{key}.{k}"] = v
+
+             all_data.append(element_data)
+
+         return all_data
+

  class PageCollection(Generic[P], ApplyMixin):
      """
@@ -1500,39 +1607,127 @@ class PageCollection(Generic[P], ApplyMixin):

          return self  # Return self for chaining

-     def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
+     @overload
+     def find(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     @overload
+     def find(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     def find(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]:
          """
-         Find the first element matching the selector across all pages.
+         Find the first element matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             First matching element or None
+             First matching element or None.
          """
+         # Input validation happens within page.find
          for page in self.pages:
-             element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
+             element = page.find(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if element:
                  return element
          return None

-     def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
+     @overload
+     def find_all(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     @overload
+     def find_all(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     def find_all(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection":
          """
-         Find all elements matching the selector across all pages.
+         Find all elements matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             ElementCollection with matching elements from all pages
+             ElementCollection with matching elements from all pages.
          """
          all_elements = []
+         # Input validation happens within page.find_all
          for page in self.pages:
-             elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
+             elements = page.find_all(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if elements:
                  all_elements.extend(elements.elements)

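
Review note: the `@overload` pairs encode an either/or contract at type-check time — positional `selector`, or keyword-only `text`, never both — while the actual validation is deferred to `page.find` / `page.find_all`. Illustrative calls (the specific selectors are made up; `pdf` is assumed from the sketch above):

```python
pages = pdf.pages  # a PageCollection

# Selector form (positional)
heading = pages.find("text[size>=14]:bold")

# Text form (keyword-only), equivalent to 'text:contains(...)'
total = pages.find(text="Grand Total", case=False)

# regex applies to the text search in either form
dates = pages.find_all(text=r"\d{4}-\d{2}-\d{2}", regex=True)
```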
@@ -1571,10 +1766,14 @@ class PageCollection(Generic[P], ApplyMixin):

          # Assume all pages share the same parent PDF object
          parent_pdf = self.pages[0]._parent
-         if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
-             raise RuntimeError(
-                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
-             )
+         if (
+             not parent_pdf
+             or not hasattr(parent_pdf, "correct_ocr")
+             or not callable(parent_pdf.correct_ocr)
+         ):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+             )

          page_indices = [p.index for p in self.pages]
          logger.info(
@@ -1586,7 +1785,7 @@ class PageCollection(Generic[P], ApplyMixin):
          parent_pdf.correct_ocr(
              correction_callback=correction_callback,
              pages=page_indices,
-             max_workers=max_workers  # Pass it here
+             max_workers=max_workers,  # Pass it here
          )

          return self
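
Review note: aside from the black-style reflow, the functional change in these two hunks is that `max_workers` is now forwarded to the parent PDF's `correct_ocr`. A hedged sketch of a caller; the callback contract (take an OCR element, return replacement text or None to keep it) is assumed from the `_apply_ocr_correction_to_elements` import, not confirmed by these hunks:

```python
def fix_ocr(element):
    # Assumed contract: return corrected text, or None to leave unchanged.
    text = element.extract_text()
    fixed = text.replace("l0", "10").replace("|", "I")
    return fixed if fixed != text else None

pdf.pages.correct_ocr(
    correction_callback=fix_ocr,
    max_workers=4,  # newly forwarded through to PDF.correct_ocr
)
```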
@@ -1891,3 +2090,176 @@ class PageCollection(Generic[P], ApplyMixin):
              sections.append(region)

          return sections
+
+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all pages in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No pages found in collection")
+             return []
+
+         all_data = []
+
+         for page in self.elements:
+             # Basic page information
+             page_data = {
+                 "page_number": page.number,
+                 "page_index": page.index,
+                 "width": page.width,
+                 "height": page.height,
+             }
+
+             # Add PDF information if available
+             if hasattr(page, "pdf") and page.pdf:
+                 page_data["pdf_path"] = page.pdf.path
+                 page_data["pdf_filename"] = Path(page.pdf.path).name
+
+             # Include extracted text if requested
+             if include_content:
+                 try:
+                     page_data["content"] = page.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from page {page.number}: {e}")
+                     page_data["content"] = ""
+
+             # Save image if requested
+             if include_images:
+                 try:
+                     # Create image filename
+                     pdf_name = "unknown"
+                     if hasattr(page, "pdf") and page.pdf:
+                         pdf_name = Path(page.pdf.path).stem
+
+                     image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     page.save_image(
+                         str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for page {page.number}: {e}")
+                     page_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(page, "analyses") and page.analyses:
+                 for key in analysis_keys:
+                     if key not in page.analyses:
+                         raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                     # Get the analysis result
+                     analysis_result = page.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to page data with the key as prefix
+                     for k, v in analysis_data.items():
+                         page_data[f"{key}.{k}"] = v
+
+             all_data.append(page_data)
+
+         return all_data
+
+     # --- Deskew Method --- #
+
+     def deskew(
+         self,
+         resolution: int = 300,
+         detection_resolution: int = 72,
+         force_overwrite: bool = False,
+         **deskew_kwargs,
+     ) -> "PDF":  # Changed return type
+         """
+         Creates a new, in-memory PDF object containing deskewed versions of the pages
+         in this collection.
+
+         This method delegates the actual processing to the parent PDF object's
+         `deskew` method.
+
+         Important: The returned PDF is image-based. Any existing text, OCR results,
+         annotations, or other elements from the original pages will *not* be carried over.
+
+         Args:
+             resolution: DPI resolution for rendering the output deskewed pages.
+             detection_resolution: DPI resolution used for skew detection if angles are not
+                                   already cached on the page objects.
+             force_overwrite: If False (default), raises a ValueError if any target page
+                              already contains processed elements (text, OCR, regions) to
+                              prevent accidental data loss. Set to True to proceed anyway.
+             **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                              during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+         Returns:
+             A new PDF object representing the deskewed document.
+
+         Raises:
+             ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+             ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                         or if the collection is empty.
+             RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+         """
+         if not self.pages:
+             logger.warning("Cannot deskew an empty PageCollection.")
+             raise ValueError("Cannot deskew an empty PageCollection.")
+
+         # Assume all pages share the same parent PDF object
+         # Need to hint the type of _parent for type checkers
+         if TYPE_CHECKING:
+             parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+         else:
+             parent_pdf = self.pages[0]._parent
+
+         if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+             )
+
+         # Get the 0-based indices of the pages in this collection
+         page_indices = [p.index for p in self.pages]
+         logger.info(
+             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+         )
+
+         # Delegate the call to the parent PDF object for the relevant pages
+         # Pass all relevant arguments through (no output_path anymore)
+         return parent_pdf.deskew(
+             pages=page_indices,
+             resolution=resolution,
+             detection_resolution=detection_resolution,
+             force_overwrite=force_overwrite,
+             **deskew_kwargs,
+         )
+
+     # --- End Deskew Method --- #
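
Review note: `PageCollection.deskew` is a thin delegate — it collects the 0-based page indices and hands everything to `PDF.deskew`, which returns a new, image-only PDF (no text or OCR elements survive, per the docstring). A usage sketch, assuming the optional `deskew` and `img2pdf` extras are installed and that slicing `pdf.pages` yields a PageCollection:

```python
pdf = PDF("scanned.pdf")  # illustrative file

straightened = pdf.pages[:5].deskew(
    resolution=300,
    detection_resolution=72,
    force_overwrite=True,  # bypass the element-loss guard
)

# The result is image-based, so OCR would need to be re-run on it
# (apply_ocr and the engine name are illustrative here)
straightened.apply_ocr("easyocr")
```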