natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/collections.py
@@ -1,32 +1,42 @@
  import logging
+ from collections.abc import MutableSequence
+ from pathlib import Path
  from typing import (
      TYPE_CHECKING,
      Any,
      Callable,
      Dict,
      Generic,
+     Iterable,
      Iterator,
      List,
      Optional,
+     Sequence,
      Tuple,
+     Type,
      TypeVar,
      Union,
-     Iterable,
+     overload,
  )

  from pdfplumber.utils.geometry import objects_to_bbox
- from tqdm.auto import tqdm
+ from PIL import Image, ImageDraw, ImageFont

  # New Imports
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+ from tqdm.auto import tqdm

+ from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.classification.mixin import ClassificationMixin
+ from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+ from natural_pdf.core.pdf import PDF
+ from natural_pdf.elements.base import Element
+ from natural_pdf.elements.region import Region
  from natural_pdf.elements.text import TextElement
+ from natural_pdf.export.mixin import ExportMixin
  from natural_pdf.ocr import OCROptions
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
- from natural_pdf.classification.mixin import ClassificationMixin
- from natural_pdf.classification.manager import ClassificationManager
- from natural_pdf.collections.mixins import ApplyMixin
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

  logger = logging.getLogger(__name__)

@@ -38,7 +48,9 @@ T = TypeVar("T")
  P = TypeVar("P", bound="Page")


- class ElementCollection(Generic[T], ApplyMixin):
+ class ElementCollection(
+     Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+ ):
      """
      Collection of PDF elements with batch operations.
      """
@@ -60,10 +72,6 @@ class ElementCollection(Generic[T], ApplyMixin):
          """Get an element by index."""
          return self._elements[index]

-     def __iter__(self):
-         """Iterate over elements."""
-         return iter(self._elements)
-
      def __repr__(self) -> str:
          """Return a string representation showing the element count."""
          element_type = "Mixed"
@@ -73,6 +81,20 @@ class ElementCollection(Generic[T], ApplyMixin):
                  element_type = types.pop()
          return f"<ElementCollection[{element_type}](count={len(self)})>"

+     def __add__(self, other: "ElementCollection") -> "ElementCollection":
+         if not isinstance(other, ElementCollection):
+             return NotImplemented
+         return ElementCollection(self._elements + other._elements)
+
+     def __setitem__(self, index, value):
+         self._elements[index] = value
+
+     def __delitem__(self, index):
+         del self._elements[index]
+
+     def insert(self, index, value):
+         self._elements.insert(index, value)
+
      @property
      def elements(self) -> List["Element"]:
          """Get the elements in this collection."""
@@ -125,9 +147,7 @@ class ElementCollection(Generic[T], ApplyMixin):

          # Check if any element is from a different PDF
          return any(
-             hasattr(e, "page") and
-             hasattr(e.page, "pdf") and
-             e.page.pdf is not first_pdf
+             hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
              for e in self._elements
          )

@@ -1113,62 +1133,23 @@ class ElementCollection(Generic[T], ApplyMixin):
              logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
              return None

-     def find_all(
-         self, selector: str, regex: bool = False, case: bool = True, **kwargs
-     ) -> "ElementCollection[T]":
+     def find(self, selector: str, **kwargs) -> "ElementCollection":
          """
-         Filter elements within this collection matching the selector.
+         Find elements in this collection matching the selector.

          Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             A new ElementCollection containing only the matching elements from this collection.
+             selector: CSS-like selector string
+             apply_exclusions: Whether to exclude elements in exclusion regions
          """
-         if not self._elements:
-             return ElementCollection([])
+         return self.apply(lambda element: element.find(selector, **kwargs))

-         try:
-             selector_obj = parse_selector(selector)
-         except Exception as e:
-             logger.error(f"Error parsing selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on parse error
-
-         # Pass regex and case flags to selector function generator
-         kwargs["regex"] = regex
-         kwargs["case"] = case
-
-         try:
-             filter_func = selector_to_filter_func(selector_obj, **kwargs)
-         except Exception as e:
-             logger.error(f"Error creating filter function for selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on filter creation error
-
-         matching_elements = [element for element in self._elements if filter_func(element)]
-
-         # Note: Unlike Page.find_all, this doesn't re-sort.
-         # Sorting should be done explicitly on the collection if needed.
-
-         return ElementCollection(matching_elements)
-
-     def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+     def extract_each_text(self, **kwargs) -> List[str]:
          """
-         Find the first element within this collection matching the selector.
-
-         Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             The first matching element or None.
+         Extract text from each element in this region.
          """
-         results = self.find_all(selector, regex=regex, case=case, **kwargs)
-         return results.first
+         return self.apply(
+             lambda element: element.extract_text(**kwargs) if element is not None else None
+         )

      def correct_ocr(
          self,
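Behavioral note on the hunk above: `ElementCollection.find` no longer filters the collection itself (that was the removed `find_all` body); it now maps `.find(...)` over each member via `ApplyMixin.apply`, one result per member, and `extract_each_text` does the same for text. A sketch, assuming the members are `Region` objects that implement `find`/`extract_text` (file name illustrative):

from natural_pdf import PDF

pdf = PDF("example.pdf")             # hypothetical input file
page = pdf.pages[0]
page.analyze_layout()                # populate detected layout regions
regions = page.find_all("region")    # collection whose members are Regions

first_texts = regions.find("text")           # one first-match (or None) per region
texts = regions.extract_each_text()          # one string (or None) per region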
@@ -1214,23 +1195,23 @@ class ElementCollection(Generic[T], ApplyMixin):
      def remove(self) -> int:
          """
          Remove all elements in this collection from their respective pages.
-
+
          This method removes elements from the page's _element_mgr storage.
          It's particularly useful for removing OCR elements before applying new OCR.
-
+
          Returns:
              int: Number of elements successfully removed
          """
          if not self._elements:
              return 0
-
+
          removed_count = 0
-
+
          for element in self._elements:
              # Each element should have a reference to its page
              if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
                  element_mgr = element.page._element_mgr
-
+
                  # Determine element type
                  element_type = getattr(element, "object_type", None)
                  if element_type:
@@ -1243,7 +1224,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                          element_type = "rects"
                      elif element_type == "line":
                          element_type = "lines"
-
+
                  # Try to remove from the element manager
                  if hasattr(element_mgr, "remove_element"):
                      success = element_mgr.remove_element(element, element_type)
@@ -1253,27 +1234,27 @@ class ElementCollection(Generic[T], ApplyMixin):
                      logger.warning("ElementManager does not have remove_element method")
              else:
                  logger.warning(f"Element has no page or page has no _element_mgr: {element}")
-
+
          return removed_count

      # --- Classification Method --- #
      def classify_all(
          self,
-         categories: List[str],
+         labels: List[str],
          model: Optional[str] = None,
          using: Optional[str] = None,
          min_confidence: float = 0.0,
-         analysis_key: str = 'classification',
+         analysis_key: str = "classification",
          multi_label: bool = False,
          batch_size: int = 8,
          max_workers: Optional[int] = None,
          progress_bar: bool = True,
-         **kwargs
+         **kwargs,
      ):
          """Classifies all elements in the collection in batch.

          Args:
-             categories: List of category labels.
+             labels: List of category labels.
              model: Model ID (or alias 'text', 'vision').
              using: Optional processing mode ('text' or 'vision'). Inferred if None.
              min_confidence: Minimum confidence threshold.
@@ -1292,21 +1273,21 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Requires access to the PDF's manager. Assume first element has it.
          first_element = self.elements[0]
          manager_source = None
-         if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
-             manager_source = first_element.page.pdf
-         elif hasattr(first_element, 'pdf'):  # Maybe it's a PageCollection?
-             manager_source = first_element.pdf
-
-         if not manager_source or not hasattr(manager_source, 'get_manager'):
-             raise RuntimeError("Cannot access ClassificationManager via elements.")
+         if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+             manager_source = first_element.page.pdf
+         elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+             manager_source = first_element.pdf
+
+         if not manager_source or not hasattr(manager_source, "get_manager"):
+             raise RuntimeError("Cannot access ClassificationManager via elements.")

          try:
-             manager = manager_source.get_manager('classification')
+             manager = manager_source.get_manager("classification")
          except Exception as e:
-             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e

          if not manager or not manager.is_available():
-             raise RuntimeError("ClassificationManager is not available.")
+             raise RuntimeError("ClassificationManager is not available.")

          # Determine engine type early for content gathering
          inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
@@ -1314,60 +1295,187 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Gather content from all elements
          items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
          original_elements: List[Any] = []
-         logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
+         logger.info(
+             f"Gathering content for {len(self.elements)} elements for batch classification..."
+         )
          for element in self.elements:
-             if not isinstance(element, ClassificationMixin):
-                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
-                 continue
-             try:
-                 # Delegate content fetching to the element itself
-                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
-                 items_to_classify.append(content)
-                 original_elements.append(element)
-             except (ValueError, NotImplementedError) as e:
-                 logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
-             except Exception as e:
-                 logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
+             if not isinstance(element, ClassificationMixin):
+                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                 continue
+             try:
+                 # Delegate content fetching to the element itself
+                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                 items_to_classify.append(content)
+                 original_elements.append(element)
+             except (ValueError, NotImplementedError) as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Error getting classification content - {e}"
+                 )

          if not items_to_classify:
-             logger.warning("No content could be gathered from elements for batch classification.")
-             return self
+             logger.warning("No content could be gathered from elements for batch classification.")
+             return self

-         logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
+         logger.info(
+             f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+         )

          # Call manager's batch classify
          batch_results: List[ClassificationResult] = manager.classify_batch(
              item_contents=items_to_classify,
-             categories=categories,
+             labels=labels,
              model_id=model,
              using=inferred_using,
              min_confidence=min_confidence,
              multi_label=multi_label,
              batch_size=batch_size,
              progress_bar=progress_bar,
-             **kwargs
+             **kwargs,
          )

          # Assign results back to elements
          if len(batch_results) != len(original_elements):
-             logger.error(
-                 f"Batch classification result count ({len(batch_results)}) mismatch "
-                 f"with elements processed ({len(original_elements)}). Cannot assign results."
-             )
-             # Decide how to handle mismatch - maybe store errors?
+             logger.error(
+                 f"Batch classification result count ({len(batch_results)}) mismatch "
+                 f"with elements processed ({len(original_elements)}). Cannot assign results."
+             )
+             # Decide how to handle mismatch - maybe store errors?
          else:
-             logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
-             for element, result_obj in zip(original_elements, batch_results):
-                 try:
-                     if not hasattr(element, 'analyses') or element.analyses is None:
-                         element.analyses = {}
-                     element.analyses[analysis_key] = result_obj
-                 except Exception as e:
-                     logger.warning(f"Failed to store classification result for {element!r}: {e}")
+             logger.info(
+                 f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+             )
+             for element, result_obj in zip(original_elements, batch_results):
+                 try:
+                     if not hasattr(element, "analyses") or element.analyses is None:
+                         element.analyses = {}
+                     element.analyses[analysis_key] = result_obj
+                 except Exception as e:
+                     logger.warning(f"Failed to store classification result for {element!r}: {e}")

          return self
+
      # --- End Classification Method --- #

+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all elements in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No elements found in collection")
+             return []
+
+         all_data = []
+
+         for i, element in enumerate(self.elements):
+             # Base element information
+             element_data = {
+                 "element_index": i,
+                 "element_type": getattr(element, "type", type(element).__name__),
+             }
+
+             # Add geometry if available
+             for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                 if hasattr(element, attr):
+                     element_data[attr] = getattr(element, attr)
+
+             # Add page information if available
+             if hasattr(element, "page"):
+                 page = element.page
+                 if page:
+                     element_data["page_number"] = getattr(page, "number", None)
+                     element_data["pdf_path"] = (
+                         getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                     )
+
+             # Include extracted text if requested
+             if include_content and hasattr(element, "extract_text"):
+                 try:
+                     element_data["content"] = element.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from element {i}: {e}")
+                     element_data["content"] = ""
+
+             # Save image if requested
+             if include_images and hasattr(element, "to_image"):
+                 try:
+                     # Create identifier for the element
+                     pdf_name = "unknown"
+                     page_num = "unknown"
+
+                     if hasattr(element, "page") and element.page:
+                         page_num = element.page.number
+                         if hasattr(element.page, "pdf") and element.page.pdf:
+                             pdf_name = Path(element.page.pdf.path).stem
+
+                     # Create image filename
+                     element_type = element_data.get("element_type", "element").lower()
+                     image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     element.to_image(
+                         path=str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for element {i}: {e}")
+                     element_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(element, "analyses"):
+                 for key in analysis_keys:
+                     if key not in element.analyses:
+                         # Skip this key if it doesn't exist - elements might have different analyses
+                         logger.warning(f"Analysis key '{key}' not found in element {i}")
+                         continue
+
+                     # Get the analysis result
+                     analysis_result = element.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to element data with the key as prefix
+                     for k, v in analysis_data.items():
+                         element_data[f"{key}.{k}"] = v
+
+             all_data.append(element_data)
+
+         return all_data
+

  class PageCollection(Generic[P], ApplyMixin):
      """
@@ -1500,39 +1608,127 @@ class PageCollection(Generic[P], ApplyMixin):

          return self  # Return self for chaining

-     def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
+     @overload
+     def find(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     @overload
+     def find(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     def find(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]:
          """
-         Find the first element matching the selector across all pages.
+         Find the first element matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             First matching element or None
+             First matching element or None.
          """
+         # Input validation happens within page.find
          for page in self.pages:
-             element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
+             element = page.find(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if element:
                  return element
          return None

-     def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
+     @overload
+     def find_all(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     @overload
+     def find_all(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     def find_all(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection":
          """
-         Find all elements matching the selector across all pages.
+         Find all elements matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             ElementCollection with matching elements from all pages
+             ElementCollection with matching elements from all pages.
          """
          all_elements = []
+         # Input validation happens within page.find_all
          for page in self.pages:
-             elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
+             elements = page.find_all(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if elements:
                  all_elements.extend(elements.elements)

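The overloads above encode that `selector` and `text` are mutually exclusive entry points: `text=...` is sugar for a `text:contains(...)` selector, and `regex`/`case` apply to either path. Both call styles, sketched (file name and search strings illustrative):

from natural_pdf import PDF

pages = PDF("example.pdf").pages    # a PageCollection

# Selector path, as before:
totals = pages.find_all('text:contains("Total")', case=False)
# New text= shorthand:
first = pages.find(text="Total", regex=False, case=False)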
@@ -1571,10 +1767,14 @@ class PageCollection(Generic[P], ApplyMixin):

          # Assume all pages share the same parent PDF object
          parent_pdf = self.pages[0]._parent
-         if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
-             raise RuntimeError(
-                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
-             )
+         if (
+             not parent_pdf
+             or not hasattr(parent_pdf, "correct_ocr")
+             or not callable(parent_pdf.correct_ocr)
+         ):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+             )

          page_indices = [p.index for p in self.pages]
          logger.info(
@@ -1586,7 +1786,7 @@ class PageCollection(Generic[P], ApplyMixin):
          parent_pdf.correct_ocr(
              correction_callback=correction_callback,
              pages=page_indices,
-             max_workers=max_workers  # Pass it here
+             max_workers=max_workers,  # Pass it here
          )

          return self
@@ -1891,3 +2091,279 @@ class PageCollection(Generic[P], ApplyMixin):
              sections.append(region)

          return sections
+
+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all pages in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No pages found in collection")
+             return []
+
+         all_data = []
+
+         for page in self.elements:
+             # Basic page information
+             page_data = {
+                 "page_number": page.number,
+                 "page_index": page.index,
+                 "width": page.width,
+                 "height": page.height,
+             }
+
+             # Add PDF information if available
+             if hasattr(page, "pdf") and page.pdf:
+                 page_data["pdf_path"] = page.pdf.path
+                 page_data["pdf_filename"] = Path(page.pdf.path).name
+
+             # Include extracted text if requested
+             if include_content:
+                 try:
+                     page_data["content"] = page.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from page {page.number}: {e}")
+                     page_data["content"] = ""
+
+             # Save image if requested
+             if include_images:
+                 try:
+                     # Create image filename
+                     pdf_name = "unknown"
+                     if hasattr(page, "pdf") and page.pdf:
+                         pdf_name = Path(page.pdf.path).stem
+
+                     image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     page.save_image(
+                         str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for page {page.number}: {e}")
+                     page_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(page, "analyses") and page.analyses:
+                 for key in analysis_keys:
+                     if key not in page.analyses:
+                         raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                     # Get the analysis result
+                     analysis_result = page.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to page data with the key as prefix
+                     for k, v in analysis_data.items():
+                         page_data[f"{key}.{k}"] = v
+
+             all_data.append(page_data)
+
+         return all_data
+
+     # --- Deskew Method --- #
+
+     def deskew(
+         self,
+         resolution: int = 300,
+         detection_resolution: int = 72,
+         force_overwrite: bool = False,
+         **deskew_kwargs,
+     ) -> "PDF":  # Changed return type
+         """
+         Creates a new, in-memory PDF object containing deskewed versions of the pages
+         in this collection.
+
+         This method delegates the actual processing to the parent PDF object's
+         `deskew` method.
+
+         Important: The returned PDF is image-based. Any existing text, OCR results,
+         annotations, or other elements from the original pages will *not* be carried over.
+
+         Args:
+             resolution: DPI resolution for rendering the output deskewed pages.
+             detection_resolution: DPI resolution used for skew detection if angles are not
+                 already cached on the page objects.
+             force_overwrite: If False (default), raises a ValueError if any target page
+                 already contains processed elements (text, OCR, regions) to
+                 prevent accidental data loss. Set to True to proceed anyway.
+             **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                 during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+         Returns:
+             A new PDF object representing the deskewed document.
+
+         Raises:
+             ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+             ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                 or if the collection is empty.
+             RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+         """
+         if not self.pages:
+             logger.warning("Cannot deskew an empty PageCollection.")
+             raise ValueError("Cannot deskew an empty PageCollection.")
+
+         # Assume all pages share the same parent PDF object
+         # Need to hint the type of _parent for type checkers
+         if TYPE_CHECKING:
+             parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+         else:
+             parent_pdf = self.pages[0]._parent
+
+         if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+             )
+
+         # Get the 0-based indices of the pages in this collection
+         page_indices = [p.index for p in self.pages]
+         logger.info(
+             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+         )
+
+         # Delegate the call to the parent PDF object for the relevant pages
+         # Pass all relevant arguments through (no output_path anymore)
+         return parent_pdf.deskew(
+             pages=page_indices,
+             resolution=resolution,
+             detection_resolution=detection_resolution,
+             force_overwrite=force_overwrite,
+             **deskew_kwargs,
+         )
+
+     # --- End Deskew Method --- #
+
+     def to_image(
+         self,
+         page_width: int = 300,
+         cols: Optional[int] = 4,
+         rows: Optional[int] = None,
+         max_pages: Optional[int] = None,
+         spacing: int = 10,
+         add_labels: bool = True,
+         show_category: bool = False,  # Add new flag
+     ) -> Optional["Image.Image"]:
+         """
+         Generate a grid of page images for this collection.
+
+         Args:
+             page_width: Width in pixels for rendering individual pages
+             cols: Number of columns in grid (default: 4)
+             rows: Number of rows in grid (calculated automatically if None)
+             max_pages: Maximum number of pages to include (default: all)
+             spacing: Spacing between page thumbnails in pixels
+             add_labels: Whether to add page number labels
+             show_category: Whether to add category and confidence labels (if available)
+
+         Returns:
+             PIL Image of the page grid or None if no pages
+         """
+         if not self.pages:
+             logger.warning("Cannot generate image for empty PageCollection")
+             return None
+
+         # Limit pages if max_pages is specified
+         pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+         # Load font once outside the loop
+         font = ImageFont.load_default(16) if add_labels else None
+
+         # Render individual page images
+         page_images = []
+         for page in pages_to_render:
+             img = page.to_image(width=page_width)
+
+             # Add page number label
+             if add_labels and font:  # Check if font was loaded
+                 draw = ImageDraw.Draw(img)
+                 pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                 label_text = f"p{page.number} - {pdf_name}"
+
+                 # Add category if requested and available
+                 if show_category:
+                     category = getattr(page, "category", None)
+                     confidence = getattr(page, "category_confidence", None)
+                     if category is not None and confidence is not None:
+                         category_str = f"{category} {confidence:.3f}"
+                         label_text += f"\n{category_str}"
+
+                 # Calculate bounding box for multi-line text
+                 # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                 # Use multiline_textbbox for accurate bounds with newlines
+                 bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                 # Add padding to the calculated bbox for the white background
+                 bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                 # Draw white background rectangle
+                 draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                 # Draw the potentially multi-line text using multiline_text
+                 draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+             page_images.append(img)
+
+         # Calculate grid dimensions if not provided
+         if not rows and not cols:
+             # Default to a square-ish grid
+             cols = min(4, int(len(page_images) ** 0.5) + 1)
+             rows = (len(page_images) + cols - 1) // cols
+         elif rows and not cols:
+             cols = (len(page_images) + rows - 1) // rows
+         elif cols and not rows:
+             rows = (len(page_images) + cols - 1) // cols
+
+         # Get maximum dimensions for consistent grid cells
+         max_width = max(img.width for img in page_images)
+         max_height = max(img.height for img in page_images)
+
+         # Create grid image
+         grid_width = cols * max_width + (cols + 1) * spacing
+         grid_height = rows * max_height + (rows + 1) * spacing
+         grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+         # Place images in grid
+         for i, img in enumerate(page_images):
+             if i >= rows * cols:
+                 break
+
+             row = i // cols
+             col = i % cols
+
+             x = col * max_width + (col + 1) * spacing
+             y = row * max_height + (row + 1) * spacing
+
+             grid_img.paste(img, (x, y))
+
+         return grid_img
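To close out the new `PageCollection` surface: `deskew` delegates to `PDF.deskew` and returns an image-only PDF, while `to_image` composes a PIL thumbnail grid locally. A final sketch, assuming a skewed scan and the optional `deskew`/`img2pdf` dependencies (file names illustrative):

from natural_pdf import PDF

pdf = PDF("scanned.pdf")                        # hypothetical skewed scan
subset = pdf.pages[:4]                          # slicing yields a PageCollection

grid = subset.to_image(page_width=200, cols=2)  # PIL grid, or None if empty
if grid:
    grid.save("pages_grid.png")

straightened = subset.deskew(resolution=300)    # new in-memory, image-based PDF
# The result carries no text elements; re-run OCR before extracting text.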