natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
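The headline additions in 0.1.9 are a docTR-based OCR engine (natural_pdf/ocr/engine_doctr.py), an export mixin (natural_pdf/export/mixin.py), a pdfplumber-based table finder, and substantial reworks of the core page, PDF, and collection classes; the docs/, notebook, and sample-PDF trees were dropped from the wheel. A minimal sketch of trying the new OCR engine, assuming it plugs into the existing engine-selection parameter of apply_ocr; the engine name "doctr" is an assumption, not confirmed by this diff:

```python
# Sketch only: assumes natural-pdf 0.1.9 with docTR installed, and that
# engine_doctr.py registers under the engine name "doctr" (assumption).
from natural_pdf import PDF

pdf = PDF("pdfs/needs-ocr.pdf")          # sample file from the repo (removed from the wheel)
pdf.pages[0].apply_ocr(engine="doctr")   # hypothetical engine name
print(pdf.pages[0].extract_text()[:200])
```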
The hunks below detail natural_pdf/elements/collections.py (+503 -131). Lines the diff viewer truncated are left truncated.

```diff
@@ -1,32 +1,41 @@
 import logging
+from collections.abc import MutableSequence
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Dict,
     Generic,
+    Iterable,
     Iterator,
     List,
     Optional,
+    Sequence,
     Tuple,
+    Type,
     TypeVar,
     Union,
-
+    overload,
 )

 from pdfplumber.utils.geometry import objects_to_bbox
-from tqdm.auto import tqdm

 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from tqdm.auto import tqdm

+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.base import Element
+from natural_pdf.elements.region import Region
 from natural_pdf.elements.text import TextElement
+from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
-from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
-from natural_pdf.
-from natural_pdf.classification.manager import ClassificationManager
-from natural_pdf.collections.mixins import ApplyMixin
+from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

 logger = logging.getLogger(__name__)

@@ -38,7 +47,9 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")


-class ElementCollection(Generic[T], ApplyMixin):
+class ElementCollection(
+    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+):
     """
     Collection of PDF elements with batch operations.
     """
@@ -60,10 +71,6 @@ class ElementCollection(Generic[T], ApplyMixin):
         """Get an element by index."""
         return self._elements[index]

-    def __iter__(self):
-        """Iterate over elements."""
-        return iter(self._elements)
-
     def __repr__(self) -> str:
         """Return a string representation showing the element count."""
         element_type = "Mixed"
@@ -73,6 +80,20 @@ class ElementCollection(Generic[T], ApplyMixin):
             element_type = types.pop()
         return f"<ElementCollection[{element_type}](count={len(self)})>"

+    def __add__(self, other: "ElementCollection") -> "ElementCollection":
+        if not isinstance(other, ElementCollection):
+            return NotImplemented
+        return ElementCollection(self._elements + other._elements)
+
+    def __setitem__(self, index, value):
+        self._elements[index] = value
+
+    def __delitem__(self, index):
+        del self._elements[index]
+
+    def insert(self, index, value):
+        self._elements.insert(index, value)
+
     @property
     def elements(self) -> List["Element"]:
         """Get the elements in this collection."""
```
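Dropping the hand-written `__iter__` and adding `__setitem__`, `__delitem__`, and `insert` alongside the existing `__getitem__`/`__len__` completes the `MutableSequence` protocol, so the collection now inherits `append`, `extend`, `pop`, `reverse`, iteration, and the rest from the ABC, while `__add__` supports concatenation. A sketch of the resulting list-like behavior (path and selector are illustrative):

```python
from natural_pdf import PDF

pdf = PDF("pdfs/01-practice.pdf")   # illustrative path
words = pdf.pages[0].find_all("text")

combined = words + words            # __add__ builds a new ElementCollection
combined.insert(0, words[0])        # explicit insert from the hunk above
combined[0] = words[-1]             # __setitem__
del combined[0]                     # __delitem__
combined.reverse()                  # mixin method inherited from MutableSequence
```

Note that `remove()` is already taken: as a later hunk shows, the collection's own `remove()` deletes elements from their pages via the element manager, shadowing the `MutableSequence.remove` mixin.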
```diff
@@ -125,9 +146,7 @@ class ElementCollection(Generic[T], ApplyMixin):

         # Check if any element is from a different PDF
         return any(
-            hasattr(e, "page") and
-            hasattr(e.page, "pdf") and
-            e.page.pdf is not first_pdf
+            hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
             for e in self._elements
         )

@@ -1113,62 +1132,23 @@ class ElementCollection(Generic[T], ApplyMixin):
             logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
             return None

-    def
-        self, selector: str, regex: bool = False, case: bool = True, **kwargs
-    ) -> "ElementCollection[T]":
+    def find(self, selector: str, **kwargs) -> "ElementCollection":
         """
-
+        Find elements in this collection matching the selector.

         Args:
-            selector: CSS-like selector string
-
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            A new ElementCollection containing only the matching elements from this collection.
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
         """
-
-        return ElementCollection([])
+        return self.apply(lambda element: element.find(selector, **kwargs))

-
-            selector_obj = parse_selector(selector)
-        except Exception as e:
-            logger.error(f"Error parsing selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on parse error
-
-        # Pass regex and case flags to selector function generator
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        try:
-            filter_func = selector_to_filter_func(selector_obj, **kwargs)
-        except Exception as e:
-            logger.error(f"Error creating filter function for selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on filter creation error
-
-        matching_elements = [element for element in self._elements if filter_func(element)]
-
-        # Note: Unlike Page.find_all, this doesn't re-sort.
-        # Sorting should be done explicitly on the collection if needed.
-
-        return ElementCollection(matching_elements)
-
-    def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+    def extract_each_text(self, **kwargs) -> List[str]:
         """
-
-
-        Args:
-            selector: CSS-like selector string.
-            regex: Whether to use regex for text search in :contains (default: False).
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            The first matching element or None.
+        Extract text from each element in this region.
         """
-
-
+        return self.apply(
+            lambda element: element.extract_text(**kwargs) if element is not None else None
+        )

     def correct_ocr(
         self,
```
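This hunk changes the contract of `ElementCollection.find`: instead of filtering the collection itself with `parse_selector`/`selector_to_filter_func`, it now maps `element.find(...)` over the members via `ApplyMixin.apply`, returning one result per member; the new `extract_each_text` does the same for `extract_text`. A sketch of the new per-element semantics (path, `analyze_layout`, and selectors are illustrative):

```python
from natural_pdf import PDF

pdf = PDF("pdfs/01-practice.pdf")   # illustrative path
page = pdf.pages[0]

page.analyze_layout()                # populate detected layout regions (illustrative)
regions = page.find_all("region")

# Per-member find: each region contributes region.find("text") to the result,
# rather than the 0.1.8 behavior of filtering `regions` down to a subset.
first_texts = regions.find("text")

# Map extract_text over every member; the lambda keeps None entries None-safe.
texts = regions.extract_each_text()
```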
```diff
@@ -1214,23 +1194,23 @@ class ElementCollection(Generic[T], ApplyMixin):
     def remove(self) -> int:
         """
         Remove all elements in this collection from their respective pages.
-
+
         This method removes elements from the page's _element_mgr storage.
         It's particularly useful for removing OCR elements before applying new OCR.
-
+
         Returns:
             int: Number of elements successfully removed
         """
         if not self._elements:
             return 0
-
+
         removed_count = 0
-
+
         for element in self._elements:
             # Each element should have a reference to its page
             if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
                 element_mgr = element.page._element_mgr
-
+
                 # Determine element type
                 element_type = getattr(element, "object_type", None)
                 if element_type:
@@ -1243,7 +1223,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                     element_type = "rects"
                 elif element_type == "line":
                     element_type = "lines"
-
+
                 # Try to remove from the element manager
                 if hasattr(element_mgr, "remove_element"):
                     success = element_mgr.remove_element(element, element_type)
@@ -1253,7 +1233,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                     logger.warning("ElementManager does not have remove_element method")
             else:
                 logger.warning(f"Element has no page or page has no _element_mgr: {element}")
-
+
         return removed_count

     # --- Classification Method --- #
@@ -1263,12 +1243,12 @@ class ElementCollection(Generic[T], ApplyMixin):
         model: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
-        analysis_key: str =
+        analysis_key: str = "classification",
         multi_label: bool = False,
         batch_size: int = 8,
         max_workers: Optional[int] = None,
         progress_bar: bool = True,
-        **kwargs
+        **kwargs,
     ):
         """Classifies all elements in the collection in batch.

@@ -1292,21 +1272,21 @@ class ElementCollection(Generic[T], ApplyMixin):
         # Requires access to the PDF's manager. Assume first element has it.
         first_element = self.elements[0]
         manager_source = None
-        if hasattr(first_element,
-
-        elif hasattr(first_element,
-
-
-        if not manager_source or not hasattr(manager_source,
-
+        if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+            manager_source = first_element.page.pdf
+        elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+            manager_source = first_element.pdf
+
+        if not manager_source or not hasattr(manager_source, "get_manager"):
+            raise RuntimeError("Cannot access ClassificationManager via elements.")

         try:
-            manager = manager_source.get_manager(
+            manager = manager_source.get_manager("classification")
         except Exception as e:
-
+            raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e

         if not manager or not manager.is_available():
-
+            raise RuntimeError("ClassificationManager is not available.")

         # Determine engine type early for content gathering
         inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
```
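With the manager lookup rewritten above, failures now surface as explicit `RuntimeError`s rather than the previous (truncated) handling. A sketch of driving the batch classifier whose keyword signature appears in this hunk; the method name `classify_all` and the leading labels argument are assumptions, since neither is visible here:

```python
from natural_pdf import PDF

pdf = PDF("pdfs/01-practice.pdf")        # illustrative path
elements = pdf.pages[0].find_all("text")

# Hypothetical entry point matching the keyword parameters shown above.
elements.classify_all(
    ["header", "body", "footnote"],      # assumed labels argument
    analysis_key="classification",       # default shown in the hunk
    batch_size=8,
    progress_bar=True,
)

# Per the assignment loop later in this diff, results land in element.analyses.
print(elements[0].analyses["classification"])
```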
```diff
@@ -1314,26 +1294,34 @@ class ElementCollection(Generic[T], ApplyMixin):
         # Gather content from all elements
         items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
         original_elements: List[Any] = []
-        logger.info(
+        logger.info(
+            f"Gathering content for {len(self.elements)} elements for batch classification..."
+        )
         for element in self.elements:
-
-
-
-
-
-
-
-
-
-
-
-
+            if not isinstance(element, ClassificationMixin):
+                logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                continue
+            try:
+                # Delegate content fetching to the element itself
+                content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                items_to_classify.append(content)
+                original_elements.append(element)
+            except (ValueError, NotImplementedError) as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Error getting classification content - {e}"
+                )

         if not items_to_classify:
-
-
+            logger.warning("No content could be gathered from elements for batch classification.")
+            return self

-        logger.info(
+        logger.info(
+            f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+        )

         # Call manager's batch classify
         batch_results: List[ClassificationResult] = manager.classify_batch(
@@ -1345,29 +1333,148 @@ class ElementCollection(Generic[T], ApplyMixin):
             multi_label=multi_label,
             batch_size=batch_size,
             progress_bar=progress_bar,
-            **kwargs
+            **kwargs,
         )

         # Assign results back to elements
         if len(batch_results) != len(original_elements):
-
-
-
-
-
+            logger.error(
+                f"Batch classification result count ({len(batch_results)}) mismatch "
+                f"with elements processed ({len(original_elements)}). Cannot assign results."
+            )
+            # Decide how to handle mismatch - maybe store errors?
         else:
-
-
-
-
-
-
-
-
+            logger.info(
+                f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+            )
+            for element, result_obj in zip(original_elements, batch_results):
+                try:
+                    if not hasattr(element, "analyses") or element.analyses is None:
+                        element.analyses = {}
+                    element.analyses[analysis_key] = result_obj
+                except Exception as e:
+                    logger.warning(f"Failed to store classification result for {element!r}: {e}")

         return self
+
     # --- End Classification Method --- #

+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all elements in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No elements found in collection")
+            return []
+
+        all_data = []
+
+        for i, element in enumerate(self.elements):
+            # Base element information
+            element_data = {
+                "element_index": i,
+                "element_type": getattr(element, "type", type(element).__name__),
+            }
+
+            # Add geometry if available
+            for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                if hasattr(element, attr):
+                    element_data[attr] = getattr(element, attr)
+
+            # Add page information if available
+            if hasattr(element, "page"):
+                page = element.page
+                if page:
+                    element_data["page_number"] = getattr(page, "number", None)
+                    element_data["pdf_path"] = (
+                        getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                    )
+
+            # Include extracted text if requested
+            if include_content and hasattr(element, "extract_text"):
+                try:
+                    element_data["content"] = element.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from element {i}: {e}")
+                    element_data["content"] = ""
+
+            # Save image if requested
+            if include_images and hasattr(element, "to_image"):
+                try:
+                    # Create identifier for the element
+                    pdf_name = "unknown"
+                    page_num = "unknown"
+
+                    if hasattr(element, "page") and element.page:
+                        page_num = element.page.number
+                        if hasattr(element.page, "pdf") and element.page.pdf:
+                            pdf_name = Path(element.page.pdf.path).stem
+
+                    # Create image filename
+                    element_type = element_data.get("element_type", "element").lower()
+                    image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    element.to_image(
+                        path=str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for element {i}: {e}")
+                    element_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(element, "analyses"):
+                for key in analysis_keys:
+                    if key not in element.analyses:
+                        # Skip this key if it doesn't exist - elements might have different analyses
+                        logger.warning(f"Analysis key '{key}' not found in element {i}")
+                        continue
+
+                    # Get the analysis result
+                    analysis_result = element.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to element data with the key as prefix
+                    for k, v in analysis_data.items():
+                        element_data[f"{key}.{k}"] = v
+
+            all_data.append(element_data)
+
+        return all_data
+

 class PageCollection(Generic[P], ApplyMixin):
     """
```
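The new `_gather_analysis_data` turns each element into one flat dictionary: index and type, geometry, page and PDF provenance, optional `content` and `image_path`, plus every requested analysis expanded under dotted keys. The dotted-key flattening is the part worth internalizing, since it determines the column names the new `ExportMixin` (natural_pdf/export/mixin.py) presumably emits. A standalone sketch of just that convention, independent of the library:

```python
# Standalone illustration of the dotted-key flattening used in the hunk above.
element_data = {"element_index": 0, "element_type": "word", "x0": 72.0, "top": 90.5}
analyses = {"classification": {"label": "header", "score": 0.93}}

for key, analysis_data in analyses.items():
    for k, v in analysis_data.items():
        element_data[f"{key}.{k}"] = v

# -> {'element_index': 0, 'element_type': 'word', 'x0': 72.0, 'top': 90.5,
#     'classification.label': 'header', 'classification.score': 0.93}
print(element_data)
```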
```diff
@@ -1500,39 +1607,127 @@ class PageCollection(Generic[P], ApplyMixin):

         return self  # Return self for chaining

-
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]:
         """
-        Find the first element matching the selector across all pages.
+        Find the first element matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            First matching element or None
+            First matching element or None.
         """
+        # Input validation happens within page.find
         for page in self.pages:
-            element = page.find(
+            element = page.find(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if element:
                 return element
         return None

-
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all pages.
+        Find all elements matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.

         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.

         Returns:
-            ElementCollection with matching elements from all pages
+            ElementCollection with matching elements from all pages.
         """
         all_elements = []
+        # Input validation happens within page.find_all
         for page in self.pages:
-            elements = page.find_all(
+            elements = page.find_all(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if elements:
                 all_elements.extend(elements.elements)

```
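`PageCollection.find`/`find_all` now mirror the page-level API: a positional `selector` or a keyword-only `text` shortcut (equivalent to `'text:contains(...)'`), with `regex` and `case` threaded through to each page. A usage sketch (path and selector values are illustrative):

```python
from natural_pdf import PDF

pdf = PDF("pdfs/2019 Statistics.pdf")    # illustrative path

# Selector form: first large text element anywhere in the collection.
heading = pdf.pages.find("text[size>=14]")

# Keyword-only text shortcut, case-insensitive.
totals = pdf.pages.find_all(text="total", case=False)

# Regex applies to the text search in either form.
dates = pdf.pages.find_all(text=r"\d{4}-\d{2}-\d{2}", regex=True)
```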
```diff
@@ -1571,10 +1766,14 @@ class PageCollection(Generic[P], ApplyMixin):

         # Assume all pages share the same parent PDF object
         parent_pdf = self.pages[0]._parent
-        if
-
-
-
+        if (
+            not parent_pdf
+            or not hasattr(parent_pdf, "correct_ocr")
+            or not callable(parent_pdf.correct_ocr)
+        ):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+            )

         page_indices = [p.index for p in self.pages]
         logger.info(
@@ -1586,7 +1785,7 @@ class PageCollection(Generic[P], ApplyMixin):
         parent_pdf.correct_ocr(
             correction_callback=correction_callback,
             pages=page_indices,
-            max_workers=max_workers
+            max_workers=max_workers,  # Pass it here
         )

         return self
```
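`correct_ocr` on a `PageCollection` now fails fast with a `RuntimeError` when the parent PDF reference is missing, and forwards `max_workers` to `PDF.correct_ocr`. A sketch, assuming the correction callback receives one OCR element and returns replacement text or `None` (the precise callback contract lives in `PDF.correct_ocr`, outside this diff):

```python
from natural_pdf import PDF

pdf = PDF("pdfs/needs-ocr.pdf")          # illustrative path
pdf.apply_ocr()                          # create OCR elements first (illustrative)

def fix_zeroes(element):
    # Assumed contract: return corrected text, or None to keep the element as-is.
    text = element.extract_text()
    return text.replace("0ffice", "Office") if "0ffice" in text else None

# Delegates to the parent PDF's correct_ocr for exactly these page indices.
pdf.pages.correct_ocr(correction_callback=fix_zeroes, max_workers=4)
```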
```diff
@@ -1891,3 +2090,176 @@ class PageCollection(Generic[P], ApplyMixin):
             sections.append(region)

         return sections
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No pages found in collection")
+            return []
+
+        all_data = []
+
+        for page in self.elements:
+            # Basic page information
+            page_data = {
+                "page_number": page.number,
+                "page_index": page.index,
+                "width": page.width,
+                "height": page.height,
+            }
+
+            # Add PDF information if available
+            if hasattr(page, "pdf") and page.pdf:
+                page_data["pdf_path"] = page.pdf.path
+                page_data["pdf_filename"] = Path(page.pdf.path).name
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    pdf_name = "unknown"
+                    if hasattr(page, "pdf") and page.pdf:
+                        pdf_name = Path(page.pdf.path).stem
+
+                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(page, "analyses") and page.analyses:
+                for key in analysis_keys:
+                    if key not in page.analyses:
+                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                    # Get the analysis result
+                    analysis_result = page.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to page data with the key as prefix
+                    for k, v in analysis_data.items():
+                        page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":  # Changed return type
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the pages
+        in this collection.
+
+        This method delegates the actual processing to the parent PDF object's
+        `deskew` method.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                already contains processed elements (text, OCR, regions) to
+                prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                or if the collection is empty.
+            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+        """
+        if not self.pages:
+            logger.warning("Cannot deskew an empty PageCollection.")
+            raise ValueError("Cannot deskew an empty PageCollection.")
+
+        # Assume all pages share the same parent PDF object
+        # Need to hint the type of _parent for type checkers
+        if TYPE_CHECKING:
+            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+        else:
+            parent_pdf = self.pages[0]._parent
+
+        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+            )
+
+        # Get the 0-based indices of the pages in this collection
+        page_indices = [p.index for p in self.pages]
+        logger.info(
+            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+        )
+
+        # Delegate the call to the parent PDF object for the relevant pages
+        # Pass all relevant arguments through (no output_path anymore)
+        return parent_pdf.deskew(
+            pages=page_indices,
+            resolution=resolution,
+            detection_resolution=detection_resolution,
+            force_overwrite=force_overwrite,
+            **deskew_kwargs,
+        )
+
+    # --- End Deskew Method --- #
```