natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/collections.py +712 -109

```diff
@@ -1,27 +1,41 @@
 import logging
+from collections.abc import MutableSequence
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Dict,
     Generic,
+    Iterable,
     Iterator,
     List,
     Optional,
+    Sequence,
     Tuple,
+    Type,
     TypeVar,
     Union,
+    overload,
 )
 
 from pdfplumber.utils.geometry import objects_to_bbox
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
-
-
+from tqdm.auto import tqdm
+
+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.base import Element
+from natural_pdf.elements.region import Region
+from natural_pdf.elements.text import TextElement
+from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import the new utility
 
 logger = logging.getLogger(__name__)
 
@@ -33,7 +47,9 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")
 
 
-class ElementCollection(Generic[T]):
+class ElementCollection(
+    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+):
     """
     Collection of PDF elements with batch operations.
     """
@@ -55,10 +71,6 @@ class ElementCollection(Generic[T]):
         """Get an element by index."""
         return self._elements[index]
 
-    def __iter__(self):
-        """Iterate over elements."""
-        return iter(self._elements)
-
     def __repr__(self) -> str:
         """Return a string representation showing the element count."""
         element_type = "Mixed"
@@ -68,6 +80,20 @@ class ElementCollection(Generic[T]):
             element_type = types.pop()
         return f"<ElementCollection[{element_type}](count={len(self)})>"
 
+    def __add__(self, other: "ElementCollection") -> "ElementCollection":
+        if not isinstance(other, ElementCollection):
+            return NotImplemented
+        return ElementCollection(self._elements + other._elements)
+
+    def __setitem__(self, index, value):
+        self._elements[index] = value
+
+    def __delitem__(self, index):
+        del self._elements[index]
+
+    def insert(self, index, value):
+        self._elements.insert(index, value)
+
     @property
     def elements(self) -> List["Element"]:
         """Get the elements in this collection."""
```
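With `MutableSequence` added to the base classes, the removed `__iter__` is no longer needed (iteration, `index`, `count`, and the other sequence mixin methods come from the ABC via the existing `__getitem__`/`__len__`), and the new `__add__`/`__setitem__`/`__delitem__`/`insert` round out list-like behavior. A minimal sketch of the resulting semantics; the file path and selector are hypothetical, not taken from this release:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 is installed

page = PDF("example.pdf").pages[0]   # hypothetical file
words = page.find_all("text")        # an ElementCollection

combined = words + words             # __add__ concatenates into a new collection
combined.insert(0, words[0])         # MutableSequence-style insert
combined[1] = words[-1]              # __setitem__ replaces an element in place
del combined[0]                      # __delitem__ removes by index

for element in combined:             # iteration is now supplied by MutableSequence
    print(element)
```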
```diff
@@ -83,12 +109,53 @@ class ElementCollection(Generic[T]):
         """Get the last element in the collection."""
         return self._elements[-1] if self._elements else None
 
+    def _are_on_multiple_pages(self) -> bool:
+        """
+        Check if elements in this collection span multiple pages.
+
+        Returns:
+            True if elements are on different pages, False otherwise
+        """
+        if not self._elements:
+            return False
+
+        # Get the page index of the first element
+        if not hasattr(self._elements[0], "page"):
+            return False
+
+        first_page_idx = self._elements[0].page.index
+
+        # Check if any element is on a different page
+        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
+
+    def _are_on_multiple_pdfs(self) -> bool:
+        """
+        Check if elements in this collection span multiple PDFs.
+
+        Returns:
+            True if elements are from different PDFs, False otherwise
+        """
+        if not self._elements:
+            return False
+
+        # Get the PDF of the first element
+        if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
+            return False
+
+        first_pdf = self._elements[0].page.pdf
+
+        # Check if any element is from a different PDF
+        return any(
+            hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
+            for e in self._elements
+        )
+
     def highest(self) -> Optional["Element"]:
         """
         Get element with the smallest top y-coordinate (highest on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with smallest top value or None if empty
@@ -96,7 +163,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine highest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine highest element across multiple pages")
 
@@ -107,7 +176,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest bottom y-coordinate (lowest on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with largest bottom value or None if empty
@@ -115,7 +184,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine lowest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine lowest element across multiple pages")
 
@@ -126,7 +197,7 @@ class ElementCollection(Generic[T]):
         Get element with the smallest x0 coordinate (leftmost on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with smallest x0 value or None if empty
@@ -134,7 +205,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine leftmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine leftmost element across multiple pages")
 
@@ -145,7 +218,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest x1 coordinate (rightmost on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with largest x1 value or None if empty
@@ -153,31 +226,14 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine rightmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine rightmost element across multiple pages")
 
         return max(self._elements, key=lambda e: e.x1)
 
-    def _are_on_multiple_pages(self) -> bool:
-        """
-        Check if elements in this collection span multiple pages.
-
-        Returns:
-            True if elements are on different pages, False otherwise
-        """
-        if not self._elements:
-            return False
-
-        # Get the page index of the first element
-        if not hasattr(self._elements[0], "page"):
-            return False
-
-        first_page_idx = self._elements[0].page.index
-
-        # Check if any element is on a different page
-        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
-
     def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
         """
         Remove elements that are within any of the specified regions.
```
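The directional helpers (`highest`, `lowest`, `leftmost`, `rightmost`) now refuse to compare coordinates across documents as well as across pages. A sketch of the new guard, with hypothetical file names:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

words_a = PDF("a.pdf").pages[0].find_all("text")  # hypothetical files
words_b = PDF("b.pdf").pages[0].find_all("text")

print(words_a.highest())   # fine: one page of one PDF

mixed = words_a + words_b  # cross-PDF collection via the new __add__
try:
    mixed.rightmost()
except ValueError as err:
    print(err)  # "Cannot determine rightmost element across multiple PDFs"
```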
```diff
@@ -359,6 +415,9 @@ class ElementCollection(Generic[T]):
 
         Uses grouping logic based on parameters (defaulting to grouping by type).
 
+        Note: Elements must be from the same PDF for this operation to work properly,
+        as each PDF has its own highlighting service.
+
         Args:
             label: Optional explicit label for the entire collection. If provided,
                 all elements are highlighted as a single group with this label,
@@ -389,8 +448,12 @@ class ElementCollection(Generic[T]):
             AttributeError: If 'group_by' is provided but the attribute doesn't exist
                 on some elements.
             ValueError: If 'label_format' is provided but contains invalid keys for
-                element attributes.
+                element attributes, or if elements span multiple PDFs.
         """
+        # Check if elements span multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("highlight() does not support elements from multiple PDFs")
+
         # 1. Prepare the highlight data based on parameters
         highlight_data_list = self._prepare_highlight_data(
             distinct=distinct,
@@ -761,7 +824,8 @@ class ElementCollection(Generic[T]):
         Generates a temporary preview image highlighting elements in this collection
         on their page, ignoring any persistent highlights.
 
-        Currently only supports collections where all elements are on the same page.
+        Currently only supports collections where all elements are on the same page
+        of the same PDF.
 
         Allows grouping and coloring elements based on attributes, similar to the
         persistent `highlight()` method, but only for this temporary view.
@@ -780,14 +844,20 @@ class ElementCollection(Generic[T]):
 
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
-            elements span multiple pages.
+            elements span multiple pages/PDFs.
 
         Raises:
-            ValueError: If the collection is empty or elements are on different pages.
+            ValueError: If the collection is empty or elements are on different pages/PDFs.
         """
         if not self._elements:
             raise ValueError("Cannot show an empty collection.")
 
+        # Check if elements are on multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError(
+                "show() currently only supports collections where all elements are from the same PDF."
+            )
+
         # Check if elements are on multiple pages
         if self._are_on_multiple_pages():
             raise ValueError(
@@ -1062,70 +1132,33 @@ class ElementCollection(Generic[T]):
             logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
             return None
 
-    def find_all(
-        self, selector: str, regex: bool = False, case: bool = True, **kwargs
-    ) -> "ElementCollection[T]":
+    def find(self, selector: str, **kwargs) -> "ElementCollection":
         """
-
+        Find elements in this collection matching the selector.
 
         Args:
-            selector: CSS-like selector string
-
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            A new ElementCollection containing only the matching elements from this collection.
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
         """
-
-        return ElementCollection([])
-
-        try:
-            selector_obj = parse_selector(selector)
-        except Exception as e:
-            logger.error(f"Error parsing selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on parse error
-
-        # Pass regex and case flags to selector function generator
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        try:
-            filter_func = selector_to_filter_func(selector_obj, **kwargs)
-        except Exception as e:
-            logger.error(f"Error creating filter function for selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on filter creation error
-
-        matching_elements = [element for element in self._elements if filter_func(element)]
+        return self.apply(lambda element: element.find(selector, **kwargs))
 
-
-        # Sorting should be done explicitly on the collection if needed.
-
-        return ElementCollection(matching_elements)
-
-    def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+    def extract_each_text(self, **kwargs) -> List[str]:
         """
-
-
-        Args:
-            selector: CSS-like selector string.
-            regex: Whether to use regex for text search in :contains (default: False).
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            The first matching element or None.
+        Extract text from each element in this region.
         """
-
-
+        return self.apply(
+            lambda element: element.extract_text(**kwargs) if element is not None else None
+        )
 
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
         Applies corrections to OCR-generated text elements within this collection
-        using a user-provided callback function
+        using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
 
         Iterates through elements currently in the collection. If an element's
         'source' attribute starts with 'ocr', it calls the `correction_callback`
```
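Note the semantic change above: `find` on a collection no longer filters the collection itself; it now maps each member's own `find` via `ApplyMixin.apply`, producing one result per element, and `extract_each_text` follows the same per-element pattern. A sketch; the file and selectors are illustrative:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

page = PDF("report.pdf").pages[0]              # hypothetical file
regions = page.find_all("region[type=table]")  # illustrative selector

# One result per region: each region's own .find() is called
headers = regions.find("text[size>=10]")

# One string (or None) per region
texts = regions.extract_each_text()
for header, text in zip(headers, texts):
    print(header, len(text) if text else 0)
```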
```diff
@@ -1143,6 +1176,8 @@ class ElementCollection(Generic[T]):
         Args:
             correction_callback: A function accepting an element and returning
                 `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                correction on each page. If None, defaults are used.
 
         Returns:
             Self for method chaining.
@@ -1152,11 +1187,296 @@ class ElementCollection(Generic[T]):
             elements=self._elements,
             correction_callback=correction_callback,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
+            max_workers=max_workers,
         )
         return self  # Return self for chaining
 
+    def remove(self) -> int:
+        """
+        Remove all elements in this collection from their respective pages.
+
+        This method removes elements from the page's _element_mgr storage.
+        It's particularly useful for removing OCR elements before applying new OCR.
+
+        Returns:
+            int: Number of elements successfully removed
+        """
+        if not self._elements:
+            return 0
+
+        removed_count = 0
+
+        for element in self._elements:
+            # Each element should have a reference to its page
+            if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
+                element_mgr = element.page._element_mgr
+
+                # Determine element type
+                element_type = getattr(element, "object_type", None)
+                if element_type:
+                    # Convert to plural form expected by element_mgr
+                    if element_type == "word":
+                        element_type = "words"
+                    elif element_type == "char":
+                        element_type = "chars"
+                    elif element_type == "rect":
+                        element_type = "rects"
+                    elif element_type == "line":
+                        element_type = "lines"
+
+                # Try to remove from the element manager
+                if hasattr(element_mgr, "remove_element"):
+                    success = element_mgr.remove_element(element, element_type)
+                    if success:
+                        removed_count += 1
+                else:
+                    logger.warning("ElementManager does not have remove_element method")
+            else:
+                logger.warning(f"Element has no page or page has no _element_mgr: {element}")
+
+        return removed_count
+
+    # --- Classification Method --- #
+    def classify_all(
+        self,
+        categories: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
+        min_confidence: float = 0.0,
+        analysis_key: str = "classification",
+        multi_label: bool = False,
+        batch_size: int = 8,
+        max_workers: Optional[int] = None,
+        progress_bar: bool = True,
+        **kwargs,
+    ):
+        """Classifies all elements in the collection in batch.
+
+        Args:
+            categories: List of category labels.
+            model: Model ID (or alias 'text', 'vision').
+            using: Optional processing mode ('text' or 'vision'). Inferred if None.
+            min_confidence: Minimum confidence threshold.
+            analysis_key: Key for storing results in element.analyses.
+            multi_label: Allow multiple labels per item.
+            batch_size: Size of batches passed to the inference pipeline.
+            max_workers: (Not currently used for classification batching which is
+                handled by the underlying pipeline).
+            progress_bar: Display a progress bar.
+            **kwargs: Additional arguments for the ClassificationManager.
+        """
+        if not self.elements:
+            logger.info("ElementCollection is empty, skipping classification.")
+            return self
+
+        # Requires access to the PDF's manager. Assume first element has it.
+        first_element = self.elements[0]
+        manager_source = None
+        if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+            manager_source = first_element.page.pdf
+        elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+            manager_source = first_element.pdf
+
+        if not manager_source or not hasattr(manager_source, "get_manager"):
+            raise RuntimeError("Cannot access ClassificationManager via elements.")
+
+        try:
+            manager = manager_source.get_manager("classification")
+        except Exception as e:
+            raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+
+        if not manager or not manager.is_available():
+            raise RuntimeError("ClassificationManager is not available.")
+
+        # Determine engine type early for content gathering
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+
+        # Gather content from all elements
+        items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
+        original_elements: List[Any] = []
+        logger.info(
+            f"Gathering content for {len(self.elements)} elements for batch classification..."
+        )
+        for element in self.elements:
+            if not isinstance(element, ClassificationMixin):
+                logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                continue
+            try:
+                # Delegate content fetching to the element itself
+                content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                items_to_classify.append(content)
+                original_elements.append(element)
+            except (ValueError, NotImplementedError) as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Error getting classification content - {e}"
+                )
+
+        if not items_to_classify:
+            logger.warning("No content could be gathered from elements for batch classification.")
+            return self
+
+        logger.info(
+            f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+        )
+
+        # Call manager's batch classify
+        batch_results: List[ClassificationResult] = manager.classify_batch(
+            item_contents=items_to_classify,
+            categories=categories,
+            model_id=model,
+            using=inferred_using,
+            min_confidence=min_confidence,
+            multi_label=multi_label,
+            batch_size=batch_size,
+            progress_bar=progress_bar,
+            **kwargs,
+        )
+
+        # Assign results back to elements
+        if len(batch_results) != len(original_elements):
+            logger.error(
+                f"Batch classification result count ({len(batch_results)}) mismatch "
+                f"with elements processed ({len(original_elements)}). Cannot assign results."
+            )
+            # Decide how to handle mismatch - maybe store errors?
+        else:
+            logger.info(
+                f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+            )
+            for element, result_obj in zip(original_elements, batch_results):
+                try:
+                    if not hasattr(element, "analyses") or element.analyses is None:
+                        element.analyses = {}
+                    element.analyses[analysis_key] = result_obj
+                except Exception as e:
+                    logger.warning(f"Failed to store classification result for {element!r}: {e}")
+
+        return self
+
+    # --- End Classification Method --- #
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all elements in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No elements found in collection")
+            return []
+
+        all_data = []
+
+        for i, element in enumerate(self.elements):
+            # Base element information
+            element_data = {
+                "element_index": i,
+                "element_type": getattr(element, "type", type(element).__name__),
+            }
+
+            # Add geometry if available
+            for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                if hasattr(element, attr):
+                    element_data[attr] = getattr(element, attr)
+
+            # Add page information if available
+            if hasattr(element, "page"):
+                page = element.page
+                if page:
+                    element_data["page_number"] = getattr(page, "number", None)
+                    element_data["pdf_path"] = (
+                        getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                    )
+
+            # Include extracted text if requested
+            if include_content and hasattr(element, "extract_text"):
+                try:
+                    element_data["content"] = element.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from element {i}: {e}")
+                    element_data["content"] = ""
 
-class PageCollection(Generic[P]):
+            # Save image if requested
+            if include_images and hasattr(element, "to_image"):
+                try:
+                    # Create identifier for the element
+                    pdf_name = "unknown"
+                    page_num = "unknown"
+
+                    if hasattr(element, "page") and element.page:
+                        page_num = element.page.number
+                        if hasattr(element.page, "pdf") and element.page.pdf:
+                            pdf_name = Path(element.page.pdf.path).stem
+
+                    # Create image filename
+                    element_type = element_data.get("element_type", "element").lower()
+                    image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    element.to_image(
+                        path=str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for element {i}: {e}")
+                    element_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(element, "analyses"):
+                for key in analysis_keys:
+                    if key not in element.analyses:
+                        # Skip this key if it doesn't exist - elements might have different analyses
+                        logger.warning(f"Analysis key '{key}' not found in element {i}")
+                        continue
+
+                    # Get the analysis result
+                    analysis_result = element.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to element data with the key as prefix
+                    for k, v in analysis_data.items():
+                        element_data[f"{key}.{k}"] = v
+
+            all_data.append(element_data)
+
+        return all_data
+
+
+class PageCollection(Generic[P], ApplyMixin):
     """
     A collection of PDF pages with cross-page operations.
 
```
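A usage sketch for the new batch classification; the file, categories, and mode are illustrative, and a classification backend must be installed. `classify_all` resolves the `ClassificationManager` through the first element's parent PDF, so the elements should come from a single document:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 with classification support

pdf = PDF("filings.pdf")                   # hypothetical file
regions = pdf.pages[0].find_all("region")  # elements using ClassificationMixin

regions.classify_all(
    categories=["table", "chart", "narrative"],  # illustrative labels
    using="vision",            # or "text"; inferred from the model if None
    analysis_key="doc_type",   # results land in element.analyses["doc_type"]
    batch_size=8,
    progress_bar=True,
)

for region in regions:
    print(region, region.analyses.get("doc_type"))
```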
```diff
@@ -1221,6 +1541,7 @@ class PageCollection(Generic[P]):
         device: Optional[str] = None,
         resolution: Optional[int] = None,  # DPI for rendering
         apply_exclusions: bool = True,  # New parameter
+        replace: bool = True,  # Whether to replace existing OCR elements
         # --- Engine-Specific Options ---
         options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
     ) -> "PageCollection[P]":
@@ -1240,6 +1561,8 @@ class PageCollection(Generic[P]):
             apply_exclusions: If True (default), render page images for OCR with
                               excluded areas masked (whited out). If False, OCR
                               the raw page images without masking exclusions.
+            replace: If True (default), remove any existing OCR elements before
+                     adding new ones. If False, add new OCR elements to existing ones.
             options: An engine-specific options object (e.g., EasyOCROptions) or dict.
 
         Returns:
@@ -1277,45 +1600,134 @@ class PageCollection(Generic[P]):
             device=device,
             resolution=resolution,
             apply_exclusions=apply_exclusions,  # Pass down
+            replace=replace,  # Pass the replace parameter
             options=options,
         )
         # The PDF method modifies the Page objects directly by adding elements.
 
         return self  # Return self for chaining
 
-
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]:
         """
-        Find the first element matching the selector across all pages.
+        Find the first element matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.
 
         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
 
         Returns:
-            First matching element or None
+            First matching element or None.
         """
+        # Input validation happens within page.find
         for page in self.pages:
-            element = page.find(selector, **kwargs)
+            element = page.find(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if element:
                 return element
         return None
 
-
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all pages.
+        Find all elements matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.
 
         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
 
         Returns:
-            ElementCollection with matching elements from all pages
+            ElementCollection with matching elements from all pages.
         """
         all_elements = []
+        # Input validation happens within page.find_all
         for page in self.pages:
-            elements = page.find_all(selector, **kwargs)
+            elements = page.find_all(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
            if elements:
                 all_elements.extend(elements.elements)
 
```
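Two user-facing changes in this span: `apply_ocr` gains a `replace` flag (by default, prior OCR elements are removed before new ones are added, instead of accumulating duplicates), and `find`/`find_all` on a `PageCollection` gain a keyword-only `text` shortcut plus `regex`/`case` flags. A sketch; the file name is hypothetical and parameters outside these hunks (such as the OCR `engine` argument) are assumed from the wider API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

pages = PDF("scanned.pdf").pages  # a PageCollection, hypothetical file

# Re-running OCR no longer stacks duplicate word elements
pages.apply_ocr(engine="easyocr", resolution=300, replace=True)

# New keyword-only text search, equivalent to 'text:contains(...)'
first_total = pages.find(text="Total", case=False)

# Selector form still works; selector and text are mutually exclusive
all_totals = pages.find_all("text:contains(Total)")
print(first_total, len(all_totals))
```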
```diff
@@ -1324,10 +1736,12 @@ class PageCollection(Generic[P]):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
         Applies corrections to OCR-generated text elements across all pages
-        in this collection using a user-provided callback function
+        in this collection using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
 
         This method delegates to the parent PDF's `correct_ocr` method,
         targeting all pages within this collection.
@@ -1335,10 +1749,11 @@ class PageCollection(Generic[P]):
         Args:
             correction_callback: A function that accepts a single argument (an element
                                  object) and returns `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                         correction on each page. If None, defaults are used.
 
         Returns:
-
-            {'elements_checked': total_checked, 'corrections_applied': total_applied}
+            Self for method chaining.
 
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
@@ -1346,17 +1761,32 @@ class PageCollection(Generic[P]):
         """
         if not self.pages:
             logger.warning("Cannot correct OCR for an empty PageCollection.")
+            # Return self even if empty to maintain chaining consistency
+            return self
 
         # Assume all pages share the same parent PDF object
         parent_pdf = self.pages[0]._parent
+        if (
+            not parent_pdf
+            or not hasattr(parent_pdf, "correct_ocr")
+            or not callable(parent_pdf.correct_ocr)
+        ):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+            )
 
         page_indices = [p.index for p in self.pages]
         logger.info(
-            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
+            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
         )
 
         # Delegate the call to the parent PDF object for the relevant pages
-
+        # Pass the max_workers parameter down
+        parent_pdf.correct_ocr(
+            correction_callback=correction_callback,
+            pages=page_indices,
+            max_workers=max_workers,  # Pass it here
+        )
 
         return self
 
```
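`PageCollection.correct_ocr` now validates the parent PDF reference, passes `max_workers` down for per-page parallelism, and returns `self` (the old docstring promised a stats dict that was never part of the chaining-friendly API). A sketch with a hypothetical correction callback; the `element.text` attribute is assumed from the element API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

pages = PDF("scanned.pdf").pages  # hypothetical file with OCR elements

def fix_text(element):
    """Return corrected text, or None to leave the element unchanged."""
    text = element.text or ""
    cleaned = text.replace("|", "I")  # stand-in for a real correction model
    return cleaned if cleaned != text else None

# Applies the callback to OCR-sourced elements, parallelized per page
pages.correct_ocr(correction_callback=fix_text, max_workers=4)
```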
```diff
@@ -1660,3 +2090,176 @@ class PageCollection(Generic[P]):
             sections.append(region)
 
         return sections
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No pages found in collection")
+            return []
+
+        all_data = []
+
+        for page in self.elements:
+            # Basic page information
+            page_data = {
+                "page_number": page.number,
+                "page_index": page.index,
+                "width": page.width,
+                "height": page.height,
+            }
+
+            # Add PDF information if available
+            if hasattr(page, "pdf") and page.pdf:
+                page_data["pdf_path"] = page.pdf.path
+                page_data["pdf_filename"] = Path(page.pdf.path).name
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    pdf_name = "unknown"
+                    if hasattr(page, "pdf") and page.pdf:
+                        pdf_name = Path(page.pdf.path).stem
+
+                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(page, "analyses") and page.analyses:
+                for key in analysis_keys:
+                    if key not in page.analyses:
+                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                    # Get the analysis result
+                    analysis_result = page.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to page data with the key as prefix
+                    for k, v in analysis_data.items():
+                        page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":  # Changed return type
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the pages
+        in this collection.
+
+        This method delegates the actual processing to the parent PDF object's
+        `deskew` method.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                                  already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                             already contains processed elements (text, OCR, regions) to
+                             prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                        or if the collection is empty.
+            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+        """
+        if not self.pages:
+            logger.warning("Cannot deskew an empty PageCollection.")
+            raise ValueError("Cannot deskew an empty PageCollection.")
+
+        # Assume all pages share the same parent PDF object
+        # Need to hint the type of _parent for type checkers
+        if TYPE_CHECKING:
+            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+        else:
+            parent_pdf = self.pages[0]._parent
+
+        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+            )
+
+        # Get the 0-based indices of the pages in this collection
+        page_indices = [p.index for p in self.pages]
+        logger.info(
+            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+        )
+
+        # Delegate the call to the parent PDF object for the relevant pages
+        # Pass all relevant arguments through (no output_path anymore)
+        return parent_pdf.deskew(
+            pages=page_indices,
+            resolution=resolution,
+            detection_resolution=detection_resolution,
+            force_overwrite=force_overwrite,
+            **deskew_kwargs,
+        )
+
+    # --- End Deskew Method --- #
```
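A usage sketch for the new `deskew` delegation; the file name and `max_angle` value are illustrative, the optional `deskew`/`img2pdf` dependencies must be installed, and the follow-up `apply_ocr` call on the returned PDF is assumed from the wider API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 with deskew extras

pdf = PDF("tilted-scan.pdf")  # hypothetical file

# Returns a new, image-only PDF; text/OCR/regions are not carried over
straightened = pdf.pages.deskew(
    resolution=300,            # render DPI for the output pages
    detection_resolution=72,   # DPI used when detecting skew angles
    force_overwrite=True,      # proceed even if pages already hold elements
    max_angle=10,              # forwarded to deskew.determine_skew
)

straightened.apply_ocr(engine="easyocr")  # typical follow-up, since text was dropped
```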