natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,27 @@
|
|
1
1
|
import logging
|
2
|
-
|
3
|
-
|
2
|
+
from typing import (
|
3
|
+
TYPE_CHECKING,
|
4
|
+
Any,
|
5
|
+
Callable,
|
6
|
+
Dict,
|
7
|
+
Generic,
|
8
|
+
Iterator,
|
9
|
+
List,
|
10
|
+
Optional,
|
11
|
+
Tuple,
|
12
|
+
TypeVar,
|
13
|
+
Union,
|
14
|
+
)
|
15
|
+
|
16
|
+
from pdfplumber.utils.geometry import objects_to_bbox
|
17
|
+
|
18
|
+
# New Imports
|
19
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
20
|
+
|
21
|
+
from natural_pdf.elements.text import TextElement # Needed for isinstance check
|
4
22
|
from natural_pdf.ocr import OCROptions
|
5
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
|
6
25
|
|
7
26
|
logger = logging.getLogger(__name__)
|
8
27
|
|
@@ -10,35 +29,36 @@ if TYPE_CHECKING:
|
|
10
29
|
from natural_pdf.core.page import Page
|
11
30
|
from natural_pdf.elements.region import Region
|
12
31
|
|
13
|
-
T = TypeVar(
|
14
|
-
P = TypeVar(
|
32
|
+
T = TypeVar("T")
|
33
|
+
P = TypeVar("P", bound="Page")
|
34
|
+
|
15
35
|
|
16
36
|
class ElementCollection(Generic[T]):
|
17
37
|
"""
|
18
38
|
Collection of PDF elements with batch operations.
|
19
39
|
"""
|
20
|
-
|
40
|
+
|
21
41
|
def __init__(self, elements: List[T]):
|
22
42
|
"""
|
23
43
|
Initialize a collection of elements.
|
24
|
-
|
44
|
+
|
25
45
|
Args:
|
26
46
|
elements: List of Element objects
|
27
47
|
"""
|
28
48
|
self._elements = elements or []
|
29
|
-
|
49
|
+
|
30
50
|
def __len__(self) -> int:
|
31
51
|
"""Get the number of elements in the collection."""
|
32
52
|
return len(self._elements)
|
33
|
-
|
34
|
-
def __getitem__(self, index: int) ->
|
53
|
+
|
54
|
+
def __getitem__(self, index: int) -> "Element":
|
35
55
|
"""Get an element by index."""
|
36
56
|
return self._elements[index]
|
37
|
-
|
57
|
+
|
38
58
|
def __iter__(self):
|
39
59
|
"""Iterate over elements."""
|
40
60
|
return iter(self._elements)
|
41
|
-
|
61
|
+
|
42
62
|
def __repr__(self) -> str:
|
43
63
|
"""Return a string representation showing the element count."""
|
44
64
|
element_type = "Mixed"
|
@@ -47,130 +67,130 @@ class ElementCollection(Generic[T]):
|
|
47
67
|
if len(types) == 1:
|
48
68
|
element_type = types.pop()
|
49
69
|
return f"<ElementCollection[{element_type}](count={len(self)})>"
|
50
|
-
|
70
|
+
|
51
71
|
@property
|
52
|
-
def elements(self) -> List[
|
72
|
+
def elements(self) -> List["Element"]:
|
53
73
|
"""Get the elements in this collection."""
|
54
74
|
return self._elements
|
55
|
-
|
75
|
+
|
56
76
|
@property
|
57
|
-
def first(self) -> Optional[
|
77
|
+
def first(self) -> Optional["Element"]:
|
58
78
|
"""Get the first element in the collection."""
|
59
79
|
return self._elements[0] if self._elements else None
|
60
|
-
|
80
|
+
|
61
81
|
@property
|
62
|
-
def last(self) -> Optional[
|
82
|
+
def last(self) -> Optional["Element"]:
|
63
83
|
"""Get the last element in the collection."""
|
64
84
|
return self._elements[-1] if self._elements else None
|
65
|
-
|
66
|
-
def highest(self) -> Optional[
|
85
|
+
|
86
|
+
def highest(self) -> Optional["Element"]:
|
67
87
|
"""
|
68
88
|
Get element with the smallest top y-coordinate (highest on page).
|
69
|
-
|
89
|
+
|
70
90
|
Raises:
|
71
91
|
ValueError: If elements are on multiple pages
|
72
|
-
|
92
|
+
|
73
93
|
Returns:
|
74
94
|
Element with smallest top value or None if empty
|
75
95
|
"""
|
76
96
|
if not self._elements:
|
77
97
|
return None
|
78
|
-
|
98
|
+
|
79
99
|
# Check if elements are on multiple pages
|
80
100
|
if self._are_on_multiple_pages():
|
81
101
|
raise ValueError("Cannot determine highest element across multiple pages")
|
82
|
-
|
102
|
+
|
83
103
|
return min(self._elements, key=lambda e: e.top)
|
84
|
-
|
85
|
-
def lowest(self) -> Optional[
|
104
|
+
|
105
|
+
def lowest(self) -> Optional["Element"]:
|
86
106
|
"""
|
87
107
|
Get element with the largest bottom y-coordinate (lowest on page).
|
88
|
-
|
108
|
+
|
89
109
|
Raises:
|
90
110
|
ValueError: If elements are on multiple pages
|
91
|
-
|
111
|
+
|
92
112
|
Returns:
|
93
113
|
Element with largest bottom value or None if empty
|
94
114
|
"""
|
95
115
|
if not self._elements:
|
96
116
|
return None
|
97
|
-
|
117
|
+
|
98
118
|
# Check if elements are on multiple pages
|
99
119
|
if self._are_on_multiple_pages():
|
100
120
|
raise ValueError("Cannot determine lowest element across multiple pages")
|
101
|
-
|
121
|
+
|
102
122
|
return max(self._elements, key=lambda e: e.bottom)
|
103
|
-
|
104
|
-
def leftmost(self) -> Optional[
|
123
|
+
|
124
|
+
def leftmost(self) -> Optional["Element"]:
|
105
125
|
"""
|
106
126
|
Get element with the smallest x0 coordinate (leftmost on page).
|
107
|
-
|
127
|
+
|
108
128
|
Raises:
|
109
129
|
ValueError: If elements are on multiple pages
|
110
|
-
|
130
|
+
|
111
131
|
Returns:
|
112
132
|
Element with smallest x0 value or None if empty
|
113
133
|
"""
|
114
134
|
if not self._elements:
|
115
135
|
return None
|
116
|
-
|
136
|
+
|
117
137
|
# Check if elements are on multiple pages
|
118
138
|
if self._are_on_multiple_pages():
|
119
139
|
raise ValueError("Cannot determine leftmost element across multiple pages")
|
120
|
-
|
140
|
+
|
121
141
|
return min(self._elements, key=lambda e: e.x0)
|
122
|
-
|
123
|
-
def rightmost(self) -> Optional[
|
142
|
+
|
143
|
+
def rightmost(self) -> Optional["Element"]:
|
124
144
|
"""
|
125
145
|
Get element with the largest x1 coordinate (rightmost on page).
|
126
|
-
|
146
|
+
|
127
147
|
Raises:
|
128
148
|
ValueError: If elements are on multiple pages
|
129
|
-
|
149
|
+
|
130
150
|
Returns:
|
131
151
|
Element with largest x1 value or None if empty
|
132
152
|
"""
|
133
153
|
if not self._elements:
|
134
154
|
return None
|
135
|
-
|
155
|
+
|
136
156
|
# Check if elements are on multiple pages
|
137
157
|
if self._are_on_multiple_pages():
|
138
158
|
raise ValueError("Cannot determine rightmost element across multiple pages")
|
139
|
-
|
159
|
+
|
140
160
|
return max(self._elements, key=lambda e: e.x1)
|
141
|
-
|
161
|
+
|
142
162
|
def _are_on_multiple_pages(self) -> bool:
|
143
163
|
"""
|
144
164
|
Check if elements in this collection span multiple pages.
|
145
|
-
|
165
|
+
|
146
166
|
Returns:
|
147
167
|
True if elements are on different pages, False otherwise
|
148
168
|
"""
|
149
169
|
if not self._elements:
|
150
170
|
return False
|
151
|
-
|
171
|
+
|
152
172
|
# Get the page index of the first element
|
153
|
-
if not hasattr(self._elements[0],
|
173
|
+
if not hasattr(self._elements[0], "page"):
|
154
174
|
return False
|
155
|
-
|
175
|
+
|
156
176
|
first_page_idx = self._elements[0].page.index
|
157
|
-
|
177
|
+
|
158
178
|
# Check if any element is on a different page
|
159
|
-
return any(hasattr(e,
|
160
|
-
|
161
|
-
def exclude_regions(self, regions: List[
|
179
|
+
return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
|
180
|
+
|
181
|
+
def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
|
162
182
|
"""
|
163
183
|
Remove elements that are within any of the specified regions.
|
164
|
-
|
184
|
+
|
165
185
|
Args:
|
166
186
|
regions: List of Region objects to exclude
|
167
|
-
|
187
|
+
|
168
188
|
Returns:
|
169
189
|
New ElementCollection with filtered elements
|
170
190
|
"""
|
171
191
|
if not regions:
|
172
192
|
return ElementCollection(self._elements)
|
173
|
-
|
193
|
+
|
174
194
|
filtered = []
|
175
195
|
for element in self._elements:
|
176
196
|
exclude = False
|
@@ -180,72 +200,156 @@ class ElementCollection(Generic[T]):
|
|
180
200
|
break
|
181
201
|
if not exclude:
|
182
202
|
filtered.append(element)
|
183
|
-
|
203
|
+
|
184
204
|
return ElementCollection(filtered)
|
185
|
-
|
205
|
+
|
186
206
|
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
|
187
207
|
"""
|
188
|
-
Extract text from all
|
189
|
-
|
208
|
+
Extract text from all TextElements in the collection, optionally using
|
209
|
+
pdfplumber's layout engine if layout=True is specified.
|
210
|
+
|
190
211
|
Args:
|
191
|
-
preserve_whitespace:
|
192
|
-
use_exclusions:
|
193
|
-
|
194
|
-
|
212
|
+
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
213
|
+
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
214
|
+
the collection or by filtering the collection itself.
|
215
|
+
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
216
|
+
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
217
|
+
See Page.extract_text docstring for common parameters.
|
218
|
+
If `layout=False` or omitted, performs a simple join.
|
219
|
+
|
195
220
|
Returns:
|
196
|
-
Combined text from
|
221
|
+
Combined text from elements, potentially with layout-based spacing.
|
197
222
|
"""
|
198
|
-
# Filter to just
|
199
|
-
text_elements = [
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
223
|
+
# Filter to just TextElements that likely have _char_dicts
|
224
|
+
text_elements = [
|
225
|
+
el
|
226
|
+
for el in self._elements
|
227
|
+
if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
|
228
|
+
]
|
229
|
+
|
230
|
+
if not text_elements:
|
231
|
+
return ""
|
232
|
+
|
233
|
+
# Collect all character dictionaries
|
234
|
+
all_char_dicts = []
|
235
|
+
for el in text_elements:
|
236
|
+
all_char_dicts.extend(getattr(el, "_char_dicts", []))
|
237
|
+
|
238
|
+
if not all_char_dicts:
|
239
|
+
# Handle case where elements exist but have no char dicts
|
240
|
+
logger.warning(
|
241
|
+
"ElementCollection.extract_text: No character dictionaries found in TextElements."
|
242
|
+
)
|
243
|
+
return " ".join(
|
244
|
+
getattr(el, "text", "") for el in text_elements
|
245
|
+
) # Fallback to simple join of word text
|
246
|
+
|
247
|
+
# Check if layout is requested
|
248
|
+
use_layout = kwargs.get("layout", False)
|
249
|
+
|
250
|
+
if use_layout:
|
251
|
+
logger.debug("ElementCollection.extract_text: Using layout=True path.")
|
252
|
+
# Layout requested: Use chars_to_textmap
|
253
|
+
|
254
|
+
# Prepare layout kwargs
|
255
|
+
layout_kwargs = {}
|
256
|
+
allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
|
257
|
+
for key, value in kwargs.items():
|
258
|
+
if key in allowed_keys:
|
259
|
+
layout_kwargs[key] = value
|
260
|
+
layout_kwargs["layout"] = True # Ensure layout is True
|
261
|
+
|
262
|
+
# Calculate overall bbox for the elements used
|
263
|
+
collection_bbox = objects_to_bbox(all_char_dicts)
|
264
|
+
coll_x0, coll_top, coll_x1, coll_bottom = collection_bbox
|
265
|
+
coll_width = coll_x1 - coll_x0
|
266
|
+
coll_height = coll_bottom - coll_top
|
267
|
+
|
268
|
+
# Set layout parameters based on collection bounds
|
269
|
+
# Warn if collection is sparse? TBD.
|
270
|
+
if "layout_bbox" not in layout_kwargs:
|
271
|
+
layout_kwargs["layout_bbox"] = collection_bbox
|
272
|
+
if "layout_width" not in layout_kwargs:
|
273
|
+
layout_kwargs["layout_width"] = coll_width
|
274
|
+
if "layout_height" not in layout_kwargs:
|
275
|
+
layout_kwargs["layout_height"] = coll_height
|
276
|
+
# Set shifts relative to the collection's top-left
|
277
|
+
if "x_shift" not in layout_kwargs:
|
278
|
+
layout_kwargs["x_shift"] = coll_x0
|
279
|
+
if "y_shift" not in layout_kwargs:
|
280
|
+
layout_kwargs["y_shift"] = coll_top
|
281
|
+
|
282
|
+
try:
|
283
|
+
# Sort chars by document order (page, top, x0)
|
284
|
+
# Need page info on char dicts for multi-page collections
|
285
|
+
# Assuming char dicts have 'page_number' from element creation
|
286
|
+
all_char_dicts.sort(
|
287
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
288
|
+
)
|
289
|
+
textmap = chars_to_textmap(all_char_dicts, **layout_kwargs)
|
290
|
+
result = textmap.as_string
|
291
|
+
except Exception as e:
|
292
|
+
logger.error(
|
293
|
+
f"ElementCollection: Error calling chars_to_textmap: {e}", exc_info=True
|
294
|
+
)
|
295
|
+
logger.warning(
|
296
|
+
"ElementCollection: Falling back to simple text join due to layout error."
|
297
|
+
)
|
298
|
+
# Fallback sorting and joining
|
299
|
+
all_char_dicts.sort(
|
300
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
301
|
+
)
|
302
|
+
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
303
|
+
|
304
|
+
else:
|
305
|
+
# Default: Simple join without layout
|
306
|
+
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
307
|
+
# Sort chars by document order (page, top, x0)
|
308
|
+
all_char_dicts.sort(
|
309
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
310
|
+
)
|
311
|
+
# Simple join of character text
|
312
|
+
result = "".join(c.get("text", "") for c in all_char_dicts)
|
313
|
+
# Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
|
314
|
+
|
315
|
+
return result
|
316
|
+
|
317
|
+
def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
|
216
318
|
"""
|
217
319
|
Filter elements using a function.
|
218
|
-
|
320
|
+
|
219
321
|
Args:
|
220
322
|
func: Function that takes an element and returns True to keep it
|
221
|
-
|
323
|
+
|
222
324
|
Returns:
|
223
325
|
New ElementCollection with filtered elements
|
224
326
|
"""
|
225
327
|
return ElementCollection([e for e in self._elements if func(e)])
|
226
|
-
|
227
|
-
def sort(self, key=None, reverse=False) ->
|
328
|
+
|
329
|
+
def sort(self, key=None, reverse=False) -> "ElementCollection":
|
228
330
|
"""
|
229
331
|
Sort elements by the given key function.
|
230
|
-
|
332
|
+
|
231
333
|
Args:
|
232
334
|
key: Function to generate a key for sorting
|
233
335
|
reverse: Whether to sort in descending order
|
234
|
-
|
336
|
+
|
235
337
|
Returns:
|
236
338
|
Self for method chaining
|
237
339
|
"""
|
238
340
|
self._elements.sort(key=key, reverse=reverse)
|
239
341
|
return self
|
240
|
-
|
241
|
-
def highlight(
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
342
|
+
|
343
|
+
def highlight(
|
344
|
+
self,
|
345
|
+
label: Optional[str] = None,
|
346
|
+
color: Optional[Union[Tuple, str]] = None,
|
347
|
+
group_by: Optional[str] = None,
|
348
|
+
label_format: Optional[str] = None,
|
349
|
+
distinct: bool = False,
|
350
|
+
include_attrs: Optional[List[str]] = None,
|
351
|
+
replace: bool = False,
|
352
|
+
) -> "ElementCollection":
|
249
353
|
"""
|
250
354
|
Adds persistent highlights for all elements in the collection to the page
|
251
355
|
via the HighlightingService.
|
@@ -294,17 +398,17 @@ class ElementCollection(Generic[T]):
|
|
294
398
|
color=color,
|
295
399
|
group_by=group_by,
|
296
400
|
label_format=label_format,
|
297
|
-
include_attrs=include_attrs
|
401
|
+
include_attrs=include_attrs,
|
298
402
|
# 'replace' flag is handled during the add call below
|
299
403
|
)
|
300
404
|
|
301
405
|
# 2. Add prepared highlights to the persistent service
|
302
406
|
if not highlight_data_list:
|
303
|
-
return self
|
407
|
+
return self # Nothing to add
|
304
408
|
|
305
409
|
# Get page and highlighter from the first element (assume uniform page)
|
306
410
|
first_element = self._elements[0]
|
307
|
-
if not hasattr(first_element,
|
411
|
+
if not hasattr(first_element, "page") or not hasattr(first_element.page, "_highlighter"):
|
308
412
|
logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
|
309
413
|
return self
|
310
414
|
|
@@ -317,42 +421,48 @@ class ElementCollection(Generic[T]):
|
|
317
421
|
if replace:
|
318
422
|
# Identify all unique page indices in this operation
|
319
423
|
for data in highlight_data_list:
|
320
|
-
pages_to_clear.add(data[
|
424
|
+
pages_to_clear.add(data["page_index"])
|
321
425
|
# Clear those pages *before* adding new highlights
|
322
|
-
logger.debug(
|
426
|
+
logger.debug(
|
427
|
+
f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}"
|
428
|
+
)
|
323
429
|
for page_idx in pages_to_clear:
|
324
430
|
highlighter.clear_page(page_idx)
|
325
431
|
|
326
432
|
for data in highlight_data_list:
|
327
433
|
# Call the appropriate service add method
|
328
434
|
add_args = {
|
329
|
-
"page_index": data[
|
330
|
-
"color": data[
|
331
|
-
"label": data[
|
332
|
-
"use_color_cycling": data.get(
|
333
|
-
|
334
|
-
|
435
|
+
"page_index": data["page_index"],
|
436
|
+
"color": data["color"], # Color determined by _prepare
|
437
|
+
"label": data["label"], # Label determined by _prepare
|
438
|
+
"use_color_cycling": data.get(
|
439
|
+
"use_color_cycling", False
|
440
|
+
), # Set by _prepare if distinct
|
441
|
+
"element": data["element"],
|
442
|
+
"include_attrs": data["include_attrs"],
|
335
443
|
# Internal call to service always appends, as clearing was handled above
|
336
|
-
"existing":
|
444
|
+
"existing": "append",
|
337
445
|
}
|
338
|
-
if data.get(
|
339
|
-
add_args["polygon"] = data[
|
446
|
+
if data.get("polygon"):
|
447
|
+
add_args["polygon"] = data["polygon"]
|
340
448
|
highlighter.add_polygon(**add_args)
|
341
|
-
elif data.get(
|
342
|
-
add_args["bbox"] = data[
|
449
|
+
elif data.get("bbox"):
|
450
|
+
add_args["bbox"] = data["bbox"]
|
343
451
|
highlighter.add(**add_args)
|
344
452
|
else:
|
345
453
|
logger.warning(f"Skipping highlight data, no bbox or polygon found: {data}")
|
346
454
|
|
347
455
|
return self
|
348
456
|
|
349
|
-
def _prepare_highlight_data(
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
457
|
+
def _prepare_highlight_data(
|
458
|
+
self,
|
459
|
+
distinct: bool = False,
|
460
|
+
label: Optional[str] = None,
|
461
|
+
color: Optional[Union[Tuple, str]] = None,
|
462
|
+
group_by: Optional[str] = None,
|
463
|
+
label_format: Optional[str] = None,
|
464
|
+
include_attrs: Optional[List[str]] = None,
|
465
|
+
) -> List[Dict]:
|
356
466
|
"""
|
357
467
|
Determines the parameters for highlighting each element based on the strategy.
|
358
468
|
|
@@ -364,58 +474,64 @@ class ElementCollection(Generic[T]):
|
|
364
474
|
Color and label determination happens here.
|
365
475
|
"""
|
366
476
|
prepared_data = []
|
367
|
-
if not self._elements:
|
477
|
+
if not self._elements:
|
478
|
+
return prepared_data
|
368
479
|
|
369
480
|
# Need access to the HighlightingService to determine colors correctly.
|
370
481
|
highlighter = None
|
371
482
|
first_element = self._elements[0]
|
372
|
-
if hasattr(first_element,
|
483
|
+
if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
|
373
484
|
highlighter = first_element.page._highlighter
|
374
485
|
else:
|
375
|
-
logger.warning(
|
486
|
+
logger.warning(
|
487
|
+
"Cannot determine highlight colors: HighlightingService not accessible from elements."
|
488
|
+
)
|
376
489
|
return []
|
377
490
|
|
378
491
|
if distinct:
|
379
492
|
logger.debug("_prepare: Distinct highlighting strategy.")
|
380
493
|
for element in self._elements:
|
381
494
|
# Call the service's color determination logic
|
382
|
-
final_color = highlighter._determine_highlight_color(
|
495
|
+
final_color = highlighter._determine_highlight_color(
|
496
|
+
label=None, color_input=None, use_color_cycling=True
|
497
|
+
)
|
383
498
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
384
499
|
if element_data:
|
385
|
-
element_data.update(
|
386
|
-
|
387
|
-
|
388
|
-
'use_color_cycling': True
|
389
|
-
})
|
500
|
+
element_data.update(
|
501
|
+
{"color": final_color, "label": None, "use_color_cycling": True}
|
502
|
+
)
|
390
503
|
prepared_data.append(element_data)
|
391
504
|
|
392
505
|
elif label is not None:
|
393
506
|
logger.debug(f"_prepare: Explicit label '{label}' strategy.")
|
394
|
-
final_color = highlighter._determine_highlight_color(
|
507
|
+
final_color = highlighter._determine_highlight_color(
|
508
|
+
label=label, color_input=color, use_color_cycling=False
|
509
|
+
)
|
395
510
|
for element in self._elements:
|
396
511
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
397
512
|
if element_data:
|
398
|
-
element_data.update({
|
399
|
-
'color': final_color,
|
400
|
-
'label': label
|
401
|
-
})
|
513
|
+
element_data.update({"color": final_color, "label": label})
|
402
514
|
prepared_data.append(element_data)
|
403
515
|
|
404
516
|
elif group_by is not None:
|
405
517
|
logger.debug("_prepare: Grouping by attribute strategy.")
|
406
518
|
grouped_elements = self._group_elements_by_attr(group_by)
|
407
519
|
for group_key, group_elements in grouped_elements.items():
|
408
|
-
if not group_elements:
|
409
|
-
|
410
|
-
|
411
|
-
|
520
|
+
if not group_elements:
|
521
|
+
continue
|
522
|
+
group_label = self._format_group_label(
|
523
|
+
group_key, label_format, group_elements[0], group_by
|
524
|
+
)
|
525
|
+
final_color = highlighter._determine_highlight_color(
|
526
|
+
label=group_label, color_input=None, use_color_cycling=False
|
527
|
+
)
|
528
|
+
logger.debug(
|
529
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
530
|
+
)
|
412
531
|
for element in group_elements:
|
413
532
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
414
533
|
if element_data:
|
415
|
-
element_data.update({
|
416
|
-
'color': final_color,
|
417
|
-
'label': group_label
|
418
|
-
})
|
534
|
+
element_data.update({"color": final_color, "label": group_label})
|
419
535
|
prepared_data.append(element_data)
|
420
536
|
else:
|
421
537
|
logger.debug("_prepare: Default grouping strategy.")
|
@@ -423,15 +539,21 @@ class ElementCollection(Generic[T]):
|
|
423
539
|
|
424
540
|
if len(element_types) == 1:
|
425
541
|
type_name = element_types.pop()
|
426
|
-
base_name =
|
542
|
+
base_name = (
|
543
|
+
type_name.replace("Element", "").replace("Region", "")
|
544
|
+
if type_name != "Region"
|
545
|
+
else "Region"
|
546
|
+
)
|
427
547
|
auto_label = f"{base_name} Elements" if base_name else "Elements"
|
428
548
|
# Determine color *before* logging or using it
|
429
|
-
final_color = highlighter._determine_highlight_color(
|
549
|
+
final_color = highlighter._determine_highlight_color(
|
550
|
+
label=auto_label, color_input=color, use_color_cycling=False
|
551
|
+
)
|
430
552
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
431
553
|
for element in self._elements:
|
432
554
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
433
555
|
if element_data:
|
434
|
-
element_data.update({
|
556
|
+
element_data.update({"color": final_color, "label": auto_label})
|
435
557
|
prepared_data.append(element_data)
|
436
558
|
else:
|
437
559
|
# Mixed types: Generate generic label and warn
|
@@ -442,26 +564,33 @@ class ElementCollection(Generic[T]):
|
|
442
564
|
f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
|
443
565
|
f"or 'distinct=True' for more specific highlighting."
|
444
566
|
)
|
445
|
-
final_color = highlighter._determine_highlight_color(
|
567
|
+
final_color = highlighter._determine_highlight_color(
|
568
|
+
label=auto_label, color_input=color, use_color_cycling=False
|
569
|
+
)
|
446
570
|
# Determine color *before* logging or using it (already done above for this branch)
|
447
571
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
448
572
|
for element in self._elements:
|
449
573
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
450
574
|
if element_data:
|
451
|
-
element_data.update({
|
575
|
+
element_data.update({"color": final_color, "label": auto_label})
|
452
576
|
prepared_data.append(element_data)
|
453
577
|
|
454
578
|
return prepared_data
|
455
579
|
|
456
|
-
def _call_element_highlighter(
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
580
|
+
def _call_element_highlighter(
|
581
|
+
self,
|
582
|
+
element: T,
|
583
|
+
color: Optional[Union[Tuple, str]],
|
584
|
+
label: Optional[str],
|
585
|
+
use_color_cycling: bool,
|
586
|
+
include_attrs: Optional[List[str]],
|
587
|
+
existing: str,
|
588
|
+
):
|
462
589
|
"""Low-level helper to call the appropriate HighlightingService method for an element."""
|
463
|
-
if not hasattr(element,
|
464
|
-
logger.warning(
|
590
|
+
if not hasattr(element, "page") or not hasattr(element.page, "_highlighter"):
|
591
|
+
logger.warning(
|
592
|
+
f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}"
|
593
|
+
)
|
465
594
|
return
|
466
595
|
|
467
596
|
page = element.page
|
@@ -472,59 +601,68 @@ class ElementCollection(Generic[T]):
|
|
472
601
|
"use_color_cycling": use_color_cycling,
|
473
602
|
"include_attrs": include_attrs,
|
474
603
|
"existing": existing,
|
475
|
-
"element": element
|
604
|
+
"element": element,
|
476
605
|
}
|
477
606
|
|
478
|
-
is_polygon = getattr(element,
|
607
|
+
is_polygon = getattr(element, "has_polygon", False)
|
479
608
|
geom_data = None
|
480
609
|
add_method = None
|
481
610
|
|
482
611
|
if is_polygon:
|
483
|
-
geom_data = getattr(element,
|
612
|
+
geom_data = getattr(element, "polygon", None)
|
484
613
|
if geom_data:
|
485
|
-
args_for_highlighter[
|
614
|
+
args_for_highlighter["polygon"] = geom_data
|
486
615
|
add_method = page._highlighter.add_polygon
|
487
616
|
else:
|
488
|
-
geom_data = getattr(element,
|
617
|
+
geom_data = getattr(element, "bbox", None)
|
489
618
|
if geom_data:
|
490
|
-
args_for_highlighter[
|
619
|
+
args_for_highlighter["bbox"] = geom_data
|
491
620
|
add_method = page._highlighter.add
|
492
621
|
|
493
622
|
if add_method and geom_data:
|
494
623
|
try:
|
495
624
|
add_method(**args_for_highlighter)
|
496
625
|
except Exception as e:
|
497
|
-
logger.error(
|
626
|
+
logger.error(
|
627
|
+
f"Error calling highlighter method for element {element} on page {page.index}: {e}",
|
628
|
+
exc_info=True,
|
629
|
+
)
|
498
630
|
elif not geom_data:
|
499
631
|
logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")
|
500
632
|
|
501
|
-
def _highlight_as_single_group(
|
502
|
-
|
503
|
-
|
504
|
-
|
633
|
+
def _highlight_as_single_group(
|
634
|
+
self,
|
635
|
+
label: str,
|
636
|
+
color: Optional[Union[Tuple, str]],
|
637
|
+
include_attrs: Optional[List[str]],
|
638
|
+
existing: str,
|
639
|
+
):
|
505
640
|
"""Highlights all elements with the same explicit label and color."""
|
506
641
|
for element in self._elements:
|
507
642
|
self._call_element_highlighter(
|
508
643
|
element=element,
|
509
|
-
color=color,
|
510
|
-
label=label,
|
511
|
-
use_color_cycling=False,
|
644
|
+
color=color, # Use explicit color if provided
|
645
|
+
label=label, # Use the explicit group label
|
646
|
+
use_color_cycling=False, # Use consistent color for the label
|
512
647
|
include_attrs=include_attrs,
|
513
|
-
existing=existing
|
648
|
+
existing=existing,
|
514
649
|
)
|
515
650
|
|
516
|
-
def _highlight_grouped_by_attribute(
|
517
|
-
|
518
|
-
|
519
|
-
|
651
|
+
def _highlight_grouped_by_attribute(
|
652
|
+
self,
|
653
|
+
group_by: str,
|
654
|
+
label_format: Optional[str],
|
655
|
+
include_attrs: Optional[List[str]],
|
656
|
+
existing: str,
|
657
|
+
):
|
520
658
|
"""Groups elements by attribute and highlights each group distinctly."""
|
521
659
|
grouped_elements: Dict[Any, List[T]] = {}
|
522
660
|
# Group elements by the specified attribute value
|
523
661
|
for element in self._elements:
|
524
662
|
try:
|
525
663
|
group_key = getattr(element, group_by, None)
|
526
|
-
if group_key is None:
|
527
|
-
|
664
|
+
if group_key is None: # Handle elements missing the attribute
|
665
|
+
group_key = f"Missing '{group_by}'"
|
528
666
|
# Ensure group_key is hashable (convert list/dict if necessary)
|
529
667
|
if isinstance(group_key, (list, dict)):
|
530
668
|
group_key = str(group_key)
|
@@ -533,41 +671,49 @@ class ElementCollection(Generic[T]):
|
|
533
671
|
grouped_elements[group_key] = []
|
534
672
|
grouped_elements[group_key].append(element)
|
535
673
|
except AttributeError:
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
674
|
+
logger.warning(
|
675
|
+
f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
|
676
|
+
)
|
677
|
+
group_key = f"Error accessing '{group_by}'"
|
678
|
+
if group_key not in grouped_elements:
|
679
|
+
grouped_elements[group_key] = []
|
680
|
+
grouped_elements[group_key].append(element)
|
681
|
+
except TypeError: # Handle unhashable types
|
682
|
+
logger.warning(
|
683
|
+
f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
|
684
|
+
)
|
685
|
+
group_key = str(group_key)
|
686
|
+
if group_key not in grouped_elements:
|
687
|
+
grouped_elements[group_key] = []
|
688
|
+
grouped_elements[group_key].append(element)
|
548
689
|
|
549
690
|
# Highlight each group
|
550
691
|
for group_key, group_elements in grouped_elements.items():
|
551
|
-
if not group_elements:
|
692
|
+
if not group_elements:
|
693
|
+
continue
|
552
694
|
|
553
695
|
# Determine the label for this group
|
554
|
-
first_element = group_elements[0]
|
696
|
+
first_element = group_elements[0] # Use first element for formatting
|
555
697
|
group_label = None
|
556
698
|
if label_format:
|
557
699
|
try:
|
558
700
|
# Create a dict of element attributes for formatting
|
559
|
-
element_attrs = first_element.__dict__.copy()
|
701
|
+
element_attrs = first_element.__dict__.copy() # Start with element's dict
|
560
702
|
# Ensure the group_by key itself is present correctly
|
561
703
|
element_attrs[group_by] = group_key
|
562
704
|
group_label = label_format.format(**element_attrs)
|
563
705
|
except KeyError as e:
|
564
|
-
logger.warning(
|
706
|
+
logger.warning(
|
707
|
+
f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
|
708
|
+
)
|
565
709
|
group_label = str(group_key)
|
566
710
|
except Exception as format_e:
|
567
|
-
logger.warning(
|
711
|
+
logger.warning(
|
712
|
+
f"Error formatting label '{label_format}': {format_e}. Using group key as label."
|
713
|
+
)
|
568
714
|
group_label = str(group_key)
|
569
715
|
else:
|
570
|
-
group_label = str(group_key)
|
716
|
+
group_label = str(group_key) # Use the attribute value as label
|
571
717
|
|
572
718
|
logger.debug(f" Highlighting group '{group_label}' ({len(group_elements)} elements)")
|
573
719
|
|
@@ -575,11 +721,11 @@ class ElementCollection(Generic[T]):
|
|
575
721
|
for element in group_elements:
|
576
722
|
self._call_element_highlighter(
|
577
723
|
element=element,
|
578
|
-
color=None,
|
579
|
-
label=group_label,
|
580
|
-
use_color_cycling=False,
|
724
|
+
color=None, # Let ColorManager choose based on label
|
725
|
+
label=group_label, # Use the derived group label
|
726
|
+
use_color_cycling=False, # Use consistent color for the label
|
581
727
|
include_attrs=include_attrs,
|
582
|
-
existing=existing
|
728
|
+
existing=existing,
|
583
729
|
)
|
584
730
|
|
585
731
|
def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
|
@@ -589,116 +735,122 @@ class ElementCollection(Generic[T]):
|
|
589
735
|
for element in self._elements:
|
590
736
|
self._call_element_highlighter(
|
591
737
|
element=element,
|
592
|
-
color=None,
|
593
|
-
label=None,
|
594
|
-
use_color_cycling=True,
|
738
|
+
color=None, # Let ColorManager cycle
|
739
|
+
label=None, # No label for distinct elements
|
740
|
+
use_color_cycling=True, # Force cycling
|
595
741
|
include_attrs=include_attrs,
|
596
|
-
existing=existing
|
742
|
+
existing=existing,
|
743
|
+
)
|
744
|
+
|
745
|
+
def show(
|
746
|
+
self,
|
747
|
+
# --- Visualization Parameters ---
|
748
|
+
group_by: Optional[str] = None,
|
749
|
+
label: Optional[str] = None,
|
750
|
+
color: Optional[Union[Tuple, str]] = None,
|
751
|
+
label_format: Optional[str] = None,
|
752
|
+
distinct: bool = False,
|
753
|
+
include_attrs: Optional[List[str]] = None,
|
754
|
+
# --- Rendering Parameters ---
|
755
|
+
scale: float = 2.0,
|
756
|
+
labels: bool = True, # Use 'labels' consistent with service
|
757
|
+
legend_position: str = "right",
|
758
|
+
render_ocr: bool = False,
|
759
|
+
) -> Optional["Image.Image"]:
|
760
|
+
"""
|
761
|
+
Generates a temporary preview image highlighting elements in this collection
|
762
|
+
on their page, ignoring any persistent highlights.
|
763
|
+
|
764
|
+
Currently only supports collections where all elements are on the same page.
|
765
|
+
|
766
|
+
Allows grouping and coloring elements based on attributes, similar to the
|
767
|
+
persistent `highlight()` method, but only for this temporary view.
|
768
|
+
|
769
|
+
Args:
|
770
|
+
group_by: Attribute name to group elements by for distinct colors/labels.
|
771
|
+
label: Explicit label for all elements (overrides group_by).
|
772
|
+
color: Explicit color for all elements (if label used) or base color.
|
773
|
+
label_format: F-string to format group labels if group_by is used.
|
774
|
+
distinct: Highlight each element distinctly (overrides group_by/label).
|
775
|
+
include_attrs: Attributes to display on individual highlights.
|
776
|
+
scale: Scale factor for rendering image.
|
777
|
+
labels: Whether to include a legend for the temporary highlights.
|
778
|
+
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
779
|
+
render_ocr: Whether to render OCR text.
|
780
|
+
|
781
|
+
Returns:
|
782
|
+
PIL Image object of the temporary preview, or None if rendering fails or
|
783
|
+
elements span multiple pages.
|
784
|
+
|
785
|
+
Raises:
|
786
|
+
ValueError: If the collection is empty or elements are on different pages.
|
787
|
+
"""
|
788
|
+
if not self._elements:
|
789
|
+
raise ValueError("Cannot show an empty collection.")
|
790
|
+
|
791
|
+
# Check if elements are on multiple pages
|
792
|
+
if self._are_on_multiple_pages():
|
793
|
+
raise ValueError(
|
794
|
+
"show() currently only supports collections where all elements are on the same page."
|
795
|
+
)
|
796
|
+
|
797
|
+
# Get the page and highlighting service from the first element
|
798
|
+
first_element = self._elements[0]
|
799
|
+
if not hasattr(first_element, "page") or not first_element.page:
|
800
|
+
logger.warning("Cannot show collection: First element has no associated page.")
|
801
|
+
return None
|
802
|
+
page = first_element.page
|
803
|
+
if not hasattr(page, "pdf") or not page.pdf:
|
804
|
+
logger.warning("Cannot show collection: Page has no associated PDF object.")
|
805
|
+
return None
|
806
|
+
|
807
|
+
service = page._highlighter
|
808
|
+
if not service:
|
809
|
+
logger.warning("Cannot show collection: PDF object has no highlighting service.")
|
810
|
+
return None
|
811
|
+
|
812
|
+
# 1. Prepare temporary highlight data based on grouping parameters
|
813
|
+
# This returns a list of dicts, suitable for render_preview
|
814
|
+
highlight_data_list = self._prepare_highlight_data(
|
815
|
+
distinct=distinct,
|
816
|
+
label=label,
|
817
|
+
color=color,
|
818
|
+
group_by=group_by,
|
819
|
+
label_format=label_format,
|
820
|
+
include_attrs=include_attrs,
|
821
|
+
)
|
822
|
+
|
823
|
+
if not highlight_data_list:
|
824
|
+
logger.warning("No highlight data generated for show(). Rendering clean page.")
|
825
|
+
# Render the page without any temporary highlights
|
826
|
+
highlight_data_list = []
|
827
|
+
|
828
|
+
# 2. Call render_preview on the HighlightingService
|
829
|
+
try:
|
830
|
+
return service.render_preview(
|
831
|
+
page_index=page.index,
|
832
|
+
temporary_highlights=highlight_data_list,
|
833
|
+
scale=scale,
|
834
|
+
labels=labels, # Use 'labels'
|
835
|
+
legend_position=legend_position,
|
836
|
+
render_ocr=render_ocr,
|
597
837
|
)
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
render_ocr: bool = False) -> Optional['Image.Image']:
|
612
|
-
"""
|
613
|
-
Generates a temporary preview image highlighting elements in this collection
|
614
|
-
on their page, ignoring any persistent highlights.
|
615
|
-
|
616
|
-
Currently only supports collections where all elements are on the same page.
|
617
|
-
|
618
|
-
Allows grouping and coloring elements based on attributes, similar to the
|
619
|
-
persistent `highlight()` method, but only for this temporary view.
|
620
|
-
|
621
|
-
Args:
|
622
|
-
group_by: Attribute name to group elements by for distinct colors/labels.
|
623
|
-
label: Explicit label for all elements (overrides group_by).
|
624
|
-
color: Explicit color for all elements (if label used) or base color.
|
625
|
-
label_format: F-string to format group labels if group_by is used.
|
626
|
-
distinct: Highlight each element distinctly (overrides group_by/label).
|
627
|
-
include_attrs: Attributes to display on individual highlights.
|
628
|
-
scale: Scale factor for rendering image.
|
629
|
-
labels: Whether to include a legend for the temporary highlights.
|
630
|
-
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
631
|
-
render_ocr: Whether to render OCR text.
|
632
|
-
|
633
|
-
Returns:
|
634
|
-
PIL Image object of the temporary preview, or None if rendering fails or
|
635
|
-
elements span multiple pages.
|
636
|
-
|
637
|
-
Raises:
|
638
|
-
ValueError: If the collection is empty or elements are on different pages.
|
639
|
-
"""
|
640
|
-
if not self._elements:
|
641
|
-
raise ValueError("Cannot show an empty collection.")
|
642
|
-
|
643
|
-
# Check if elements are on multiple pages
|
644
|
-
if self._are_on_multiple_pages():
|
645
|
-
raise ValueError("show() currently only supports collections where all elements are on the same page.")
|
646
|
-
|
647
|
-
# Get the page and highlighting service from the first element
|
648
|
-
first_element = self._elements[0]
|
649
|
-
if not hasattr(first_element, 'page') or not first_element.page:
|
650
|
-
logger.warning("Cannot show collection: First element has no associated page.")
|
651
|
-
return None
|
652
|
-
page = first_element.page
|
653
|
-
if not hasattr(page, 'pdf') or not page.pdf:
|
654
|
-
logger.warning("Cannot show collection: Page has no associated PDF object.")
|
655
|
-
return None
|
656
|
-
|
657
|
-
service = page._highlighter
|
658
|
-
if not service:
|
659
|
-
logger.warning("Cannot show collection: PDF object has no highlighting service.")
|
660
|
-
return None
|
661
|
-
|
662
|
-
# 1. Prepare temporary highlight data based on grouping parameters
|
663
|
-
# This returns a list of dicts, suitable for render_preview
|
664
|
-
highlight_data_list = self._prepare_highlight_data(
|
665
|
-
distinct=distinct,
|
666
|
-
label=label,
|
667
|
-
color=color,
|
668
|
-
group_by=group_by,
|
669
|
-
label_format=label_format,
|
670
|
-
include_attrs=include_attrs
|
671
|
-
)
|
672
|
-
|
673
|
-
if not highlight_data_list:
|
674
|
-
logger.warning("No highlight data generated for show(). Rendering clean page.")
|
675
|
-
# Render the page without any temporary highlights
|
676
|
-
highlight_data_list = []
|
677
|
-
|
678
|
-
# 2. Call render_preview on the HighlightingService
|
679
|
-
try:
|
680
|
-
return service.render_preview(
|
681
|
-
page_index=page.index,
|
682
|
-
temporary_highlights=highlight_data_list,
|
683
|
-
scale=scale,
|
684
|
-
labels=labels, # Use 'labels'
|
685
|
-
legend_position=legend_position,
|
686
|
-
render_ocr=render_ocr
|
687
|
-
)
|
688
|
-
except Exception as e:
|
689
|
-
logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
|
690
|
-
return None
|
691
|
-
|
692
|
-
def save(self,
|
693
|
-
filename: str,
|
694
|
-
scale: float = 2.0,
|
695
|
-
width: Optional[int] = None,
|
696
|
-
labels: bool = True,
|
697
|
-
legend_position: str = 'right',
|
698
|
-
render_ocr: bool = False) -> 'ElementCollection':
|
838
|
+
except Exception as e:
|
839
|
+
logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
|
840
|
+
return None
|
841
|
+
|
842
|
+
def save(
|
843
|
+
self,
|
844
|
+
filename: str,
|
845
|
+
scale: float = 2.0,
|
846
|
+
width: Optional[int] = None,
|
847
|
+
labels: bool = True,
|
848
|
+
legend_position: str = "right",
|
849
|
+
render_ocr: bool = False,
|
850
|
+
) -> "ElementCollection":
|
699
851
|
"""
|
700
852
|
Save the page with this collection's elements highlighted to an image file.
|
701
|
-
|
853
|
+
|
702
854
|
Args:
|
703
855
|
filename: Path to save the image to
|
704
856
|
scale: Scale factor for rendering
|
@@ -706,32 +858,34 @@ class ElementCollection(Generic[T]):
|
|
706
858
|
labels: Whether to include a legend for labels
|
707
859
|
legend_position: Position of the legend
|
708
860
|
render_ocr: Whether to render OCR text with white background boxes
|
709
|
-
|
861
|
+
|
710
862
|
Returns:
|
711
863
|
Self for method chaining
|
712
864
|
"""
|
713
865
|
# Use to_image to generate and save the image
|
714
866
|
self.to_image(
|
715
|
-
path=filename,
|
867
|
+
path=filename,
|
716
868
|
scale=scale,
|
717
869
|
width=width,
|
718
|
-
labels=labels,
|
870
|
+
labels=labels,
|
719
871
|
legend_position=legend_position,
|
720
|
-
render_ocr=render_ocr
|
872
|
+
render_ocr=render_ocr,
|
721
873
|
)
|
722
874
|
return self
|
723
|
-
|
724
|
-
def to_image(
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
875
|
+
|
876
|
+
def to_image(
|
877
|
+
self,
|
878
|
+
path: Optional[str] = None,
|
879
|
+
scale: float = 2.0,
|
880
|
+
width: Optional[int] = None,
|
881
|
+
labels: bool = True,
|
882
|
+
legend_position: str = "right",
|
883
|
+
render_ocr: bool = False,
|
884
|
+
) -> Optional["Image.Image"]:
|
731
885
|
"""
|
732
886
|
Generate an image of the page with this collection's elements highlighted,
|
733
887
|
optionally saving it to a file.
|
734
|
-
|
888
|
+
|
735
889
|
Args:
|
736
890
|
path: Optional path to save the image to
|
737
891
|
scale: Scale factor for rendering
|
@@ -739,21 +893,21 @@ class ElementCollection(Generic[T]):
|
|
739
893
|
labels: Whether to include a legend for labels
|
740
894
|
legend_position: Position of the legend
|
741
895
|
render_ocr: Whether to render OCR text with white background boxes
|
742
|
-
|
896
|
+
|
743
897
|
Returns:
|
744
898
|
PIL Image of the page with elements highlighted, or None if no valid page
|
745
899
|
"""
|
746
900
|
# Get the page from the first element (if available)
|
747
|
-
if self._elements and hasattr(self._elements[0],
|
901
|
+
if self._elements and hasattr(self._elements[0], "page"):
|
748
902
|
page = self._elements[0].page
|
749
903
|
# Generate the image using to_image
|
750
904
|
return page.to_image(
|
751
|
-
path=path,
|
905
|
+
path=path,
|
752
906
|
scale=scale,
|
753
907
|
width=width,
|
754
|
-
labels=labels,
|
908
|
+
labels=labels,
|
755
909
|
legend_position=legend_position,
|
756
|
-
render_ocr=render_ocr
|
910
|
+
render_ocr=render_ocr,
|
757
911
|
)
|
758
912
|
return None
|
759
913
|
|
@@ -763,7 +917,7 @@ class ElementCollection(Generic[T]):
|
|
763
917
|
for element in self._elements:
|
764
918
|
try:
|
765
919
|
group_key = getattr(element, group_by, None)
|
766
|
-
if group_key is None:
|
920
|
+
if group_key is None: # Handle elements missing the attribute
|
767
921
|
group_key = f"Missing '{group_by}'"
|
768
922
|
# Ensure group_key is hashable (convert list/dict if necessary)
|
769
923
|
if isinstance(group_key, (list, dict)):
|
@@ -773,13 +927,17 @@ class ElementCollection(Generic[T]):
|
|
773
927
|
grouped_elements[group_key] = []
|
774
928
|
grouped_elements[group_key].append(element)
|
775
929
|
except AttributeError:
|
776
|
-
logger.warning(
|
930
|
+
logger.warning(
|
931
|
+
f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
|
932
|
+
)
|
777
933
|
group_key = f"Error accessing '{group_by}'"
|
778
934
|
if group_key not in grouped_elements:
|
779
935
|
grouped_elements[group_key] = []
|
780
936
|
grouped_elements[group_key].append(element)
|
781
|
-
except TypeError:
|
782
|
-
logger.warning(
|
937
|
+
except TypeError: # Handle unhashable types
|
938
|
+
logger.warning(
|
939
|
+
f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
|
940
|
+
)
|
783
941
|
group_key = str(group_key)
|
784
942
|
if group_key not in grouped_elements:
|
785
943
|
grouped_elements[group_key] = []
|
@@ -787,48 +945,61 @@ class ElementCollection(Generic[T]):
|
|
787
945
|
|
788
946
|
return grouped_elements
|
789
947
|
|
790
|
-
def _format_group_label(
|
948
|
+
def _format_group_label(
|
949
|
+
self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
|
950
|
+
) -> str:
|
791
951
|
"""Formats the label for a group based on the key and format string."""
|
792
952
|
if label_format:
|
793
953
|
try:
|
794
954
|
element_attrs = sample_element.__dict__.copy()
|
795
|
-
element_attrs[group_by_attr] = group_key
|
955
|
+
element_attrs[group_by_attr] = group_key # Ensure key is present
|
796
956
|
return label_format.format(**element_attrs)
|
797
957
|
except KeyError as e:
|
798
|
-
logger.warning(
|
958
|
+
logger.warning(
|
959
|
+
f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
|
960
|
+
)
|
799
961
|
return str(group_key)
|
800
962
|
except Exception as format_e:
|
801
|
-
logger.warning(
|
963
|
+
logger.warning(
|
964
|
+
f"Error formatting label '{label_format}': {format_e}. Using group key as label."
|
965
|
+
)
|
802
966
|
return str(group_key)
|
803
967
|
else:
|
804
968
|
return str(group_key)
|
805
969
|
|
806
|
-
def _get_element_highlight_params(
|
970
|
+
def _get_element_highlight_params(
|
971
|
+
self, element: T, include_attrs: Optional[List[str]]
|
972
|
+
) -> Optional[Dict]:
|
807
973
|
"""Extracts common parameters needed for highlighting a single element."""
|
808
|
-
if not hasattr(element,
|
974
|
+
if not hasattr(element, "page"):
|
975
|
+
return None
|
809
976
|
page = element.page
|
810
977
|
|
811
978
|
base_data = {
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
979
|
+
"page_index": page.index,
|
980
|
+
"element": element,
|
981
|
+
"include_attrs": include_attrs,
|
982
|
+
"attributes_to_draw": {},
|
983
|
+
"bbox": None,
|
984
|
+
"polygon": None,
|
818
985
|
}
|
819
986
|
|
820
987
|
# Extract geometry
|
821
|
-
is_polygon = getattr(element,
|
988
|
+
is_polygon = getattr(element, "has_polygon", False)
|
822
989
|
geom_data = None
|
823
990
|
if is_polygon:
|
824
|
-
geom_data = getattr(element,
|
825
|
-
if geom_data:
|
991
|
+
geom_data = getattr(element, "polygon", None)
|
992
|
+
if geom_data:
|
993
|
+
base_data["polygon"] = geom_data
|
826
994
|
else:
|
827
|
-
geom_data = getattr(element,
|
828
|
-
if geom_data:
|
995
|
+
geom_data = getattr(element, "bbox", None)
|
996
|
+
if geom_data:
|
997
|
+
base_data["bbox"] = geom_data
|
829
998
|
|
830
999
|
if not geom_data:
|
831
|
-
logger.warning(
|
1000
|
+
logger.warning(
|
1001
|
+
f"Cannot prepare highlight, no bbox or polygon found for element: {element}"
|
1002
|
+
)
|
832
1003
|
return None
|
833
1004
|
|
834
1005
|
# Extract attributes if requested
|
@@ -837,13 +1008,15 @@ class ElementCollection(Generic[T]):
|
|
837
1008
|
try:
|
838
1009
|
attr_value = getattr(element, attr_name, None)
|
839
1010
|
if attr_value is not None:
|
840
|
-
base_data[
|
1011
|
+
base_data["attributes_to_draw"][attr_name] = attr_value
|
841
1012
|
except AttributeError:
|
842
|
-
logger.warning(
|
1013
|
+
logger.warning(
|
1014
|
+
f"Attribute '{attr_name}' not found on element {element} for include_attrs"
|
1015
|
+
)
|
843
1016
|
|
844
1017
|
return base_data
|
845
1018
|
|
846
|
-
def viewer(self, title: Optional[str] = None) -> Optional[
|
1019
|
+
def viewer(self, title: Optional[str] = None) -> Optional["widgets.DOMWidget"]:
|
847
1020
|
"""
|
848
1021
|
Creates and returns an interactive ipywidget showing ONLY the elements
|
849
1022
|
in this collection on their page background.
|
@@ -862,28 +1035,36 @@ class ElementCollection(Generic[T]):
|
|
862
1035
|
try:
|
863
1036
|
page = self.elements[0].page
|
864
1037
|
# Check if the page object actually has the method
|
865
|
-
if hasattr(page,
|
866
|
-
final_title =
|
1038
|
+
if hasattr(page, "viewer") and callable(page.viewer):
|
1039
|
+
final_title = (
|
1040
|
+
title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
|
1041
|
+
)
|
867
1042
|
# Call the page method, passing this collection's elements
|
868
1043
|
return page.viewer(
|
869
1044
|
elements_to_render=self.elements,
|
870
|
-
title=final_title
|
1045
|
+
title=final_title, # Pass title if Page method accepts it
|
871
1046
|
)
|
872
1047
|
else:
|
873
|
-
|
874
|
-
|
1048
|
+
logger.error("Page object is missing the 'viewer' method.")
|
1049
|
+
return None
|
875
1050
|
except AttributeError:
|
876
|
-
logger.error(
|
1051
|
+
logger.error(
|
1052
|
+
"Cannot generate interactive viewer: Elements in collection lack 'page' attribute."
|
1053
|
+
)
|
877
1054
|
return None
|
878
1055
|
except IndexError:
|
879
|
-
|
880
|
-
|
881
|
-
|
1056
|
+
# Should be caught by the empty check, but just in case
|
1057
|
+
logger.error(
|
1058
|
+
"Cannot generate interactive viewer: Collection unexpectedly became empty."
|
1059
|
+
)
|
1060
|
+
return None
|
882
1061
|
except Exception as e:
|
883
|
-
|
884
|
-
|
1062
|
+
logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
|
1063
|
+
return None
|
885
1064
|
|
886
|
-
def find_all(
|
1065
|
+
def find_all(
|
1066
|
+
self, selector: str, regex: bool = False, case: bool = True, **kwargs
|
1067
|
+
) -> "ElementCollection[T]":
|
887
1068
|
"""
|
888
1069
|
Filter elements within this collection matching the selector.
|
889
1070
|
|
@@ -903,21 +1084,21 @@ class ElementCollection(Generic[T]):
|
|
903
1084
|
selector_obj = parse_selector(selector)
|
904
1085
|
except Exception as e:
|
905
1086
|
logger.error(f"Error parsing selector '{selector}': {e}")
|
906
|
-
return ElementCollection([])
|
1087
|
+
return ElementCollection([]) # Return empty on parse error
|
907
1088
|
|
908
1089
|
# Pass regex and case flags to selector function generator
|
909
|
-
kwargs[
|
910
|
-
kwargs[
|
1090
|
+
kwargs["regex"] = regex
|
1091
|
+
kwargs["case"] = case
|
911
1092
|
|
912
1093
|
try:
|
913
1094
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
914
1095
|
except Exception as e:
|
915
1096
|
logger.error(f"Error creating filter function for selector '{selector}': {e}")
|
916
|
-
return ElementCollection([])
|
1097
|
+
return ElementCollection([]) # Return empty on filter creation error
|
917
1098
|
|
918
1099
|
matching_elements = [element for element in self._elements if filter_func(element)]
|
919
1100
|
|
920
|
-
# Note: Unlike Page.find_all, this doesn't re-sort.
|
1101
|
+
# Note: Unlike Page.find_all, this doesn't re-sort.
|
921
1102
|
# Sorting should be done explicitly on the collection if needed.
|
922
1103
|
|
923
1104
|
return ElementCollection(matching_elements)
|
@@ -938,96 +1119,134 @@ class ElementCollection(Generic[T]):
|
|
938
1119
|
results = self.find_all(selector, regex=regex, case=case, **kwargs)
|
939
1120
|
return results.first
|
940
1121
|
|
1122
|
+
def correct_ocr(
|
1123
|
+
self,
|
1124
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1125
|
+
) -> "ElementCollection":
|
1126
|
+
"""
|
1127
|
+
Applies corrections to OCR-generated text elements within this collection
|
1128
|
+
using a user-provided callback function.
|
1129
|
+
|
1130
|
+
Iterates through elements currently in the collection. If an element's
|
1131
|
+
'source' attribute starts with 'ocr', it calls the `correction_callback`
|
1132
|
+
for that element, passing the element itself.
|
1133
|
+
|
1134
|
+
The `correction_callback` should contain the logic to:
|
1135
|
+
1. Determine if the element needs correction.
|
1136
|
+
2. Perform the correction (e.g., call an LLM).
|
1137
|
+
3. Return the new text (`str`) or `None`.
|
1138
|
+
|
1139
|
+
If the callback returns a string, the element's `.text` is updated in place.
|
1140
|
+
Metadata updates (source, confidence, etc.) should happen within the callback.
|
1141
|
+
Elements without a source starting with 'ocr' are skipped.
|
1142
|
+
|
1143
|
+
Args:
|
1144
|
+
correction_callback: A function accepting an element and returning
|
1145
|
+
`Optional[str]` (new text or None).
|
1146
|
+
|
1147
|
+
Returns:
|
1148
|
+
Self for method chaining.
|
1149
|
+
"""
|
1150
|
+
# Delegate to the utility function
|
1151
|
+
_apply_ocr_correction_to_elements(
|
1152
|
+
elements=self._elements,
|
1153
|
+
correction_callback=correction_callback,
|
1154
|
+
caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
|
1155
|
+
)
|
1156
|
+
return self # Return self for chaining
|
1157
|
+
|
1158
|
+
|
941
1159
|
class PageCollection(Generic[P]):
|
942
1160
|
"""
|
943
1161
|
A collection of PDF pages with cross-page operations.
|
944
|
-
|
1162
|
+
|
945
1163
|
This class provides methods for working with multiple pages, such as finding
|
946
1164
|
elements across pages, extracting text from page ranges, and more.
|
947
1165
|
"""
|
948
|
-
|
1166
|
+
|
949
1167
|
def __init__(self, pages: List[P]):
|
950
1168
|
"""
|
951
1169
|
Initialize a page collection.
|
952
|
-
|
1170
|
+
|
953
1171
|
Args:
|
954
1172
|
pages: List of Page objects
|
955
1173
|
"""
|
956
1174
|
self.pages = pages
|
957
|
-
|
1175
|
+
|
958
1176
|
def __len__(self) -> int:
|
959
1177
|
"""Return the number of pages in the collection."""
|
960
1178
|
return len(self.pages)
|
961
|
-
|
962
|
-
def __getitem__(self, idx) -> Union[P,
|
1179
|
+
|
1180
|
+
def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
|
963
1181
|
"""Support indexing and slicing."""
|
964
1182
|
if isinstance(idx, slice):
|
965
1183
|
return PageCollection(self.pages[idx])
|
966
1184
|
return self.pages[idx]
|
967
|
-
|
1185
|
+
|
968
1186
|
def __iter__(self) -> Iterator[P]:
|
969
1187
|
"""Support iteration."""
|
970
1188
|
return iter(self.pages)
|
971
|
-
|
1189
|
+
|
972
1190
|
def __repr__(self) -> str:
|
973
1191
|
"""Return a string representation showing the page count."""
|
974
1192
|
return f"<PageCollection(count={len(self)})>"
|
975
|
-
|
1193
|
+
|
976
1194
|
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
|
977
1195
|
"""
|
978
1196
|
Extract text from all pages in the collection.
|
979
|
-
|
1197
|
+
|
980
1198
|
Args:
|
981
1199
|
keep_blank_chars: Whether to keep blank characters (default: True)
|
982
1200
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
983
1201
|
**kwargs: Additional extraction parameters
|
984
|
-
|
1202
|
+
|
985
1203
|
Returns:
|
986
1204
|
Combined text from all pages
|
987
1205
|
"""
|
988
1206
|
texts = []
|
989
1207
|
for page in self.pages:
|
990
1208
|
text = page.extract_text(
|
991
|
-
keep_blank_chars=keep_blank_chars,
|
992
|
-
apply_exclusions=apply_exclusions,
|
993
|
-
**kwargs
|
1209
|
+
keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
|
994
1210
|
)
|
995
1211
|
texts.append(text)
|
996
|
-
|
1212
|
+
|
997
1213
|
return "\n".join(texts)
|
998
1214
|
|
999
|
-
# --- NEW METHOD ---
|
1000
1215
|
def apply_ocr(
|
1001
1216
|
self,
|
1002
1217
|
engine: Optional[str] = None,
|
1003
|
-
|
1218
|
+
# --- Common OCR Parameters (Direct Arguments) ---
|
1004
1219
|
languages: Optional[List[str]] = None,
|
1005
|
-
min_confidence: Optional[float] = None,
|
1220
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
1006
1221
|
device: Optional[str] = None,
|
1007
|
-
|
1008
|
-
|
1222
|
+
resolution: Optional[int] = None, # DPI for rendering
|
1223
|
+
apply_exclusions: bool = True, # New parameter
|
1224
|
+
# --- Engine-Specific Options ---
|
1225
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
1226
|
+
) -> "PageCollection[P]":
|
1009
1227
|
"""
|
1010
1228
|
Applies OCR to all pages within this collection using batch processing.
|
1011
1229
|
|
1012
|
-
This delegates the work to the parent PDF object's `apply_ocr` method
|
1013
|
-
to the respective Page objects within this collection.
|
1230
|
+
This delegates the work to the parent PDF object's `apply_ocr` method.
|
1014
1231
|
|
1015
1232
|
Args:
|
1016
|
-
engine: Name of the engine (e.g., 'easyocr', 'paddleocr'
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1233
|
+
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
|
1234
|
+
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
|
1235
|
+
**Must be codes understood by the specific selected engine.**
|
1236
|
+
No mapping is performed.
|
1237
|
+
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
1238
|
+
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
1239
|
+
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
1240
|
+
apply_exclusions: If True (default), render page images for OCR with
|
1241
|
+
excluded areas masked (whited out). If False, OCR
|
1242
|
+
the raw page images without masking exclusions.
|
1243
|
+
options: An engine-specific options object (e.g., EasyOCROptions) or dict.
|
1023
1244
|
|
1024
1245
|
Returns:
|
1025
1246
|
Self for method chaining.
|
1026
1247
|
|
1027
1248
|
Raises:
|
1028
|
-
RuntimeError: If pages
|
1029
|
-
or if the parent PDF object lacks the required
|
1030
|
-
`apply_ocr` method.
|
1249
|
+
RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
|
1031
1250
|
(Propagates exceptions from PDF.apply_ocr)
|
1032
1251
|
"""
|
1033
1252
|
if not self.pages:
|
@@ -1036,43 +1255,43 @@ class PageCollection(Generic[P]):
|
|
1036
1255
|
|
1037
1256
|
# Assume all pages share the same parent PDF object
|
1038
1257
|
first_page = self.pages[0]
|
1039
|
-
if not hasattr(first_page,
|
1258
|
+
if not hasattr(first_page, "_parent") or not first_page._parent:
|
1040
1259
|
raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
|
1041
1260
|
|
1042
1261
|
parent_pdf = first_page._parent
|
1043
1262
|
|
1044
|
-
|
1045
|
-
|
1046
|
-
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
1263
|
+
if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
|
1264
|
+
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
1047
1265
|
|
1048
1266
|
# Get the 0-based indices of the pages in this collection
|
1049
1267
|
page_indices = [p.index for p in self.pages]
|
1050
1268
|
|
1051
1269
|
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
1052
1270
|
|
1053
|
-
# Delegate the batch call to the parent PDF object
|
1271
|
+
# Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
|
1054
1272
|
parent_pdf.apply_ocr(
|
1055
1273
|
pages=page_indices,
|
1056
1274
|
engine=engine,
|
1057
|
-
options=options,
|
1058
1275
|
languages=languages,
|
1059
|
-
min_confidence=min_confidence,
|
1060
|
-
device=device
|
1061
|
-
|
1276
|
+
min_confidence=min_confidence, # Pass the renamed parameter
|
1277
|
+
device=device,
|
1278
|
+
resolution=resolution,
|
1279
|
+
apply_exclusions=apply_exclusions, # Pass down
|
1280
|
+
options=options,
|
1062
1281
|
)
|
1063
1282
|
# The PDF method modifies the Page objects directly by adding elements.
|
1064
1283
|
|
1065
|
-
return self
|
1284
|
+
return self # Return self for chaining
|
1066
1285
|
|
1067
1286
|
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
|
1068
1287
|
"""
|
1069
1288
|
Find the first element matching the selector across all pages.
|
1070
|
-
|
1289
|
+
|
1071
1290
|
Args:
|
1072
1291
|
selector: CSS-like selector string
|
1073
1292
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
1074
1293
|
**kwargs: Additional filter parameters
|
1075
|
-
|
1294
|
+
|
1076
1295
|
Returns:
|
1077
1296
|
First matching element or None
|
1078
1297
|
"""
|
@@ -1081,16 +1300,16 @@ class PageCollection(Generic[P]):
|
|
1081
1300
|
if element:
|
1082
1301
|
return element
|
1083
1302
|
return None
|
1084
|
-
|
1303
|
+
|
1085
1304
|
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
|
1086
1305
|
"""
|
1087
1306
|
Find all elements matching the selector across all pages.
|
1088
|
-
|
1307
|
+
|
1089
1308
|
Args:
|
1090
1309
|
selector: CSS-like selector string
|
1091
1310
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
1092
1311
|
**kwargs: Additional filter parameters
|
1093
|
-
|
1312
|
+
|
1094
1313
|
Returns:
|
1095
1314
|
ElementCollection with matching elements from all pages
|
1096
1315
|
"""
|
@@ -1099,57 +1318,79 @@ class PageCollection(Generic[P]):
|
|
1099
1318
|
elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
1100
1319
|
if elements:
|
1101
1320
|
all_elements.extend(elements.elements)
|
1102
|
-
|
1321
|
+
|
1103
1322
|
return ElementCollection(all_elements)
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1323
|
+
|
1324
|
+
def correct_ocr(
|
1325
|
+
self,
|
1326
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1327
|
+
) -> "PageCollection[P]":
|
1328
|
+
"""
|
1329
|
+
Applies corrections to OCR-generated text elements across all pages
|
1330
|
+
in this collection using a user-provided callback function.
|
1331
|
+
|
1332
|
+
This method delegates to the parent PDF's `correct_ocr` method,
|
1333
|
+
targeting all pages within this collection.
|
1334
|
+
|
1335
|
+
Args:
|
1336
|
+
correction_callback: A function that accepts a single argument (an element
|
1337
|
+
object) and returns `Optional[str]` (new text or None).
|
1338
|
+
|
1339
|
+
Returns:
|
1340
|
+
A dictionary containing aggregate statistics for the process across all pages:
|
1341
|
+
{'elements_checked': total_checked, 'corrections_applied': total_applied}
|
1342
|
+
|
1343
|
+
Raises:
|
1344
|
+
RuntimeError: If the collection is empty, pages lack a parent PDF reference,
|
1345
|
+
or the parent PDF lacks the `correct_ocr` method.
|
1346
|
+
"""
|
1347
|
+
if not self.pages:
|
1348
|
+
logger.warning("Cannot correct OCR for an empty PageCollection.")
|
1349
|
+
|
1350
|
+
# Assume all pages share the same parent PDF object
|
1351
|
+
parent_pdf = self.pages[0]._parent
|
1352
|
+
|
1353
|
+
page_indices = [p.index for p in self.pages]
|
1354
|
+
logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
|
1355
|
+
|
1356
|
+
# Delegate the call to the parent PDF object for the relevant pages
|
1357
|
+
parent_pdf.correct_ocr(
|
1358
|
+
correction_callback=correction_callback,
|
1359
|
+
pages=page_indices
|
1360
|
+
)
|
1361
|
+
|
1362
|
+
return self
|
1363
|
+
|
1364
|
+
def get_sections(
|
1365
|
+
self,
|
1366
|
+
start_elements=None,
|
1367
|
+
end_elements=None,
|
1368
|
+
new_section_on_page_break=False,
|
1369
|
+
boundary_inclusion="both",
|
1370
|
+
) -> List["Region"]:
|
1130
1371
|
"""
|
1131
1372
|
Extract sections from a page collection based on start/end elements.
|
1132
|
-
|
1373
|
+
|
1133
1374
|
Args:
|
1134
1375
|
start_elements: Elements or selector string that mark the start of sections
|
1135
1376
|
end_elements: Elements or selector string that mark the end of sections
|
1136
1377
|
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
1137
1378
|
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
1138
|
-
|
1379
|
+
|
1139
1380
|
Returns:
|
1140
1381
|
List of Region objects representing the extracted sections
|
1141
1382
|
"""
|
1142
1383
|
# Find start and end elements across all pages
|
1143
1384
|
if isinstance(start_elements, str):
|
1144
1385
|
start_elements = self.find_all(start_elements).elements
|
1145
|
-
|
1386
|
+
|
1146
1387
|
if isinstance(end_elements, str):
|
1147
1388
|
end_elements = self.find_all(end_elements).elements
|
1148
|
-
|
1389
|
+
|
1149
1390
|
# If no start elements, return empty list
|
1150
1391
|
if not start_elements:
|
1151
1392
|
return []
|
1152
|
-
|
1393
|
+
|
1153
1394
|
# If there are page break boundaries, we'll need to add them
|
1154
1395
|
if new_section_on_page_break:
|
1155
1396
|
# For each page boundary, create virtual "end" and "start" elements
|
@@ -1159,183 +1400,200 @@ class PageCollection(Generic[P]):
|
|
1159
1400
|
# If end_elements is None, initialize it as an empty list
|
1160
1401
|
if end_elements is None:
|
1161
1402
|
end_elements = []
|
1162
|
-
|
1403
|
+
|
1163
1404
|
# Create a region at the bottom of the page as an artificial end marker
|
1164
1405
|
from natural_pdf.elements.region import Region
|
1406
|
+
|
1165
1407
|
bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
|
1166
1408
|
bottom_region.is_page_boundary = True # Mark it as a special boundary
|
1167
1409
|
end_elements.append(bottom_region)
|
1168
|
-
|
1410
|
+
|
1169
1411
|
# Add a virtual "start" element at the top of the next page
|
1170
1412
|
next_page = self.pages[i + 1]
|
1171
1413
|
top_region = Region(next_page, (0, 0, next_page.width, 1))
|
1172
1414
|
top_region.is_page_boundary = True # Mark it as a special boundary
|
1173
1415
|
start_elements.append(top_region)
|
1174
|
-
|
1416
|
+
|
1175
1417
|
# Get all elements from all pages and sort them in document order
|
1176
1418
|
all_elements = []
|
1177
1419
|
for page in self.pages:
|
1178
1420
|
elements = page.get_elements()
|
1179
1421
|
all_elements.extend(elements)
|
1180
|
-
|
1422
|
+
|
1181
1423
|
# Sort by page index, then vertical position, then horizontal position
|
1182
1424
|
all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
|
1183
|
-
|
1425
|
+
|
1184
1426
|
# Mark section boundaries
|
1185
1427
|
section_boundaries = []
|
1186
|
-
|
1428
|
+
|
1187
1429
|
# Add start element boundaries
|
1188
1430
|
for element in start_elements:
|
1189
1431
|
if element in all_elements:
|
1190
1432
|
idx = all_elements.index(element)
|
1191
|
-
section_boundaries.append(
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1433
|
+
section_boundaries.append(
|
1434
|
+
{
|
1435
|
+
"index": idx,
|
1436
|
+
"element": element,
|
1437
|
+
"type": "start",
|
1438
|
+
"page_idx": element.page.index,
|
1439
|
+
}
|
1440
|
+
)
|
1441
|
+
elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
|
1198
1442
|
# This is a virtual page boundary element
|
1199
|
-
section_boundaries.append(
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1443
|
+
section_boundaries.append(
|
1444
|
+
{
|
1445
|
+
"index": -1, # Special index for page boundaries
|
1446
|
+
"element": element,
|
1447
|
+
"type": "start",
|
1448
|
+
"page_idx": element.page.index,
|
1449
|
+
}
|
1450
|
+
)
|
1451
|
+
|
1206
1452
|
# Add end element boundaries if provided
|
1207
1453
|
if end_elements:
|
1208
1454
|
for element in end_elements:
|
1209
1455
|
if element in all_elements:
|
1210
1456
|
idx = all_elements.index(element)
|
1211
|
-
section_boundaries.append(
|
1212
|
-
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1216
|
-
|
1217
|
-
|
1457
|
+
section_boundaries.append(
|
1458
|
+
{
|
1459
|
+
"index": idx,
|
1460
|
+
"element": element,
|
1461
|
+
"type": "end",
|
1462
|
+
"page_idx": element.page.index,
|
1463
|
+
}
|
1464
|
+
)
|
1465
|
+
elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
|
1218
1466
|
# This is a virtual page boundary element
|
1219
|
-
section_boundaries.append(
|
1220
|
-
|
1221
|
-
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1225
|
-
|
1467
|
+
section_boundaries.append(
|
1468
|
+
{
|
1469
|
+
"index": -1, # Special index for page boundaries
|
1470
|
+
"element": element,
|
1471
|
+
"type": "end",
|
1472
|
+
"page_idx": element.page.index,
|
1473
|
+
}
|
1474
|
+
)
|
1475
|
+
|
1226
1476
|
# Sort boundaries by page index, then by actual document position
|
1227
|
-
section_boundaries.sort(
|
1228
|
-
|
1229
|
-
|
1230
|
-
|
1477
|
+
section_boundaries.sort(
|
1478
|
+
key=lambda x: (
|
1479
|
+
x["page_idx"],
|
1480
|
+
x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
|
1481
|
+
)
|
1482
|
+
)
|
1483
|
+
|
1231
1484
|
# Generate sections
|
1232
1485
|
sections = []
|
1233
1486
|
current_start = None
|
1234
|
-
|
1487
|
+
|
1235
1488
|
for i, boundary in enumerate(section_boundaries):
|
1236
1489
|
# If it's a start boundary and we don't have a current start
|
1237
|
-
if boundary[
|
1490
|
+
if boundary["type"] == "start" and current_start is None:
|
1238
1491
|
current_start = boundary
|
1239
|
-
|
1492
|
+
|
1240
1493
|
# If it's an end boundary and we have a current start
|
1241
|
-
elif boundary[
|
1494
|
+
elif boundary["type"] == "end" and current_start is not None:
|
1242
1495
|
# Create a section from current_start to this boundary
|
1243
|
-
start_element = current_start[
|
1244
|
-
end_element = boundary[
|
1245
|
-
|
1496
|
+
start_element = current_start["element"]
|
1497
|
+
end_element = boundary["element"]
|
1498
|
+
|
1246
1499
|
# If both elements are on the same page, use the page's get_section_between
|
1247
1500
|
if start_element.page == end_element.page:
|
1248
1501
|
section = start_element.page.get_section_between(
|
1249
|
-
start_element,
|
1250
|
-
end_element,
|
1251
|
-
boundary_inclusion
|
1502
|
+
start_element, end_element, boundary_inclusion
|
1252
1503
|
)
|
1253
1504
|
sections.append(section)
|
1254
1505
|
else:
|
1255
1506
|
# Create a multi-page section
|
1256
1507
|
from natural_pdf.elements.region import Region
|
1257
|
-
|
1508
|
+
|
1258
1509
|
# Get the start and end pages
|
1259
1510
|
start_page = start_element.page
|
1260
1511
|
end_page = end_element.page
|
1261
|
-
|
1512
|
+
|
1262
1513
|
# Create a combined region
|
1263
1514
|
combined_region = Region(
|
1264
|
-
start_page,
|
1265
|
-
(0, start_element.top, start_page.width, start_page.height)
|
1515
|
+
start_page, (0, start_element.top, start_page.width, start_page.height)
|
1266
1516
|
)
|
1267
1517
|
combined_region._spans_pages = True
|
1268
1518
|
combined_region._page_range = (start_page.index, end_page.index)
|
1269
1519
|
combined_region.start_element = start_element
|
1270
1520
|
combined_region.end_element = end_element
|
1271
|
-
|
1521
|
+
|
1272
1522
|
# Get all elements that fall within this multi-page region
|
1273
1523
|
combined_elements = []
|
1274
|
-
|
1524
|
+
|
1275
1525
|
# Get elements from the first page
|
1276
|
-
first_page_elements = [
|
1277
|
-
|
1526
|
+
first_page_elements = [
|
1527
|
+
e
|
1528
|
+
for e in all_elements
|
1529
|
+
if e.page == start_page and e.top >= start_element.top
|
1530
|
+
]
|
1278
1531
|
combined_elements.extend(first_page_elements)
|
1279
|
-
|
1532
|
+
|
1280
1533
|
# Get elements from middle pages (if any)
|
1281
1534
|
for page_idx in range(start_page.index + 1, end_page.index):
|
1282
1535
|
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
1283
1536
|
combined_elements.extend(middle_page_elements)
|
1284
|
-
|
1537
|
+
|
1285
1538
|
# Get elements from the last page
|
1286
|
-
last_page_elements = [
|
1287
|
-
|
1539
|
+
last_page_elements = [
|
1540
|
+
e
|
1541
|
+
for e in all_elements
|
1542
|
+
if e.page == end_page and e.bottom <= end_element.bottom
|
1543
|
+
]
|
1288
1544
|
combined_elements.extend(last_page_elements)
|
1289
|
-
|
1545
|
+
|
1290
1546
|
# Store the elements in the combined region
|
1291
1547
|
combined_region._multi_page_elements = combined_elements
|
1292
|
-
|
1548
|
+
|
1293
1549
|
sections.append(combined_region)
|
1294
|
-
|
1550
|
+
|
1295
1551
|
current_start = None
|
1296
|
-
|
1552
|
+
|
1297
1553
|
# If it's another start boundary and we have a current start (for splitting by starts only)
|
1298
|
-
elif boundary[
|
1554
|
+
elif boundary["type"] == "start" and current_start is not None and not end_elements:
|
1299
1555
|
# Create a section from current_start to just before this boundary
|
1300
|
-
start_element = current_start[
|
1301
|
-
|
1556
|
+
start_element = current_start["element"]
|
1557
|
+
|
1302
1558
|
# Find the last element before this boundary on the same page
|
1303
|
-
if start_element.page == boundary[
|
1559
|
+
if start_element.page == boundary["element"].page:
|
1304
1560
|
# Find elements on this page
|
1305
1561
|
page_elements = [e for e in all_elements if e.page == start_element.page]
|
1306
1562
|
# Sort by position
|
1307
1563
|
page_elements.sort(key=lambda e: (e.top, e.x0))
|
1308
|
-
|
1564
|
+
|
1309
1565
|
# Find the last element before the boundary
|
1310
|
-
end_idx =
|
1566
|
+
end_idx = (
|
1567
|
+
page_elements.index(boundary["element"]) - 1
|
1568
|
+
if boundary["element"] in page_elements
|
1569
|
+
else -1
|
1570
|
+
)
|
1311
1571
|
end_element = page_elements[end_idx] if end_idx >= 0 else None
|
1312
|
-
|
1572
|
+
|
1313
1573
|
# Create the section
|
1314
1574
|
section = start_element.page.get_section_between(
|
1315
|
-
start_element,
|
1316
|
-
end_element,
|
1317
|
-
boundary_inclusion
|
1575
|
+
start_element, end_element, boundary_inclusion
|
1318
1576
|
)
|
1319
1577
|
sections.append(section)
|
1320
1578
|
else:
|
1321
1579
|
# Cross-page section - create from current_start to the end of its page
|
1322
1580
|
from natural_pdf.elements.region import Region
|
1581
|
+
|
1323
1582
|
start_page = start_element.page
|
1324
|
-
|
1583
|
+
|
1325
1584
|
region = Region(
|
1326
|
-
start_page,
|
1327
|
-
(0, start_element.top, start_page.width, start_page.height)
|
1585
|
+
start_page, (0, start_element.top, start_page.width, start_page.height)
|
1328
1586
|
)
|
1329
1587
|
region.start_element = start_element
|
1330
1588
|
sections.append(region)
|
1331
|
-
|
1589
|
+
|
1332
1590
|
current_start = boundary
|
1333
|
-
|
1591
|
+
|
1334
1592
|
# Handle the last section if we have a current start
|
1335
1593
|
if current_start is not None:
|
1336
|
-
start_element = current_start[
|
1594
|
+
start_element = current_start["element"]
|
1337
1595
|
start_page = start_element.page
|
1338
|
-
|
1596
|
+
|
1339
1597
|
if end_elements:
|
1340
1598
|
# With end_elements, we need an explicit end - use the last element
|
1341
1599
|
# on the last page of the collection
|
@@ -1343,59 +1601,63 @@ class PageCollection(Generic[P]):
|
|
1343
1601
|
last_page_elements = [e for e in all_elements if e.page == last_page]
|
1344
1602
|
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
1345
1603
|
end_element = last_page_elements[-1] if last_page_elements else None
|
1346
|
-
|
1604
|
+
|
1347
1605
|
# Create a multi-page section
|
1348
1606
|
from natural_pdf.elements.region import Region
|
1349
|
-
|
1607
|
+
|
1350
1608
|
if start_page == last_page:
|
1351
1609
|
# Simple case - both on same page
|
1352
1610
|
section = start_page.get_section_between(
|
1353
|
-
start_element,
|
1354
|
-
end_element,
|
1355
|
-
boundary_inclusion
|
1611
|
+
start_element, end_element, boundary_inclusion
|
1356
1612
|
)
|
1357
1613
|
sections.append(section)
|
1358
1614
|
else:
|
1359
1615
|
# Create a multi-page section
|
1360
1616
|
combined_region = Region(
|
1361
|
-
start_page,
|
1362
|
-
(0, start_element.top, start_page.width, start_page.height)
|
1617
|
+
start_page, (0, start_element.top, start_page.width, start_page.height)
|
1363
1618
|
)
|
1364
1619
|
combined_region._spans_pages = True
|
1365
1620
|
combined_region._page_range = (start_page.index, last_page.index)
|
1366
1621
|
combined_region.start_element = start_element
|
1367
1622
|
combined_region.end_element = end_element
|
1368
|
-
|
1623
|
+
|
1369
1624
|
# Get all elements that fall within this multi-page region
|
1370
1625
|
combined_elements = []
|
1371
|
-
|
1626
|
+
|
1372
1627
|
# Get elements from the first page
|
1373
|
-
first_page_elements = [
|
1374
|
-
|
1628
|
+
first_page_elements = [
|
1629
|
+
e
|
1630
|
+
for e in all_elements
|
1631
|
+
if e.page == start_page and e.top >= start_element.top
|
1632
|
+
]
|
1375
1633
|
combined_elements.extend(first_page_elements)
|
1376
|
-
|
1634
|
+
|
1377
1635
|
# Get elements from middle pages (if any)
|
1378
1636
|
for page_idx in range(start_page.index + 1, last_page.index):
|
1379
1637
|
middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
|
1380
1638
|
combined_elements.extend(middle_page_elements)
|
1381
|
-
|
1639
|
+
|
1382
1640
|
# Get elements from the last page
|
1383
|
-
last_page_elements = [
|
1384
|
-
|
1641
|
+
last_page_elements = [
|
1642
|
+
e
|
1643
|
+
for e in all_elements
|
1644
|
+
if e.page == last_page
|
1645
|
+
and (end_element is None or e.bottom <= end_element.bottom)
|
1646
|
+
]
|
1385
1647
|
combined_elements.extend(last_page_elements)
|
1386
|
-
|
1648
|
+
|
1387
1649
|
# Store the elements in the combined region
|
1388
1650
|
combined_region._multi_page_elements = combined_elements
|
1389
|
-
|
1651
|
+
|
1390
1652
|
sections.append(combined_region)
|
1391
1653
|
else:
|
1392
1654
|
# With start_elements only, create a section to the end of the current page
|
1393
1655
|
from natural_pdf.elements.region import Region
|
1656
|
+
|
1394
1657
|
region = Region(
|
1395
|
-
start_page,
|
1396
|
-
(0, start_element.top, start_page.width, start_page.height)
|
1658
|
+
start_page, (0, start_element.top, start_page.width, start_page.height)
|
1397
1659
|
)
|
1398
1660
|
region.start_element = start_element
|
1399
1661
|
sections.append(region)
|
1400
|
-
|
1401
|
-
return sections
|
1662
|
+
|
1663
|
+
return sections
|