natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,24 @@
|
|
1
1
|
import logging
|
2
|
-
|
3
|
-
|
2
|
+
from typing import (
|
3
|
+
TYPE_CHECKING,
|
4
|
+
Any,
|
5
|
+
Callable,
|
6
|
+
Dict,
|
7
|
+
Generic,
|
8
|
+
Iterator,
|
9
|
+
List,
|
10
|
+
Optional,
|
11
|
+
Tuple,
|
12
|
+
TypeVar,
|
13
|
+
Union,
|
14
|
+
)
|
15
|
+
|
16
|
+
from pdfplumber.utils.geometry import objects_to_bbox
|
17
|
+
|
18
|
+
# New Imports
|
19
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
20
|
+
|
21
|
+
from natural_pdf.elements.text import TextElement # Needed for isinstance check
|
4
22
|
from natural_pdf.ocr import OCROptions
|
5
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
6
24
|
|
@@ -10,35 +28,36 @@ if TYPE_CHECKING:
|
|
10
28
|
from natural_pdf.core.page import Page
|
11
29
|
from natural_pdf.elements.region import Region
|
12
30
|
|
13
|
-
T = TypeVar(
|
14
|
-
P = TypeVar(
|
31
|
+
T = TypeVar("T")
|
32
|
+
P = TypeVar("P", bound="Page")
|
33
|
+
|
15
34
|
|
16
35
|
class ElementCollection(Generic[T]):
|
17
36
|
"""
|
18
37
|
Collection of PDF elements with batch operations.
|
19
38
|
"""
|
20
|
-
|
39
|
+
|
21
40
|
def __init__(self, elements: List[T]):
|
22
41
|
"""
|
23
42
|
Initialize a collection of elements.
|
24
|
-
|
43
|
+
|
25
44
|
Args:
|
26
45
|
elements: List of Element objects
|
27
46
|
"""
|
28
47
|
self._elements = elements or []
|
29
|
-
|
48
|
+
|
30
49
|
def __len__(self) -> int:
|
31
50
|
"""Get the number of elements in the collection."""
|
32
51
|
return len(self._elements)
|
33
|
-
|
34
|
-
def __getitem__(self, index: int) ->
|
52
|
+
|
53
|
+
def __getitem__(self, index: int) -> "Element":
|
35
54
|
"""Get an element by index."""
|
36
55
|
return self._elements[index]
|
37
|
-
|
56
|
+
|
38
57
|
def __iter__(self):
|
39
58
|
"""Iterate over elements."""
|
40
59
|
return iter(self._elements)
|
41
|
-
|
60
|
+
|
42
61
|
def __repr__(self) -> str:
|
43
62
|
"""Return a string representation showing the element count."""
|
44
63
|
element_type = "Mixed"
|
@@ -47,130 +66,130 @@ class ElementCollection(Generic[T]):
|
|
47
66
|
if len(types) == 1:
|
48
67
|
element_type = types.pop()
|
49
68
|
return f"<ElementCollection[{element_type}](count={len(self)})>"
|
50
|
-
|
69
|
+
|
51
70
|
@property
|
52
|
-
def elements(self) -> List[
|
71
|
+
def elements(self) -> List["Element"]:
|
53
72
|
"""Get the elements in this collection."""
|
54
73
|
return self._elements
|
55
|
-
|
74
|
+
|
56
75
|
@property
|
57
|
-
def first(self) -> Optional[
|
76
|
+
def first(self) -> Optional["Element"]:
|
58
77
|
"""Get the first element in the collection."""
|
59
78
|
return self._elements[0] if self._elements else None
|
60
|
-
|
79
|
+
|
61
80
|
@property
|
62
|
-
def last(self) -> Optional[
|
81
|
+
def last(self) -> Optional["Element"]:
|
63
82
|
"""Get the last element in the collection."""
|
64
83
|
return self._elements[-1] if self._elements else None
|
65
|
-
|
66
|
-
def highest(self) -> Optional[
|
84
|
+
|
85
|
+
def highest(self) -> Optional["Element"]:
|
67
86
|
"""
|
68
87
|
Get element with the smallest top y-coordinate (highest on page).
|
69
|
-
|
88
|
+
|
70
89
|
Raises:
|
71
90
|
ValueError: If elements are on multiple pages
|
72
|
-
|
91
|
+
|
73
92
|
Returns:
|
74
93
|
Element with smallest top value or None if empty
|
75
94
|
"""
|
76
95
|
if not self._elements:
|
77
96
|
return None
|
78
|
-
|
97
|
+
|
79
98
|
# Check if elements are on multiple pages
|
80
99
|
if self._are_on_multiple_pages():
|
81
100
|
raise ValueError("Cannot determine highest element across multiple pages")
|
82
|
-
|
101
|
+
|
83
102
|
return min(self._elements, key=lambda e: e.top)
|
84
|
-
|
85
|
-
def lowest(self) -> Optional[
|
103
|
+
|
104
|
+
def lowest(self) -> Optional["Element"]:
|
86
105
|
"""
|
87
106
|
Get element with the largest bottom y-coordinate (lowest on page).
|
88
|
-
|
107
|
+
|
89
108
|
Raises:
|
90
109
|
ValueError: If elements are on multiple pages
|
91
|
-
|
110
|
+
|
92
111
|
Returns:
|
93
112
|
Element with largest bottom value or None if empty
|
94
113
|
"""
|
95
114
|
if not self._elements:
|
96
115
|
return None
|
97
|
-
|
116
|
+
|
98
117
|
# Check if elements are on multiple pages
|
99
118
|
if self._are_on_multiple_pages():
|
100
119
|
raise ValueError("Cannot determine lowest element across multiple pages")
|
101
|
-
|
120
|
+
|
102
121
|
return max(self._elements, key=lambda e: e.bottom)
|
103
|
-
|
104
|
-
def leftmost(self) -> Optional[
|
122
|
+
|
123
|
+
def leftmost(self) -> Optional["Element"]:
|
105
124
|
"""
|
106
125
|
Get element with the smallest x0 coordinate (leftmost on page).
|
107
|
-
|
126
|
+
|
108
127
|
Raises:
|
109
128
|
ValueError: If elements are on multiple pages
|
110
|
-
|
129
|
+
|
111
130
|
Returns:
|
112
131
|
Element with smallest x0 value or None if empty
|
113
132
|
"""
|
114
133
|
if not self._elements:
|
115
134
|
return None
|
116
|
-
|
135
|
+
|
117
136
|
# Check if elements are on multiple pages
|
118
137
|
if self._are_on_multiple_pages():
|
119
138
|
raise ValueError("Cannot determine leftmost element across multiple pages")
|
120
|
-
|
139
|
+
|
121
140
|
return min(self._elements, key=lambda e: e.x0)
|
122
|
-
|
123
|
-
def rightmost(self) -> Optional[
|
141
|
+
|
142
|
+
def rightmost(self) -> Optional["Element"]:
|
124
143
|
"""
|
125
144
|
Get element with the largest x1 coordinate (rightmost on page).
|
126
|
-
|
145
|
+
|
127
146
|
Raises:
|
128
147
|
ValueError: If elements are on multiple pages
|
129
|
-
|
148
|
+
|
130
149
|
Returns:
|
131
150
|
Element with largest x1 value or None if empty
|
132
151
|
"""
|
133
152
|
if not self._elements:
|
134
153
|
return None
|
135
|
-
|
154
|
+
|
136
155
|
# Check if elements are on multiple pages
|
137
156
|
if self._are_on_multiple_pages():
|
138
157
|
raise ValueError("Cannot determine rightmost element across multiple pages")
|
139
|
-
|
158
|
+
|
140
159
|
return max(self._elements, key=lambda e: e.x1)
|
141
|
-
|
160
|
+
|
142
161
|
def _are_on_multiple_pages(self) -> bool:
|
143
162
|
"""
|
144
163
|
Check if elements in this collection span multiple pages.
|
145
|
-
|
164
|
+
|
146
165
|
Returns:
|
147
166
|
True if elements are on different pages, False otherwise
|
148
167
|
"""
|
149
168
|
if not self._elements:
|
150
169
|
return False
|
151
|
-
|
170
|
+
|
152
171
|
# Get the page index of the first element
|
153
|
-
if not hasattr(self._elements[0],
|
172
|
+
if not hasattr(self._elements[0], "page"):
|
154
173
|
return False
|
155
|
-
|
174
|
+
|
156
175
|
first_page_idx = self._elements[0].page.index
|
157
|
-
|
176
|
+
|
158
177
|
# Check if any element is on a different page
|
159
|
-
return any(hasattr(e,
|
160
|
-
|
161
|
-
def exclude_regions(self, regions: List[
|
178
|
+
return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
|
179
|
+
|
180
|
+
def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
|
162
181
|
"""
|
163
182
|
Remove elements that are within any of the specified regions.
|
164
|
-
|
183
|
+
|
165
184
|
Args:
|
166
185
|
regions: List of Region objects to exclude
|
167
|
-
|
186
|
+
|
168
187
|
Returns:
|
169
188
|
New ElementCollection with filtered elements
|
170
189
|
"""
|
171
190
|
if not regions:
|
172
191
|
return ElementCollection(self._elements)
|
173
|
-
|
192
|
+
|
174
193
|
filtered = []
|
175
194
|
for element in self._elements:
|
176
195
|
exclude = False
|
@@ -180,72 +199,156 @@ class ElementCollection(Generic[T]):
|
|
180
199
|
break
|
181
200
|
if not exclude:
|
182
201
|
filtered.append(element)
|
183
|
-
|
202
|
+
|
184
203
|
return ElementCollection(filtered)
|
185
|
-
|
204
|
+
|
186
205
|
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
|
187
206
|
"""
|
188
|
-
Extract text from all
|
189
|
-
|
207
|
+
Extract text from all TextElements in the collection, optionally using
|
208
|
+
pdfplumber's layout engine if layout=True is specified.
|
209
|
+
|
190
210
|
Args:
|
191
|
-
preserve_whitespace:
|
192
|
-
use_exclusions:
|
193
|
-
|
194
|
-
|
211
|
+
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
212
|
+
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
213
|
+
the collection or by filtering the collection itself.
|
214
|
+
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
215
|
+
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
216
|
+
See Page.extract_text docstring for common parameters.
|
217
|
+
If `layout=False` or omitted, performs a simple join.
|
218
|
+
|
195
219
|
Returns:
|
196
|
-
Combined text from
|
220
|
+
Combined text from elements, potentially with layout-based spacing.
|
197
221
|
"""
|
198
|
-
# Filter to just
|
199
|
-
text_elements = [
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
222
|
+
# Filter to just TextElements that likely have _char_dicts
|
223
|
+
text_elements = [
|
224
|
+
el
|
225
|
+
for el in self._elements
|
226
|
+
if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
|
227
|
+
]
|
228
|
+
|
229
|
+
if not text_elements:
|
230
|
+
return ""
|
231
|
+
|
232
|
+
# Collect all character dictionaries
|
233
|
+
all_char_dicts = []
|
234
|
+
for el in text_elements:
|
235
|
+
all_char_dicts.extend(getattr(el, "_char_dicts", []))
|
236
|
+
|
237
|
+
if not all_char_dicts:
|
238
|
+
# Handle case where elements exist but have no char dicts
|
239
|
+
logger.warning(
|
240
|
+
"ElementCollection.extract_text: No character dictionaries found in TextElements."
|
241
|
+
)
|
242
|
+
return " ".join(
|
243
|
+
getattr(el, "text", "") for el in text_elements
|
244
|
+
) # Fallback to simple join of word text
|
245
|
+
|
246
|
+
# Check if layout is requested
|
247
|
+
use_layout = kwargs.get("layout", False)
|
248
|
+
|
249
|
+
if use_layout:
|
250
|
+
logger.debug("ElementCollection.extract_text: Using layout=True path.")
|
251
|
+
# Layout requested: Use chars_to_textmap
|
252
|
+
|
253
|
+
# Prepare layout kwargs
|
254
|
+
layout_kwargs = {}
|
255
|
+
allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
|
256
|
+
for key, value in kwargs.items():
|
257
|
+
if key in allowed_keys:
|
258
|
+
layout_kwargs[key] = value
|
259
|
+
layout_kwargs["layout"] = True # Ensure layout is True
|
260
|
+
|
261
|
+
# Calculate overall bbox for the elements used
|
262
|
+
collection_bbox = objects_to_bbox(all_char_dicts)
|
263
|
+
coll_x0, coll_top, coll_x1, coll_bottom = collection_bbox
|
264
|
+
coll_width = coll_x1 - coll_x0
|
265
|
+
coll_height = coll_bottom - coll_top
|
266
|
+
|
267
|
+
# Set layout parameters based on collection bounds
|
268
|
+
# Warn if collection is sparse? TBD.
|
269
|
+
if "layout_bbox" not in layout_kwargs:
|
270
|
+
layout_kwargs["layout_bbox"] = collection_bbox
|
271
|
+
if "layout_width" not in layout_kwargs:
|
272
|
+
layout_kwargs["layout_width"] = coll_width
|
273
|
+
if "layout_height" not in layout_kwargs:
|
274
|
+
layout_kwargs["layout_height"] = coll_height
|
275
|
+
# Set shifts relative to the collection's top-left
|
276
|
+
if "x_shift" not in layout_kwargs:
|
277
|
+
layout_kwargs["x_shift"] = coll_x0
|
278
|
+
if "y_shift" not in layout_kwargs:
|
279
|
+
layout_kwargs["y_shift"] = coll_top
|
280
|
+
|
281
|
+
try:
|
282
|
+
# Sort chars by document order (page, top, x0)
|
283
|
+
# Need page info on char dicts for multi-page collections
|
284
|
+
# Assuming char dicts have 'page_number' from element creation
|
285
|
+
all_char_dicts.sort(
|
286
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
287
|
+
)
|
288
|
+
textmap = chars_to_textmap(all_char_dicts, **layout_kwargs)
|
289
|
+
result = textmap.as_string
|
290
|
+
except Exception as e:
|
291
|
+
logger.error(
|
292
|
+
f"ElementCollection: Error calling chars_to_textmap: {e}", exc_info=True
|
293
|
+
)
|
294
|
+
logger.warning(
|
295
|
+
"ElementCollection: Falling back to simple text join due to layout error."
|
296
|
+
)
|
297
|
+
# Fallback sorting and joining
|
298
|
+
all_char_dicts.sort(
|
299
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
300
|
+
)
|
301
|
+
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
302
|
+
|
303
|
+
else:
|
304
|
+
# Default: Simple join without layout
|
305
|
+
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
306
|
+
# Sort chars by document order (page, top, x0)
|
307
|
+
all_char_dicts.sort(
|
308
|
+
key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
|
309
|
+
)
|
310
|
+
# Simple join of character text
|
311
|
+
result = "".join(c.get("text", "") for c in all_char_dicts)
|
312
|
+
# Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
|
313
|
+
|
314
|
+
return result
|
315
|
+
|
316
|
+
def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
|
216
317
|
"""
|
217
318
|
Filter elements using a function.
|
218
|
-
|
319
|
+
|
219
320
|
Args:
|
220
321
|
func: Function that takes an element and returns True to keep it
|
221
|
-
|
322
|
+
|
222
323
|
Returns:
|
223
324
|
New ElementCollection with filtered elements
|
224
325
|
"""
|
225
326
|
return ElementCollection([e for e in self._elements if func(e)])
|
226
|
-
|
227
|
-
def sort(self, key=None, reverse=False) ->
|
327
|
+
|
328
|
+
def sort(self, key=None, reverse=False) -> "ElementCollection":
|
228
329
|
"""
|
229
330
|
Sort elements by the given key function.
|
230
|
-
|
331
|
+
|
231
332
|
Args:
|
232
333
|
key: Function to generate a key for sorting
|
233
334
|
reverse: Whether to sort in descending order
|
234
|
-
|
335
|
+
|
235
336
|
Returns:
|
236
337
|
Self for method chaining
|
237
338
|
"""
|
238
339
|
self._elements.sort(key=key, reverse=reverse)
|
239
340
|
return self
|
240
|
-
|
241
|
-
def highlight(
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
341
|
+
|
342
|
+
def highlight(
|
343
|
+
self,
|
344
|
+
label: Optional[str] = None,
|
345
|
+
color: Optional[Union[Tuple, str]] = None,
|
346
|
+
group_by: Optional[str] = None,
|
347
|
+
label_format: Optional[str] = None,
|
348
|
+
distinct: bool = False,
|
349
|
+
include_attrs: Optional[List[str]] = None,
|
350
|
+
replace: bool = False,
|
351
|
+
) -> "ElementCollection":
|
249
352
|
"""
|
250
353
|
Adds persistent highlights for all elements in the collection to the page
|
251
354
|
via the HighlightingService.
|
@@ -294,17 +397,17 @@ class ElementCollection(Generic[T]):
|
|
294
397
|
color=color,
|
295
398
|
group_by=group_by,
|
296
399
|
label_format=label_format,
|
297
|
-
include_attrs=include_attrs
|
400
|
+
include_attrs=include_attrs,
|
298
401
|
# 'replace' flag is handled during the add call below
|
299
402
|
)
|
300
403
|
|
301
404
|
# 2. Add prepared highlights to the persistent service
|
302
405
|
if not highlight_data_list:
|
303
|
-
return self
|
406
|
+
return self # Nothing to add
|
304
407
|
|
305
408
|
# Get page and highlighter from the first element (assume uniform page)
|
306
409
|
first_element = self._elements[0]
|
307
|
-
if not hasattr(first_element,
|
410
|
+
if not hasattr(first_element, "page") or not hasattr(first_element.page, "_highlighter"):
|
308
411
|
logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
|
309
412
|
return self
|
310
413
|
|
@@ -317,42 +420,48 @@ class ElementCollection(Generic[T]):
|
|
317
420
|
if replace:
|
318
421
|
# Identify all unique page indices in this operation
|
319
422
|
for data in highlight_data_list:
|
320
|
-
pages_to_clear.add(data[
|
423
|
+
pages_to_clear.add(data["page_index"])
|
321
424
|
# Clear those pages *before* adding new highlights
|
322
|
-
logger.debug(
|
425
|
+
logger.debug(
|
426
|
+
f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}"
|
427
|
+
)
|
323
428
|
for page_idx in pages_to_clear:
|
324
429
|
highlighter.clear_page(page_idx)
|
325
430
|
|
326
431
|
for data in highlight_data_list:
|
327
432
|
# Call the appropriate service add method
|
328
433
|
add_args = {
|
329
|
-
"page_index": data[
|
330
|
-
"color": data[
|
331
|
-
"label": data[
|
332
|
-
"use_color_cycling": data.get(
|
333
|
-
|
334
|
-
|
434
|
+
"page_index": data["page_index"],
|
435
|
+
"color": data["color"], # Color determined by _prepare
|
436
|
+
"label": data["label"], # Label determined by _prepare
|
437
|
+
"use_color_cycling": data.get(
|
438
|
+
"use_color_cycling", False
|
439
|
+
), # Set by _prepare if distinct
|
440
|
+
"element": data["element"],
|
441
|
+
"include_attrs": data["include_attrs"],
|
335
442
|
# Internal call to service always appends, as clearing was handled above
|
336
|
-
"existing":
|
443
|
+
"existing": "append",
|
337
444
|
}
|
338
|
-
if data.get(
|
339
|
-
add_args["polygon"] = data[
|
445
|
+
if data.get("polygon"):
|
446
|
+
add_args["polygon"] = data["polygon"]
|
340
447
|
highlighter.add_polygon(**add_args)
|
341
|
-
elif data.get(
|
342
|
-
add_args["bbox"] = data[
|
448
|
+
elif data.get("bbox"):
|
449
|
+
add_args["bbox"] = data["bbox"]
|
343
450
|
highlighter.add(**add_args)
|
344
451
|
else:
|
345
452
|
logger.warning(f"Skipping highlight data, no bbox or polygon found: {data}")
|
346
453
|
|
347
454
|
return self
|
348
455
|
|
349
|
-
def _prepare_highlight_data(
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
456
|
+
def _prepare_highlight_data(
|
457
|
+
self,
|
458
|
+
distinct: bool = False,
|
459
|
+
label: Optional[str] = None,
|
460
|
+
color: Optional[Union[Tuple, str]] = None,
|
461
|
+
group_by: Optional[str] = None,
|
462
|
+
label_format: Optional[str] = None,
|
463
|
+
include_attrs: Optional[List[str]] = None,
|
464
|
+
) -> List[Dict]:
|
356
465
|
"""
|
357
466
|
Determines the parameters for highlighting each element based on the strategy.
|
358
467
|
|
@@ -364,58 +473,64 @@ class ElementCollection(Generic[T]):
|
|
364
473
|
Color and label determination happens here.
|
365
474
|
"""
|
366
475
|
prepared_data = []
|
367
|
-
if not self._elements:
|
476
|
+
if not self._elements:
|
477
|
+
return prepared_data
|
368
478
|
|
369
479
|
# Need access to the HighlightingService to determine colors correctly.
|
370
480
|
highlighter = None
|
371
481
|
first_element = self._elements[0]
|
372
|
-
if hasattr(first_element,
|
482
|
+
if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
|
373
483
|
highlighter = first_element.page._highlighter
|
374
484
|
else:
|
375
|
-
logger.warning(
|
485
|
+
logger.warning(
|
486
|
+
"Cannot determine highlight colors: HighlightingService not accessible from elements."
|
487
|
+
)
|
376
488
|
return []
|
377
489
|
|
378
490
|
if distinct:
|
379
491
|
logger.debug("_prepare: Distinct highlighting strategy.")
|
380
492
|
for element in self._elements:
|
381
493
|
# Call the service's color determination logic
|
382
|
-
final_color = highlighter._determine_highlight_color(
|
494
|
+
final_color = highlighter._determine_highlight_color(
|
495
|
+
label=None, color_input=None, use_color_cycling=True
|
496
|
+
)
|
383
497
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
384
498
|
if element_data:
|
385
|
-
element_data.update(
|
386
|
-
|
387
|
-
|
388
|
-
'use_color_cycling': True
|
389
|
-
})
|
499
|
+
element_data.update(
|
500
|
+
{"color": final_color, "label": None, "use_color_cycling": True}
|
501
|
+
)
|
390
502
|
prepared_data.append(element_data)
|
391
503
|
|
392
504
|
elif label is not None:
|
393
505
|
logger.debug(f"_prepare: Explicit label '{label}' strategy.")
|
394
|
-
final_color = highlighter._determine_highlight_color(
|
506
|
+
final_color = highlighter._determine_highlight_color(
|
507
|
+
label=label, color_input=color, use_color_cycling=False
|
508
|
+
)
|
395
509
|
for element in self._elements:
|
396
510
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
397
511
|
if element_data:
|
398
|
-
element_data.update({
|
399
|
-
'color': final_color,
|
400
|
-
'label': label
|
401
|
-
})
|
512
|
+
element_data.update({"color": final_color, "label": label})
|
402
513
|
prepared_data.append(element_data)
|
403
514
|
|
404
515
|
elif group_by is not None:
|
405
516
|
logger.debug("_prepare: Grouping by attribute strategy.")
|
406
517
|
grouped_elements = self._group_elements_by_attr(group_by)
|
407
518
|
for group_key, group_elements in grouped_elements.items():
|
408
|
-
if not group_elements:
|
409
|
-
|
410
|
-
|
411
|
-
|
519
|
+
if not group_elements:
|
520
|
+
continue
|
521
|
+
group_label = self._format_group_label(
|
522
|
+
group_key, label_format, group_elements[0], group_by
|
523
|
+
)
|
524
|
+
final_color = highlighter._determine_highlight_color(
|
525
|
+
label=group_label, color_input=None, use_color_cycling=False
|
526
|
+
)
|
527
|
+
logger.debug(
|
528
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
529
|
+
)
|
412
530
|
for element in group_elements:
|
413
531
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
414
532
|
if element_data:
|
415
|
-
element_data.update({
|
416
|
-
'color': final_color,
|
417
|
-
'label': group_label
|
418
|
-
})
|
533
|
+
element_data.update({"color": final_color, "label": group_label})
|
419
534
|
prepared_data.append(element_data)
|
420
535
|
else:
|
421
536
|
logger.debug("_prepare: Default grouping strategy.")
|
@@ -423,15 +538,21 @@ class ElementCollection(Generic[T]):
|
|
423
538
|
|
424
539
|
if len(element_types) == 1:
|
425
540
|
type_name = element_types.pop()
|
426
|
-
base_name =
|
541
|
+
base_name = (
|
542
|
+
type_name.replace("Element", "").replace("Region", "")
|
543
|
+
if type_name != "Region"
|
544
|
+
else "Region"
|
545
|
+
)
|
427
546
|
auto_label = f"{base_name} Elements" if base_name else "Elements"
|
428
547
|
# Determine color *before* logging or using it
|
429
|
-
final_color = highlighter._determine_highlight_color(
|
548
|
+
final_color = highlighter._determine_highlight_color(
|
549
|
+
label=auto_label, color_input=color, use_color_cycling=False
|
550
|
+
)
|
430
551
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
431
552
|
for element in self._elements:
|
432
553
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
433
554
|
if element_data:
|
434
|
-
element_data.update({
|
555
|
+
element_data.update({"color": final_color, "label": auto_label})
|
435
556
|
prepared_data.append(element_data)
|
436
557
|
else:
|
437
558
|
# Mixed types: Generate generic label and warn
|
@@ -442,26 +563,33 @@ class ElementCollection(Generic[T]):
|
|
442
563
|
f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
|
443
564
|
f"or 'distinct=True' for more specific highlighting."
|
444
565
|
)
|
445
|
-
final_color = highlighter._determine_highlight_color(
|
566
|
+
final_color = highlighter._determine_highlight_color(
|
567
|
+
label=auto_label, color_input=color, use_color_cycling=False
|
568
|
+
)
|
446
569
|
# Determine color *before* logging or using it (already done above for this branch)
|
447
570
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
448
571
|
for element in self._elements:
|
449
572
|
element_data = self._get_element_highlight_params(element, include_attrs)
|
450
573
|
if element_data:
|
451
|
-
element_data.update({
|
574
|
+
element_data.update({"color": final_color, "label": auto_label})
|
452
575
|
prepared_data.append(element_data)
|
453
576
|
|
454
577
|
return prepared_data
|
455
578
|
|
456
|
-
def _call_element_highlighter(
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
579
|
+
def _call_element_highlighter(
|
580
|
+
self,
|
581
|
+
element: T,
|
582
|
+
color: Optional[Union[Tuple, str]],
|
583
|
+
label: Optional[str],
|
584
|
+
use_color_cycling: bool,
|
585
|
+
include_attrs: Optional[List[str]],
|
586
|
+
existing: str,
|
587
|
+
):
|
462
588
|
"""Low-level helper to call the appropriate HighlightingService method for an element."""
|
463
|
-
if not hasattr(element,
|
464
|
-
logger.warning(
|
589
|
+
if not hasattr(element, "page") or not hasattr(element.page, "_highlighter"):
|
590
|
+
logger.warning(
|
591
|
+
f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}"
|
592
|
+
)
|
465
593
|
return
|
466
594
|
|
467
595
|
page = element.page
|
@@ -472,59 +600,68 @@ class ElementCollection(Generic[T]):
|
|
472
600
|
"use_color_cycling": use_color_cycling,
|
473
601
|
"include_attrs": include_attrs,
|
474
602
|
"existing": existing,
|
475
|
-
"element": element
|
603
|
+
"element": element,
|
476
604
|
}
|
477
605
|
|
478
|
-
is_polygon = getattr(element,
|
606
|
+
is_polygon = getattr(element, "has_polygon", False)
|
479
607
|
geom_data = None
|
480
608
|
add_method = None
|
481
609
|
|
482
610
|
if is_polygon:
|
483
|
-
geom_data = getattr(element,
|
611
|
+
geom_data = getattr(element, "polygon", None)
|
484
612
|
if geom_data:
|
485
|
-
args_for_highlighter[
|
613
|
+
args_for_highlighter["polygon"] = geom_data
|
486
614
|
add_method = page._highlighter.add_polygon
|
487
615
|
else:
|
488
|
-
geom_data = getattr(element,
|
616
|
+
geom_data = getattr(element, "bbox", None)
|
489
617
|
if geom_data:
|
490
|
-
args_for_highlighter[
|
618
|
+
args_for_highlighter["bbox"] = geom_data
|
491
619
|
add_method = page._highlighter.add
|
492
620
|
|
493
621
|
if add_method and geom_data:
|
494
622
|
try:
|
495
623
|
add_method(**args_for_highlighter)
|
496
624
|
except Exception as e:
|
497
|
-
logger.error(
|
625
|
+
logger.error(
|
626
|
+
f"Error calling highlighter method for element {element} on page {page.index}: {e}",
|
627
|
+
exc_info=True,
|
628
|
+
)
|
498
629
|
elif not geom_data:
|
499
630
|
logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")
|
500
631
|
|
501
|
-
def _highlight_as_single_group(
|
502
|
-
|
503
|
-
|
504
|
-
|
632
|
+
def _highlight_as_single_group(
|
633
|
+
self,
|
634
|
+
label: str,
|
635
|
+
color: Optional[Union[Tuple, str]],
|
636
|
+
include_attrs: Optional[List[str]],
|
637
|
+
existing: str,
|
638
|
+
):
|
505
639
|
"""Highlights all elements with the same explicit label and color."""
|
506
640
|
for element in self._elements:
|
507
641
|
self._call_element_highlighter(
|
508
642
|
element=element,
|
509
|
-
color=color,
|
510
|
-
label=label,
|
511
|
-
use_color_cycling=False,
|
643
|
+
color=color, # Use explicit color if provided
|
644
|
+
label=label, # Use the explicit group label
|
645
|
+
use_color_cycling=False, # Use consistent color for the label
|
512
646
|
include_attrs=include_attrs,
|
513
|
-
existing=existing
|
647
|
+
existing=existing,
|
514
648
|
)
|
515
649
|
|
516
|
-
def _highlight_grouped_by_attribute(
|
517
|
-
|
518
|
-
|
519
|
-
|
650
|
+
def _highlight_grouped_by_attribute(
|
651
|
+
self,
|
652
|
+
group_by: str,
|
653
|
+
label_format: Optional[str],
|
654
|
+
include_attrs: Optional[List[str]],
|
655
|
+
existing: str,
|
656
|
+
):
|
520
657
|
"""Groups elements by attribute and highlights each group distinctly."""
|
521
658
|
grouped_elements: Dict[Any, List[T]] = {}
|
522
659
|
# Group elements by the specified attribute value
|
523
660
|
for element in self._elements:
|
524
661
|
try:
|
525
662
|
group_key = getattr(element, group_by, None)
|
526
|
-
if group_key is None:
|
527
|
-
|
663
|
+
if group_key is None: # Handle elements missing the attribute
|
664
|
+
group_key = f"Missing '{group_by}'"
|
528
665
|
# Ensure group_key is hashable (convert list/dict if necessary)
|
529
666
|
if isinstance(group_key, (list, dict)):
|
530
667
|
group_key = str(group_key)
|
@@ -533,41 +670,49 @@ class ElementCollection(Generic[T]):
|
|
533
670
|
grouped_elements[group_key] = []
|
534
671
|
grouped_elements[group_key].append(element)
|
535
672
|
except AttributeError:
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
673
|
+
logger.warning(
|
674
|
+
f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
|
675
|
+
)
|
676
|
+
group_key = f"Error accessing '{group_by}'"
|
677
|
+
if group_key not in grouped_elements:
|
678
|
+
grouped_elements[group_key] = []
|
679
|
+
grouped_elements[group_key].append(element)
|
680
|
+
except TypeError: # Handle unhashable types
|
681
|
+
logger.warning(
|
682
|
+
f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
|
683
|
+
)
|
684
|
+
group_key = str(group_key)
|
685
|
+
if group_key not in grouped_elements:
|
686
|
+
grouped_elements[group_key] = []
|
687
|
+
grouped_elements[group_key].append(element)
|
548
688
|
|
549
689
|
# Highlight each group
|
550
690
|
for group_key, group_elements in grouped_elements.items():
|
551
|
-
if not group_elements:
|
691
|
+
if not group_elements:
|
692
|
+
continue
|
552
693
|
|
553
694
|
# Determine the label for this group
|
554
|
-
first_element = group_elements[0]
|
695
|
+
first_element = group_elements[0] # Use first element for formatting
|
555
696
|
group_label = None
|
556
697
|
if label_format:
|
557
698
|
try:
|
558
699
|
# Create a dict of element attributes for formatting
|
559
|
-
element_attrs = first_element.__dict__.copy()
|
700
|
+
element_attrs = first_element.__dict__.copy() # Start with element's dict
|
560
701
|
# Ensure the group_by key itself is present correctly
|
561
702
|
element_attrs[group_by] = group_key
|
562
703
|
group_label = label_format.format(**element_attrs)
|
563
704
|
except KeyError as e:
|
564
|
-
logger.warning(
|
705
|
+
logger.warning(
|
706
|
+
f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
|
707
|
+
)
|
565
708
|
group_label = str(group_key)
|
566
709
|
except Exception as format_e:
|
567
|
-
logger.warning(
|
710
|
+
logger.warning(
|
711
|
+
f"Error formatting label '{label_format}': {format_e}. Using group key as label."
|
712
|
+
)
|
568
713
|
group_label = str(group_key)
|
569
714
|
else:
|
570
|
-
group_label = str(group_key)
|
715
|
+
group_label = str(group_key) # Use the attribute value as label
|
571
716
|
|
572
717
|
logger.debug(f" Highlighting group '{group_label}' ({len(group_elements)} elements)")
|
573
718
|
|
@@ -575,11 +720,11 @@ class ElementCollection(Generic[T]):
|
|
575
720
|
for element in group_elements:
|
576
721
|
self._call_element_highlighter(
|
577
722
|
element=element,
|
578
|
-
color=None,
|
579
|
-
label=group_label,
|
580
|
-
use_color_cycling=False,
|
723
|
+
color=None, # Let ColorManager choose based on label
|
724
|
+
label=group_label, # Use the derived group label
|
725
|
+
use_color_cycling=False, # Use consistent color for the label
|
581
726
|
include_attrs=include_attrs,
|
582
|
-
existing=existing
|
727
|
+
existing=existing,
|
583
728
|
)
|
584
729
|
|
585
730
|
def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
|
@@ -589,116 +734,122 @@ class ElementCollection(Generic[T]):
|
|
589
734
|
for element in self._elements:
|
590
735
|
self._call_element_highlighter(
|
591
736
|
element=element,
|
592
|
-
color=None,
|
593
|
-
label=None,
|
594
|
-
use_color_cycling=True,
|
737
|
+
color=None, # Let ColorManager cycle
|
738
|
+
label=None, # No label for distinct elements
|
739
|
+
use_color_cycling=True, # Force cycling
|
595
740
|
include_attrs=include_attrs,
|
596
|
-
existing=existing
|
741
|
+
existing=existing,
|
597
742
|
)
|
598
|
-
|
599
|
-
def show(
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
743
|
+
|
744
|
+
def show(
|
745
|
+
self,
|
746
|
+
# --- Visualization Parameters ---
|
747
|
+
group_by: Optional[str] = None,
|
748
|
+
label: Optional[str] = None,
|
749
|
+
color: Optional[Union[Tuple, str]] = None,
|
750
|
+
label_format: Optional[str] = None,
|
751
|
+
distinct: bool = False,
|
752
|
+
include_attrs: Optional[List[str]] = None,
|
753
|
+
# --- Rendering Parameters ---
|
754
|
+
scale: float = 2.0,
|
755
|
+
labels: bool = True, # Use 'labels' consistent with service
|
756
|
+
legend_position: str = "right",
|
757
|
+
render_ocr: bool = False,
|
758
|
+
) -> Optional["Image.Image"]:
|
759
|
+
"""
|
760
|
+
Generates a temporary preview image highlighting elements in this collection
|
761
|
+
on their page, ignoring any persistent highlights.
|
762
|
+
|
763
|
+
Currently only supports collections where all elements are on the same page.
|
764
|
+
|
765
|
+
Allows grouping and coloring elements based on attributes, similar to the
|
766
|
+
persistent `highlight()` method, but only for this temporary view.
|
767
|
+
|
768
|
+
Args:
|
769
|
+
group_by: Attribute name to group elements by for distinct colors/labels.
|
770
|
+
label: Explicit label for all elements (overrides group_by).
|
771
|
+
color: Explicit color for all elements (if label used) or base color.
|
772
|
+
label_format: F-string to format group labels if group_by is used.
|
773
|
+
distinct: Highlight each element distinctly (overrides group_by/label).
|
774
|
+
include_attrs: Attributes to display on individual highlights.
|
775
|
+
scale: Scale factor for rendering image.
|
776
|
+
labels: Whether to include a legend for the temporary highlights.
|
777
|
+
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
778
|
+
render_ocr: Whether to render OCR text.
|
779
|
+
|
780
|
+
Returns:
|
781
|
+
PIL Image object of the temporary preview, or None if rendering fails or
|
782
|
+
elements span multiple pages.
|
783
|
+
|
784
|
+
Raises:
|
785
|
+
ValueError: If the collection is empty or elements are on different pages.
|
786
|
+
"""
|
787
|
+
if not self._elements:
|
788
|
+
raise ValueError("Cannot show an empty collection.")
|
789
|
+
|
790
|
+
# Check if elements are on multiple pages
|
791
|
+
if self._are_on_multiple_pages():
|
792
|
+
raise ValueError(
|
793
|
+
"show() currently only supports collections where all elements are on the same page."
|
794
|
+
)
|
795
|
+
|
796
|
+
# Get the page and highlighting service from the first element
|
797
|
+
first_element = self._elements[0]
|
798
|
+
if not hasattr(first_element, "page") or not first_element.page:
|
799
|
+
logger.warning("Cannot show collection: First element has no associated page.")
|
800
|
+
return None
|
801
|
+
page = first_element.page
|
802
|
+
if not hasattr(page, "pdf") or not page.pdf:
|
803
|
+
logger.warning("Cannot show collection: Page has no associated PDF object.")
|
804
|
+
return None
|
805
|
+
|
806
|
+
service = page._highlighter
|
807
|
+
if not service:
|
808
|
+
logger.warning("Cannot show collection: PDF object has no highlighting service.")
|
809
|
+
return None
|
810
|
+
|
811
|
+
# 1. Prepare temporary highlight data based on grouping parameters
|
812
|
+
# This returns a list of dicts, suitable for render_preview
|
813
|
+
highlight_data_list = self._prepare_highlight_data(
|
814
|
+
distinct=distinct,
|
815
|
+
label=label,
|
816
|
+
color=color,
|
817
|
+
group_by=group_by,
|
818
|
+
label_format=label_format,
|
819
|
+
include_attrs=include_attrs,
|
820
|
+
)
|
821
|
+
|
822
|
+
if not highlight_data_list:
|
823
|
+
logger.warning("No highlight data generated for show(). Rendering clean page.")
|
824
|
+
# Render the page without any temporary highlights
|
825
|
+
highlight_data_list = []
|
826
|
+
|
827
|
+
# 2. Call render_preview on the HighlightingService
|
828
|
+
try:
|
829
|
+
return service.render_preview(
|
830
|
+
page_index=page.index,
|
831
|
+
temporary_highlights=highlight_data_list,
|
832
|
+
scale=scale,
|
833
|
+
labels=labels, # Use 'labels'
|
834
|
+
legend_position=legend_position,
|
835
|
+
render_ocr=render_ocr,
|
836
|
+
)
|
837
|
+
except Exception as e:
|
838
|
+
logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
|
839
|
+
return None
|
840
|
+
|
841
|
+
def save(
|
842
|
+
self,
|
843
|
+
filename: str,
|
844
|
+
scale: float = 2.0,
|
845
|
+
width: Optional[int] = None,
|
846
|
+
labels: bool = True,
|
847
|
+
legend_position: str = "right",
|
848
|
+
render_ocr: bool = False,
|
849
|
+
) -> "ElementCollection":
|
699
850
|
"""
|
700
851
|
Save the page with this collection's elements highlighted to an image file.
|
701
|
-
|
852
|
+
|
702
853
|
Args:
|
703
854
|
filename: Path to save the image to
|
704
855
|
scale: Scale factor for rendering
|
@@ -706,32 +857,34 @@ class ElementCollection(Generic[T]):
|
|
706
857
|
labels: Whether to include a legend for labels
|
707
858
|
legend_position: Position of the legend
|
708
859
|
render_ocr: Whether to render OCR text with white background boxes
|
709
|
-
|
860
|
+
|
710
861
|
Returns:
|
711
862
|
Self for method chaining
|
712
863
|
"""
|
713
864
|
# Use to_image to generate and save the image
|
714
865
|
self.to_image(
|
715
|
-
path=filename,
|
866
|
+
path=filename,
|
716
867
|
scale=scale,
|
717
868
|
width=width,
|
718
|
-
labels=labels,
|
869
|
+
labels=labels,
|
719
870
|
legend_position=legend_position,
|
720
|
-
render_ocr=render_ocr
|
871
|
+
render_ocr=render_ocr,
|
721
872
|
)
|
722
873
|
return self
|
723
|
-
|
724
|
-
def to_image(
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
874
|
+
|
875
|
+
def to_image(
|
876
|
+
self,
|
877
|
+
path: Optional[str] = None,
|
878
|
+
scale: float = 2.0,
|
879
|
+
width: Optional[int] = None,
|
880
|
+
labels: bool = True,
|
881
|
+
legend_position: str = "right",
|
882
|
+
render_ocr: bool = False,
|
883
|
+
) -> Optional["Image.Image"]:
|
731
884
|
"""
|
732
885
|
Generate an image of the page with this collection's elements highlighted,
|
733
886
|
optionally saving it to a file.
|
734
|
-
|
887
|
+
|
735
888
|
Args:
|
736
889
|
path: Optional path to save the image to
|
737
890
|
scale: Scale factor for rendering
|
@@ -739,21 +892,21 @@ class ElementCollection(Generic[T]):
|
|
739
892
|
labels: Whether to include a legend for labels
|
740
893
|
legend_position: Position of the legend
|
741
894
|
render_ocr: Whether to render OCR text with white background boxes
|
742
|
-
|
895
|
+
|
743
896
|
Returns:
|
744
897
|
PIL Image of the page with elements highlighted, or None if no valid page
|
745
898
|
"""
|
746
899
|
# Get the page from the first element (if available)
|
747
|
-
if self._elements and hasattr(self._elements[0],
|
900
|
+
if self._elements and hasattr(self._elements[0], "page"):
|
748
901
|
page = self._elements[0].page
|
749
902
|
# Generate the image using to_image
|
750
903
|
return page.to_image(
|
751
|
-
path=path,
|
904
|
+
path=path,
|
752
905
|
scale=scale,
|
753
906
|
width=width,
|
754
|
-
labels=labels,
|
907
|
+
labels=labels,
|
755
908
|
legend_position=legend_position,
|
756
|
-
render_ocr=render_ocr
|
909
|
+
render_ocr=render_ocr,
|
757
910
|
)
|
758
911
|
return None
|
759
912
|
|
@@ -763,7 +916,7 @@ class ElementCollection(Generic[T]):
|
|
763
916
|
for element in self._elements:
|
764
917
|
try:
|
765
918
|
group_key = getattr(element, group_by, None)
|
766
|
-
if group_key is None:
|
919
|
+
if group_key is None: # Handle elements missing the attribute
|
767
920
|
group_key = f"Missing '{group_by}'"
|
768
921
|
# Ensure group_key is hashable (convert list/dict if necessary)
|
769
922
|
if isinstance(group_key, (list, dict)):
|
@@ -773,13 +926,17 @@ class ElementCollection(Generic[T]):
|
|
773
926
|
grouped_elements[group_key] = []
|
774
927
|
grouped_elements[group_key].append(element)
|
775
928
|
except AttributeError:
|
776
|
-
logger.warning(
|
929
|
+
logger.warning(
|
930
|
+
f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
|
931
|
+
)
|
777
932
|
group_key = f"Error accessing '{group_by}'"
|
778
933
|
if group_key not in grouped_elements:
|
779
934
|
grouped_elements[group_key] = []
|
780
935
|
grouped_elements[group_key].append(element)
|
781
|
-
except TypeError:
|
782
|
-
logger.warning(
|
936
|
+
except TypeError: # Handle unhashable types
|
937
|
+
logger.warning(
|
938
|
+
f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
|
939
|
+
)
|
783
940
|
group_key = str(group_key)
|
784
941
|
if group_key not in grouped_elements:
|
785
942
|
grouped_elements[group_key] = []
|
@@ -787,48 +944,61 @@ class ElementCollection(Generic[T]):
|
|
787
944
|
|
788
945
|
return grouped_elements
|
789
946
|
|
790
|
-
def _format_group_label(
|
947
|
+
def _format_group_label(
|
948
|
+
self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
|
949
|
+
) -> str:
|
791
950
|
"""Formats the label for a group based on the key and format string."""
|
792
951
|
if label_format:
|
793
952
|
try:
|
794
953
|
element_attrs = sample_element.__dict__.copy()
|
795
|
-
element_attrs[group_by_attr] = group_key
|
954
|
+
element_attrs[group_by_attr] = group_key # Ensure key is present
|
796
955
|
return label_format.format(**element_attrs)
|
797
956
|
except KeyError as e:
|
798
|
-
logger.warning(
|
957
|
+
logger.warning(
|
958
|
+
f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
|
959
|
+
)
|
799
960
|
return str(group_key)
|
800
961
|
except Exception as format_e:
|
801
|
-
logger.warning(
|
962
|
+
logger.warning(
|
963
|
+
f"Error formatting label '{label_format}': {format_e}. Using group key as label."
|
964
|
+
)
|
802
965
|
return str(group_key)
|
803
966
|
else:
|
804
967
|
return str(group_key)
|
805
968
|
|
806
|
-
def _get_element_highlight_params(
|
969
|
+
def _get_element_highlight_params(
|
970
|
+
self, element: T, include_attrs: Optional[List[str]]
|
971
|
+
) -> Optional[Dict]:
|
807
972
|
"""Extracts common parameters needed for highlighting a single element."""
|
808
|
-
if not hasattr(element,
|
973
|
+
if not hasattr(element, "page"):
|
974
|
+
return None
|
809
975
|
page = element.page
|
810
976
|
|
811
977
|
base_data = {
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
978
|
+
"page_index": page.index,
|
979
|
+
"element": element,
|
980
|
+
"include_attrs": include_attrs,
|
981
|
+
"attributes_to_draw": {},
|
982
|
+
"bbox": None,
|
983
|
+
"polygon": None,
|
818
984
|
}
|
819
985
|
|
820
986
|
# Extract geometry
|
821
|
-
is_polygon = getattr(element,
|
987
|
+
is_polygon = getattr(element, "has_polygon", False)
|
822
988
|
geom_data = None
|
823
989
|
if is_polygon:
|
824
|
-
geom_data = getattr(element,
|
825
|
-
if geom_data:
|
990
|
+
geom_data = getattr(element, "polygon", None)
|
991
|
+
if geom_data:
|
992
|
+
base_data["polygon"] = geom_data
|
826
993
|
else:
|
827
|
-
geom_data = getattr(element,
|
828
|
-
if geom_data:
|
994
|
+
geom_data = getattr(element, "bbox", None)
|
995
|
+
if geom_data:
|
996
|
+
base_data["bbox"] = geom_data
|
829
997
|
|
830
998
|
if not geom_data:
|
831
|
-
logger.warning(
|
999
|
+
logger.warning(
|
1000
|
+
f"Cannot prepare highlight, no bbox or polygon found for element: {element}"
|
1001
|
+
)
|
832
1002
|
return None
|
833
1003
|
|
834
1004
|
# Extract attributes if requested
|
@@ -837,13 +1007,15 @@ class ElementCollection(Generic[T]):
|
|
837
1007
|
try:
|
838
1008
|
attr_value = getattr(element, attr_name, None)
|
839
1009
|
if attr_value is not None:
|
840
|
-
base_data[
|
1010
|
+
base_data["attributes_to_draw"][attr_name] = attr_value
|
841
1011
|
except AttributeError:
|
842
|
-
logger.warning(
|
1012
|
+
logger.warning(
|
1013
|
+
f"Attribute '{attr_name}' not found on element {element} for include_attrs"
|
1014
|
+
)
|
843
1015
|
|
844
1016
|
return base_data
|
845
1017
|
|
846
|
-
def viewer(self, title: Optional[str] = None) -> Optional[
|
1018
|
+
def viewer(self, title: Optional[str] = None) -> Optional["widgets.DOMWidget"]:
|
847
1019
|
"""
|
848
1020
|
Creates and returns an interactive ipywidget showing ONLY the elements
|
849
1021
|
in this collection on their page background.
|
@@ -862,28 +1034,36 @@ class ElementCollection(Generic[T]):
|
|
862
1034
|
try:
|
863
1035
|
page = self.elements[0].page
|
864
1036
|
# Check if the page object actually has the method
|
865
|
-
if hasattr(page,
|
866
|
-
final_title =
|
1037
|
+
if hasattr(page, "viewer") and callable(page.viewer):
|
1038
|
+
final_title = (
|
1039
|
+
title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
|
1040
|
+
)
|
867
1041
|
# Call the page method, passing this collection's elements
|
868
1042
|
return page.viewer(
|
869
1043
|
elements_to_render=self.elements,
|
870
|
-
title=final_title
|
1044
|
+
title=final_title, # Pass title if Page method accepts it
|
871
1045
|
)
|
872
1046
|
else:
|
873
|
-
|
874
|
-
|
1047
|
+
logger.error("Page object is missing the 'viewer' method.")
|
1048
|
+
return None
|
875
1049
|
except AttributeError:
|
876
|
-
logger.error(
|
1050
|
+
logger.error(
|
1051
|
+
"Cannot generate interactive viewer: Elements in collection lack 'page' attribute."
|
1052
|
+
)
|
877
1053
|
return None
|
878
1054
|
except IndexError:
|
879
|
-
|
880
|
-
|
881
|
-
|
1055
|
+
# Should be caught by the empty check, but just in case
|
1056
|
+
logger.error(
|
1057
|
+
"Cannot generate interactive viewer: Collection unexpectedly became empty."
|
1058
|
+
)
|
1059
|
+
return None
|
882
1060
|
except Exception as e:
|
883
|
-
|
884
|
-
|
1061
|
+
logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
|
1062
|
+
return None
|
885
1063
|
|
886
|
-
def find_all(
|
1064
|
+
def find_all(
|
1065
|
+
self, selector: str, regex: bool = False, case: bool = True, **kwargs
|
1066
|
+
) -> "ElementCollection[T]":
|
887
1067
|
"""
|
888
1068
|
Filter elements within this collection matching the selector.
|
889
1069
|
|
@@ -903,21 +1083,21 @@ class ElementCollection(Generic[T]):
|
|
903
1083
|
selector_obj = parse_selector(selector)
|
904
1084
|
except Exception as e:
|
905
1085
|
logger.error(f"Error parsing selector '{selector}': {e}")
|
906
|
-
return ElementCollection([])
|
1086
|
+
return ElementCollection([]) # Return empty on parse error
|
907
1087
|
|
908
1088
|
# Pass regex and case flags to selector function generator
|
909
|
-
kwargs[
|
910
|
-
kwargs[
|
1089
|
+
kwargs["regex"] = regex
|
1090
|
+
kwargs["case"] = case
|
911
1091
|
|
912
1092
|
try:
|
913
1093
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
914
1094
|
except Exception as e:
|
915
1095
|
logger.error(f"Error creating filter function for selector '{selector}': {e}")
|
916
|
-
return ElementCollection([])
|
1096
|
+
return ElementCollection([]) # Return empty on filter creation error
|
917
1097
|
|
918
1098
|
matching_elements = [element for element in self._elements if filter_func(element)]
|
919
1099
|
|
920
|
-
# Note: Unlike Page.find_all, this doesn't re-sort.
|
1100
|
+
# Note: Unlike Page.find_all, this doesn't re-sort.
|
921
1101
|
# Sorting should be done explicitly on the collection if needed.
|
922
1102
|
|
923
1103
|
return ElementCollection(matching_elements)
|
@@ -938,65 +1118,63 @@ class ElementCollection(Generic[T]):
|
|
938
1118
|
results = self.find_all(selector, regex=regex, case=case, **kwargs)
|
939
1119
|
return results.first
|
940
1120
|
|
1121
|
+
|
941
1122
|
class PageCollection(Generic[P]):
|
942
1123
|
"""
|
943
1124
|
A collection of PDF pages with cross-page operations.
|
944
|
-
|
1125
|
+
|
945
1126
|
This class provides methods for working with multiple pages, such as finding
|
946
1127
|
elements across pages, extracting text from page ranges, and more.
|
947
1128
|
"""
|
948
|
-
|
1129
|
+
|
949
1130
|
def __init__(self, pages: List[P]):
|
950
1131
|
"""
|
951
1132
|
Initialize a page collection.
|
952
|
-
|
1133
|
+
|
953
1134
|
Args:
|
954
1135
|
pages: List of Page objects
|
955
1136
|
"""
|
956
1137
|
self.pages = pages
|
957
|
-
|
1138
|
+
|
958
1139
|
def __len__(self) -> int:
|
959
1140
|
"""Return the number of pages in the collection."""
|
960
1141
|
return len(self.pages)
|
961
|
-
|
962
|
-
def __getitem__(self, idx) -> Union[P,
|
1142
|
+
|
1143
|
+
def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
|
963
1144
|
"""Support indexing and slicing."""
|
964
1145
|
if isinstance(idx, slice):
|
965
1146
|
return PageCollection(self.pages[idx])
|
966
1147
|
return self.pages[idx]
|
967
|
-
|
1148
|
+
|
968
1149
|
def __iter__(self) -> Iterator[P]:
|
969
1150
|
"""Support iteration."""
|
970
1151
|
return iter(self.pages)
|
971
|
-
|
1152
|
+
|
972
1153
|
def __repr__(self) -> str:
|
973
1154
|
"""Return a string representation showing the page count."""
|
974
1155
|
return f"<PageCollection(count={len(self)})>"
|
975
|
-
|
1156
|
+
|
976
1157
|
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
|
977
1158
|
"""
|
978
1159
|
Extract text from all pages in the collection.
|
979
|
-
|
1160
|
+
|
980
1161
|
Args:
|
981
1162
|
keep_blank_chars: Whether to keep blank characters (default: True)
|
982
1163
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
983
1164
|
**kwargs: Additional extraction parameters
|
984
|
-
|
1165
|
+
|
985
1166
|
Returns:
|
986
1167
|
Combined text from all pages
|
987
1168
|
"""
|
988
1169
|
texts = []
|
989
1170
|
for page in self.pages:
|
990
1171
|
text = page.extract_text(
|
991
|
-
keep_blank_chars=keep_blank_chars,
|
992
|
-
apply_exclusions=apply_exclusions,
|
993
|
-
**kwargs
|
1172
|
+
keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
|
994
1173
|
)
|
995
1174
|
texts.append(text)
|
996
|
-
|
1175
|
+
|
997
1176
|
return "\n".join(texts)
|
998
1177
|
|
-    # --- NEW METHOD ---
     def apply_ocr(
         self,
         engine: Optional[str] = None,
@@ -1004,13 +1182,11 @@ class PageCollection(Generic[P]):
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
-
-    ) -> 'PageCollection[P]':
+    ) -> "PageCollection[P]":
         """
         Applies OCR to all pages within this collection using batch processing.

-        This delegates the work to the parent PDF object's `…`
-        method for efficiency. The OCR results (TextElements) are added directly
+        This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
         to the respective Page objects within this collection.

         Args:
@@ -1028,8 +1204,8 @@ class PageCollection(Generic[P]):
         Raises:
             RuntimeError: If pages in the collection lack a parent PDF object
                           or if the parent PDF object lacks the required
-                          `…`
-            (Propagates exceptions from PDF.…)
+                          `apply_ocr` method.
+            (Propagates exceptions from PDF.apply_ocr)
         """
         if not self.pages:
             logger.warning("Cannot apply OCR to an empty PageCollection.")
@@ -1037,42 +1213,43 @@ class PageCollection(Generic[P]):

         # Assume all pages share the same parent PDF object
         first_page = self.pages[0]
-        if not hasattr(first_page, '_parent') or not first_page._parent:
+        if not hasattr(first_page, "_parent") or not first_page._parent:
             raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

         parent_pdf = first_page._parent

-        …
-        …
+        # Updated check for renamed method
+        if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
+            raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")

         # Get the 0-based indices of the pages in this collection
         page_indices = [p.index for p in self.pages]

         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

-        # Delegate the batch call to the parent PDF object
-        parent_pdf.…(
+        # Delegate the batch call to the parent PDF object (using renamed method)
+        parent_pdf.apply_ocr(
             pages=page_indices,
             engine=engine,
             options=options,
             languages=languages,
             min_confidence=min_confidence,
-            device=device
+            device=device,
             # Pass any other relevant simple_kwargs here if added
         )
         # The PDF method modifies the Page objects directly by adding elements.

-        return self
+        return self  # Return self for chaining

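For reviewers tracking the rename, a minimal sketch of the new call path. The engine name is illustrative (engine support is configured in the analyzers/OCR modules elsewhere in this diff), and `return self` makes the call chainable:

```python
from natural_pdf import PDF

pdf = PDF("scanned.pdf")  # placeholder path

# OCR only the first two pages; the collection computes page indices and
# delegates to pdf.apply_ocr, which attaches TextElements to each Page
ocr_pages = pdf.pages[0:2].apply_ocr(engine="easyocr", languages=["en"], min_confidence=0.5)

print(ocr_pages.extract_text()[:200])  # chainable thanks to `return self`
```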
     def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
         """
         Find the first element matching the selector across all pages.
-
+
         Args:
             selector: CSS-like selector string
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
             **kwargs: Additional filter parameters
-
+
         Returns:
             First matching element or None
         """
@@ -1081,16 +1258,16 @@ class PageCollection(Generic[P]):
             if element:
                 return element
         return None
-
+
     def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
         """
         Find all elements matching the selector across all pages.
-
+
         Args:
             selector: CSS-like selector string
             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
             **kwargs: Additional filter parameters
-
+
         Returns:
             ElementCollection with matching elements from all pages
         """
@@ -1099,57 +1276,59 @@ class PageCollection(Generic[P]):
             elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
             if elements:
                 all_elements.extend(elements.elements)
-
+
         return ElementCollection(all_elements)
-
+
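A sketch of cross-page search with the two methods above; selector strings are placeholders in the package's CSS-like syntax:

```python
# First match anywhere in the document, or None
heading = pdf.pages.find("text[size>=14]:bold")

# All matches across every page, flattened into one ElementCollection
totals = pdf.pages.find_all('text:contains("Total")')
print(len(totals), "matches on", len({el.page.index for el in totals}), "pages")
```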
     # def debug_ocr(self, output_path):
     #     """
     #     Generate an interactive HTML debug report for OCR results.
-
+
     #     This creates a single-file HTML report with:
     #     - Side-by-side view of image regions and OCR text
     #     - Confidence scores with color coding
     #     - Editable correction fields
     #     - Filtering and sorting options
     #     - Export functionality for corrected text
-
+
     #     Args:
     #         output_path: Path to save the HTML report
-
+
     #     Returns:
     #         Path to the generated HTML file
     #     """
     #     from natural_pdf.utils.ocr import debug_ocr_to_html
     #     return debug_ocr_to_html(self.pages, output_path)
-
-    def get_sections(
-        self, start_elements=None, end_elements=None,
-        new_section_on_page_break=False,
-        boundary_inclusion='both'
-    ) -> List['Region']:
+
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        new_section_on_page_break=False,
+        boundary_inclusion="both",
+    ) -> List["Region"]:
         """
         Extract sections from a page collection based on start/end elements.
-
+
         Args:
             start_elements: Elements or selector string that mark the start of sections
             end_elements: Elements or selector string that mark the end of sections
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
             boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
-
+
         Returns:
             List of Region objects representing the extracted sections
         """
         # Find start and end elements across all pages
         if isinstance(start_elements, str):
             start_elements = self.find_all(start_elements).elements
-
+
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
-
+
         # If no start elements, return empty list
         if not start_elements:
             return []
-
+
         # If there are page break boundaries, we'll need to add them
         if new_section_on_page_break:
             # For each page boundary, create virtual "end" and "start" elements
@@ -1159,183 +1338,200 @@ class PageCollection(Generic[P]):
                 # If end_elements is None, initialize it as an empty list
                 if end_elements is None:
                     end_elements = []
-
+
                 # Create a region at the bottom of the page as an artificial end marker
                 from natural_pdf.elements.region import Region
+
                 bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
                 bottom_region.is_page_boundary = True  # Mark it as a special boundary
                 end_elements.append(bottom_region)
-
+
                 # Add a virtual "start" element at the top of the next page
                 next_page = self.pages[i + 1]
                 top_region = Region(next_page, (0, 0, next_page.width, 1))
                 top_region.is_page_boundary = True  # Mark it as a special boundary
                 start_elements.append(top_region)
-
+
         # Get all elements from all pages and sort them in document order
         all_elements = []
         for page in self.pages:
             elements = page.get_elements()
             all_elements.extend(elements)
-
+
         # Sort by page index, then vertical position, then horizontal position
         all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
-
+
         # Mark section boundaries
         section_boundaries = []
-
+
         # Add start element boundaries
         for element in start_elements:
             if element in all_elements:
                 idx = all_elements.index(element)
-                section_boundaries.append({
-                    'index': idx,
-                    'element': element,
-                    'type': 'start',
-                    'page_idx': element.page.index
-                })
-            elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+                section_boundaries.append(
+                    {
+                        "index": idx,
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
+            elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                 # This is a virtual page boundary element
-                section_boundaries.append({
-                    'index': -1,  # Special index for page boundaries
-                    'element': element,
-                    'type': 'start',
-                    'page_idx': element.page.index
-                })
-
+                section_boundaries.append(
+                    {
+                        "index": -1,  # Special index for page boundaries
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
+
         # Add end element boundaries if provided
         if end_elements:
             for element in end_elements:
                 if element in all_elements:
                     idx = all_elements.index(element)
-                    section_boundaries.append({
-                        'index': idx,
-                        'element': element,
-                        'type': 'end',
-                        'page_idx': element.page.index
-                    })
-                elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+                    section_boundaries.append(
+                        {
+                            "index": idx,
+                            "element": element,
+                            "type": "end",
+                            "page_idx": element.page.index,
+                        }
+                    )
+                elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                     # This is a virtual page boundary element
-                    section_boundaries.append({
-                        'index': -1,  # Special index for page boundaries
-                        'element': element,
-                        'type': 'end',
-                        'page_idx': element.page.index
-                    })
-
+                    section_boundaries.append(
+                        {
+                            "index": -1,  # Special index for page boundaries
+                            "element": element,
+                            "type": "end",
+                            "page_idx": element.page.index,
+                        }
+                    )
+
         # Sort boundaries by page index, then by actual document position
-        section_boundaries.sort(
-            key=lambda x: (x['page_idx'],
-                x['index'] if x['index'] != -1 else (0 if x['type'] == 'start' else float('inf')))
-        )
+        section_boundaries.sort(
+            key=lambda x: (
+                x["page_idx"],
+                x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
+            )
+        )
+
         # Generate sections
         sections = []
         current_start = None
-
+
         for i, boundary in enumerate(section_boundaries):
             # If it's a start boundary and we don't have a current start
-            if boundary['type'] == 'start' and current_start is None:
+            if boundary["type"] == "start" and current_start is None:
                 current_start = boundary
-
+
             # If it's an end boundary and we have a current start
-            elif boundary['type'] == 'end' and current_start is not None:
+            elif boundary["type"] == "end" and current_start is not None:
                 # Create a section from current_start to this boundary
-                start_element = current_start['element']
-                end_element = boundary['element']
-
+                start_element = current_start["element"]
+                end_element = boundary["element"]
+
                 # If both elements are on the same page, use the page's get_section_between
                 if start_element.page == end_element.page:
                     section = start_element.page.get_section_between(
-                        start_element,
-                        end_element,
-                        boundary_inclusion
+                        start_element, end_element, boundary_inclusion
                     )
                     sections.append(section)
                 else:
                     # Create a multi-page section
                     from natural_pdf.elements.region import Region
-
+
                     # Get the start and end pages
                     start_page = start_element.page
                     end_page = end_element.page
-
+
                     # Create a combined region
                     combined_region = Region(
-                        start_page,
-                        (0, start_element.top, start_page.width, start_page.height)
+                        start_page, (0, start_element.top, start_page.width, start_page.height)
                     )
                     combined_region._spans_pages = True
                     combined_region._page_range = (start_page.index, end_page.index)
                     combined_region.start_element = start_element
                     combined_region.end_element = end_element
-
+
                     # Get all elements that fall within this multi-page region
                     combined_elements = []
-
+
                     # Get elements from the first page
-                    first_page_elements = [
-                        e for e in all_elements if e.page == start_page and e.top >= start_element.top]
+                    first_page_elements = [
+                        e
+                        for e in all_elements
+                        if e.page == start_page and e.top >= start_element.top
+                    ]
                     combined_elements.extend(first_page_elements)
-
+
                     # Get elements from middle pages (if any)
                     for page_idx in range(start_page.index + 1, end_page.index):
                         middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
                         combined_elements.extend(middle_page_elements)
-
+
                     # Get elements from the last page
-                    last_page_elements = [
-                        e for e in all_elements if e.page == end_page and e.bottom <= end_element.bottom]
+                    last_page_elements = [
+                        e
+                        for e in all_elements
+                        if e.page == end_page and e.bottom <= end_element.bottom
+                    ]
                     combined_elements.extend(last_page_elements)
-
+
                     # Store the elements in the combined region
                     combined_region._multi_page_elements = combined_elements
-
+
                     sections.append(combined_region)
-
+
                 current_start = None
-
+
             # If it's another start boundary and we have a current start (for splitting by starts only)
-            elif boundary['type'] == 'start' and current_start is not None and not end_elements:
+            elif boundary["type"] == "start" and current_start is not None and not end_elements:
                 # Create a section from current_start to just before this boundary
-                start_element = current_start['element']
-
+                start_element = current_start["element"]
+
                 # Find the last element before this boundary on the same page
-                if start_element.page == boundary['element'].page:
+                if start_element.page == boundary["element"].page:
                     # Find elements on this page
                     page_elements = [e for e in all_elements if e.page == start_element.page]
                     # Sort by position
                     page_elements.sort(key=lambda e: (e.top, e.x0))
-
+
                     # Find the last element before the boundary
-                    end_idx = page_elements.index(boundary['element']) - 1 if boundary['element'] in page_elements else -1
+                    end_idx = (
+                        page_elements.index(boundary["element"]) - 1
+                        if boundary["element"] in page_elements
+                        else -1
+                    )
                     end_element = page_elements[end_idx] if end_idx >= 0 else None
-
+
                     # Create the section
                     section = start_element.page.get_section_between(
-                        start_element,
-                        end_element,
-                        boundary_inclusion
+                        start_element, end_element, boundary_inclusion
                     )
                     sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
                     from natural_pdf.elements.region import Region
+
                     start_page = start_element.page
-
+
                     region = Region(
-                        start_page,
-                        (0, start_element.top, start_page.width, start_page.height)
+                        start_page, (0, start_element.top, start_page.width, start_page.height)
                     )
                     region.start_element = start_element
                     sections.append(region)
-
+
                 current_start = boundary
-
+
         # Handle the last section if we have a current start
         if current_start is not None:
-            start_element = current_start['element']
+            start_element = current_start["element"]
             start_page = start_element.page
-
+
             if end_elements:
                 # With end_elements, we need an explicit end - use the last element
                 # on the last page of the collection
@@ -1343,59 +1539,63 @@ class PageCollection(Generic[P]):
                 last_page_elements = [e for e in all_elements if e.page == last_page]
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
-
+
                 # Create a multi-page section
                 from natural_pdf.elements.region import Region
-
+
                 if start_page == last_page:
                     # Simple case - both on same page
                     section = start_page.get_section_between(
-                        start_element,
-                        end_element,
-                        boundary_inclusion
+                        start_element, end_element, boundary_inclusion
                     )
                     sections.append(section)
                 else:
                     # Create a multi-page section
                     combined_region = Region(
-                        start_page,
-                        (0, start_element.top, start_page.width, start_page.height)
+                        start_page, (0, start_element.top, start_page.width, start_page.height)
                     )
                     combined_region._spans_pages = True
                     combined_region._page_range = (start_page.index, last_page.index)
                     combined_region.start_element = start_element
                     combined_region.end_element = end_element
-
+
                     # Get all elements that fall within this multi-page region
                     combined_elements = []
-
+
                     # Get elements from the first page
-                    first_page_elements = [
-                        e for e in all_elements if e.page == start_page and e.top >= start_element.top]
+                    first_page_elements = [
+                        e
+                        for e in all_elements
+                        if e.page == start_page and e.top >= start_element.top
+                    ]
                     combined_elements.extend(first_page_elements)
-
+
                     # Get elements from middle pages (if any)
                     for page_idx in range(start_page.index + 1, last_page.index):
                         middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
                         combined_elements.extend(middle_page_elements)
-
+
                     # Get elements from the last page
-                    last_page_elements = [
-                        e for e in all_elements if e.page == last_page and (end_element is None or e.bottom <= end_element.bottom)]
+                    last_page_elements = [
+                        e
+                        for e in all_elements
+                        if e.page == last_page
+                        and (end_element is None or e.bottom <= end_element.bottom)
+                    ]
                     combined_elements.extend(last_page_elements)
-
+
                     # Store the elements in the combined region
                     combined_region._multi_page_elements = combined_elements
-
+
                     sections.append(combined_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
+
                 region = Region(
-                    start_page,
-                    (0, start_element.top, start_page.width, start_page.height)
+                    start_page, (0, start_element.top, start_page.width, start_page.height)
                 )
                 region.start_element = start_element
                 sections.append(region)
-
-        return sections
+
+        return sections