natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +283 -186
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +34 -0
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +353 -12
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,9 @@ import os
|
|
3
3
|
import random
|
4
4
|
import shutil
|
5
5
|
from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
|
6
|
+
import collections
|
6
7
|
|
7
|
-
from tqdm import tqdm
|
8
|
+
from tqdm.auto import tqdm
|
8
9
|
|
9
10
|
from natural_pdf.exporters.base import FinetuneExporter
|
10
11
|
|
@@ -33,19 +34,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
33
34
|
def __init__(
|
34
35
|
self,
|
35
36
|
resolution: int = 150,
|
36
|
-
padding: int =
|
37
|
+
padding: int = 0,
|
37
38
|
selector: Optional[str] = None,
|
38
39
|
corrected_only: bool = False,
|
39
40
|
split_ratio: Optional[float] = 0.9,
|
40
41
|
include_guide: bool = True,
|
41
42
|
random_seed: Optional[int] = 42,
|
43
|
+
min_char_freq: int = 3,
|
42
44
|
):
|
43
45
|
"""
|
44
46
|
Initialize the PaddleOCR Recognition Exporter.
|
45
47
|
|
46
48
|
Args:
|
47
49
|
resolution: DPI resolution for rendering text region images (default: 150).
|
48
|
-
padding: Padding (in points) to add around text element bbox before cropping (default:
|
50
|
+
padding: Padding (in points) to add around text element bbox before cropping (default: 0).
|
49
51
|
selector: CSS-like selector to filter which TextElements to export.
|
50
52
|
If None and corrected_only is False, all 'text' elements are considered.
|
51
53
|
corrected_only: If True, overrides selector and exports only elements likely
|
@@ -57,6 +59,9 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
57
59
|
in the output directory (default: True).
|
58
60
|
random_seed: Seed for the random number generator used for train/val split shuffling,
|
59
61
|
ensuring reproducibility (default: 42).
|
62
|
+
min_char_freq: Minimum frequency for a character to be included in the dictionary.
|
63
|
+
Text elements containing characters below this frequency will be removed.
|
64
|
+
(default: 3; a value of 1 disables frequency-based filtering).
|
60
65
|
"""
|
61
66
|
if corrected_only and selector:
|
62
67
|
logger.warning(
|
@@ -76,10 +81,12 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
76
81
|
self.split_ratio = split_ratio
|
77
82
|
self.include_guide = include_guide
|
78
83
|
self.random_seed = random_seed
|
84
|
+
self.min_char_freq = min_char_freq
|
79
85
|
|
80
86
|
logger.info(
|
81
87
|
f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
|
82
|
-
f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
|
88
|
+
f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}, "
|
89
|
+
f"min_char_freq={min_char_freq}"
|
83
90
|
)
|
84
91
|
|
85
92
|
def export(
|
@@ -114,7 +121,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
114
121
|
|
115
122
|
# --- 2. Collect Elements and Render Images ---
|
116
123
|
labels: List[Tuple[str, str]] = [] # List of (relative_image_path, text_label)
|
117
|
-
|
124
|
+
char_counts: collections.Counter = collections.Counter()
|
118
125
|
elements_processed = 0
|
119
126
|
elements_skipped = 0
|
120
127
|
|
@@ -200,7 +207,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
200
207
|
labels.append(
|
201
208
|
(relative_image_path.replace(os.path.sep, "/"), element_text)
|
202
209
|
) # Use forward slashes for labels
|
203
|
-
|
210
|
+
char_counts.update(element_text)
|
204
211
|
elements_processed += 1
|
205
212
|
|
206
213
|
except Exception as e:
|
@@ -226,15 +233,48 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
226
233
|
|
227
234
|
logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
|
228
235
|
|
236
|
+
# --- 2.5 Filter based on character frequency ---
|
237
|
+
if self.min_char_freq > 1:
|
238
|
+
logger.info(f"Filtering elements based on min_char_freq: {self.min_char_freq}")
|
239
|
+
original_label_count = len(labels)
|
240
|
+
rare_chars = {char for char, count in char_counts.items() if count < self.min_char_freq}
|
241
|
+
if rare_chars:
|
242
|
+
logger.info(f"Identified {len(rare_chars)} rare characters: {rare_chars}")
|
243
|
+
filtered_labels = []
|
244
|
+
for img_path, text in labels:
|
245
|
+
if any(char in rare_chars for char in text):
|
246
|
+
elements_skipped += 1 # Count these as skipped due to rare chars
|
247
|
+
elements_processed -=1 # Decrement from processed as it's now being skipped
|
248
|
+
else:
|
249
|
+
filtered_labels.append((img_path, text))
|
250
|
+
|
251
|
+
labels_removed_count = original_label_count - len(filtered_labels)
|
252
|
+
if labels_removed_count > 0:
|
253
|
+
logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
|
254
|
+
labels = filtered_labels
|
255
|
+
|
256
|
+
# Recalculate char_counts based on filtered_labels to update the dictionary
|
257
|
+
char_counts.clear()
|
258
|
+
for _, text in labels:
|
259
|
+
char_counts.update(text)
|
260
|
+
|
261
|
+
if not labels:
|
262
|
+
logger.error(
|
263
|
+
"All elements were removed after character frequency filtering. Aborting."
|
264
|
+
)
|
265
|
+
return
|
266
|
+
else:
|
267
|
+
logger.info("No rare characters found below the frequency threshold.")
|
268
|
+
|
269
|
+
|
229
270
|
# --- 3. Generate Dictionary File (`dict.txt`) ---
|
230
271
|
dict_path = os.path.join(output_dir, "dict.txt")
|
231
272
|
try:
|
232
273
|
# Log the character set before sorting/writing
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
#
|
237
|
-
sorted_chars = sorted(list(char_set), reverse=True)
|
274
|
+
final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
|
275
|
+
logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
|
276
|
+
|
277
|
+
sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
|
238
278
|
with open(dict_path, "w", encoding="utf-8") as f_dict:
|
239
279
|
for char in sorted_chars:
|
240
280
|
# Ensure we don't write empty strings or just newlines as dictionary entries
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from .flow import Flow
|
2
|
+
from .element import FlowElement
|
3
|
+
from .region import FlowRegion
|
4
|
+
from .collections import FlowElementCollection, FlowRegionCollection
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"Flow",
|
8
|
+
"FlowElement",
|
9
|
+
"FlowRegion",
|
10
|
+
"FlowElementCollection",
|
11
|
+
"FlowRegionCollection",
|
12
|
+
]
|
@@ -0,0 +1,533 @@
|
|
1
|
+
import logging
|
2
|
+
from collections.abc import MutableSequence
|
3
|
+
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union
|
4
|
+
|
5
|
+
from PIL import Image # Single import for PIL.Image module
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
# from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
|
9
|
+
from natural_pdf.elements.base import Element as PhysicalElement
|
10
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
from natural_pdf.core.page import Page as PhysicalPage
|
12
|
+
from .element import FlowElement
|
13
|
+
from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
|
14
|
+
from .region import FlowRegion
|
15
|
+
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
T_FEC = TypeVar("T_FEC", bound="FlowElement")
|
20
|
+
T_FRC = TypeVar("T_FRC", bound="FlowRegion")
|
21
|
+
|
22
|
+
|
23
|
+
class FlowElementCollection(MutableSequence[T_FEC]):
    """
    A collection of FlowElement objects, typically the result of Flow.find_all().

    Implements the MutableSequence protocol on top of an internal list and adds
    directional methods (above/below/left/right) that are applied to every
    contained FlowElement, returning the results as a FlowRegionCollection.
    """

    def __init__(self, flow_elements: List["FlowElement"]):
        """
        Args:
            flow_elements: The FlowElements to wrap. The list object is stored
                as-is (not copied); None is treated as an empty list.
        """
        self._flow_elements: List["FlowElement"] = flow_elements if flow_elements is not None else []

    # --- MutableSequence protocol (delegates to the internal list) ---

    def __getitem__(self, index: int) -> "FlowElement":
        """Return the item stored at `index` in the internal list."""
        return self._flow_elements[index]

    def __setitem__(self, index: int, value: "FlowElement") -> None:
        """Replace the item stored at `index` with `value`."""
        self._flow_elements[index] = value

    def __delitem__(self, index: int) -> None:
        """Remove the item stored at `index`."""
        del self._flow_elements[index]

    def __len__(self) -> int:
        """Return the number of FlowElements in the collection."""
        return len(self._flow_elements)

    def insert(self, index: int, value: "FlowElement") -> None:
        """Insert `value` before position `index`."""
        self._flow_elements.insert(index, value)

    @property
    def flow_elements(self) -> List["FlowElement"]:
        """The internal list of FlowElements (the live list, not a copy)."""
        return self._flow_elements

    @property
    def first(self) -> Optional["FlowElement"]:
        """The first FlowElement, or None when the collection is empty."""
        return self._flow_elements[0] if self._flow_elements else None

    @property
    def last(self) -> Optional["FlowElement"]:
        """The last FlowElement, or None when the collection is empty."""
        return self._flow_elements[-1] if self._flow_elements else None

    def __repr__(self) -> str:
        return f"<FlowElementCollection(count={len(self)})>"

    # --- Directional operations ---

    def _execute_directional_on_all(self, method_name: str, **kwargs) -> "FlowRegionCollection":
        """
        Call the method named `method_name` on every FlowElement with `kwargs`.

        Args:
            method_name: Name of the FlowElement method to look up and call.
            **kwargs: Keyword arguments forwarded unchanged to each call.

        Returns:
            A FlowRegionCollection holding one result per FlowElement, in
            collection order (empty when this collection is empty).
        """
        # Each call's return value is collected as-is; the callers in this class
        # only pass the names of FlowElement's directional methods.
        return FlowRegionCollection(
            [getattr(fe, method_name)(**kwargs) for fe in self._flow_elements]
        )

    def above(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
              width_absolute: Optional[float] = None, width_alignment: str = "center",
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
        """Call `above(...)` on every FlowElement and collect the results."""
        return self._execute_directional_on_all(
            "above", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
            width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
        )

    def below(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
              width_absolute: Optional[float] = None, width_alignment: str = "center",
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
        """Call `below(...)` on every FlowElement and collect the results."""
        return self._execute_directional_on_all(
            "below", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
            width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
        )

    def left(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
             height_absolute: Optional[float] = None, height_alignment: str = "center",
             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
        """Call `left(...)` on every FlowElement and collect the results."""
        return self._execute_directional_on_all(
            "left", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
            height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
        )

    def right(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
              height_absolute: Optional[float] = None, height_alignment: str = "center",
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
        """Call `right(...)` on every FlowElement and collect the results."""
        return self._execute_directional_on_all(
            "right", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
            height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
        )

    # --- Visualization ---

    @staticmethod
    def _page_index(page_obj):
        """Return `page_obj.index` when present, else `page_number - 1` (page_number defaults to 1)."""
        if hasattr(page_obj, "index"):
            return page_obj.index
        return getattr(page_obj, "page_number", 1) - 1

    @staticmethod
    def _stack_page_images(images: List["Image.Image"], stack_direction: str, stack_gap: int,
                           background_color: Tuple[int, int, int]) -> "Image.Image":
        """
        Paste `images` onto one new RGB canvas, centered on the cross axis.

        Raises:
            ValueError: If `stack_direction` is neither "vertical" nor
                "horizontal", or the resulting canvas would have zero width
                or height.
        """
        if stack_direction not in ("vertical", "horizontal"):
            raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowElementCollection.show(). Must be 'vertical' or 'horizontal'.")

        vertical = stack_direction == "vertical"
        total_gap = (len(images) - 1) * stack_gap
        if vertical:
            final_width = max(img.width for img in images)
            final_height = sum(img.height for img in images) + total_gap
        else:
            final_width = sum(img.width for img in images) + total_gap
            final_height = max(img.height for img in images)
        if final_width == 0 or final_height == 0:
            raise ValueError("Cannot create concatenated image with zero width or height.")

        canvas = Image.new("RGB", (final_width, final_height), background_color)
        offset = 0  # running position along the stacking axis
        for img in images:
            if vertical:
                canvas.paste(img, ((final_width - img.width) // 2, offset))
                offset += img.height + stack_gap
            else:
                canvas.paste(img, (offset, (final_height - img.height) // 2))
                offset += img.width + stack_gap
        return canvas

    def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
             default_color: Optional[Union[Tuple, str]] = "orange",  # A distinct color for FEC show
             label_prefix: Optional[str] = "FEC_Element", width: Optional[int] = None,
             stack_direction: str = "vertical",  # "vertical" or "horizontal"
             stack_gap: int = 5,  # Gap between stacked page images
             stack_background_color: Tuple[int, int, int] = (255, 255, 255),  # Background for stacking
             **kwargs) -> Optional[Image.Image]:
        """
        Shows all FlowElements in this collection by highlighting them on their respective pages.
        If multiple pages are involved, they are stacked into a single image.

        Args:
            scale: Forwarded to the highlighter's render_preview().
            labels: Forwarded to render_preview(); also, labels are only
                generated for the highlights when this is truthy.
            legend_position: Forwarded to render_preview().
            default_color: Color assigned to every highlight.
            label_prefix: Prefix of each highlight label. When the collection
                holds more than one element, "_<position>" (1-based position in
                the collection) is appended. No label when falsy.
            width: Forwarded to render_preview().
            stack_direction: "vertical" or "horizontal"; only consulted when
                more than one page image was rendered.
            stack_gap: Gap in pixels between stacked page images.
            stack_background_color: Fill color of the stacked canvas.
            **kwargs: Forwarded to render_preview().

        Returns:
            None for an empty collection or when no page image was rendered,
            the single page image when only one was rendered, otherwise the
            stacked image.

        Raises:
            ValueError: If a FlowElement has no page, if the first page has no
                usable `_highlighter`, if `stack_direction` is invalid, or if
                the stacked canvas would have zero width or height.
        """
        if not self._flow_elements:
            logger.info("FlowElementCollection.show() called on an empty collection.")
            return None

        # Group elements by page, remembering each element's position in the
        # collection so labels can be numbered without a list.index() per element.
        elements_by_page: dict["PhysicalPage", List[Tuple[int, "FlowElement"]]] = {}
        for global_idx, flow_element in enumerate(self._flow_elements):
            page_obj = flow_element.page
            if not page_obj:
                raise ValueError(f"FlowElement {flow_element} has no page.")
            elements_by_page.setdefault(page_obj, []).append((global_idx, flow_element))

        # The highlighter is taken from the first page encountered; the same
        # service then renders every page.
        first_page_with_elements = next(iter(elements_by_page))
        highlighter_service = getattr(first_page_with_elements, "_highlighter", None)
        if not highlighter_service:
            raise ValueError(
                "Cannot get highlighter service for FlowElementCollection.show(). "
                "Ensure flow elements' pages are initialized with a highlighter."
            )

        # Sort pages by index for consistent output order
        sorted_pages = sorted(
            elements_by_page.keys(),
            key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0),
        )

        number_labels = len(self._flow_elements) > 1
        output_page_images: List[Image.Image] = []

        # Render each page with its relevant flow elements highlighted
        for page_obj in sorted_pages:
            page_index = self._page_index(page_obj)

            temp_highlights_for_page = []
            for global_idx, flow_element in elements_by_page[page_obj]:
                element_label = None
                if labels and label_prefix:
                    count_indicator = f"_{global_idx + 1}" if number_labels else ""
                    element_label = f"{label_prefix}{count_indicator}"

                physical = flow_element.physical_object
                has_polygon = getattr(physical, "has_polygon", False)
                temp_highlights_for_page.append({
                    "page_index": page_index,
                    "bbox": flow_element.bbox,
                    "polygon": getattr(physical, "polygon", None) if has_polygon else None,
                    "color": default_color,
                    "label": element_label,
                    "use_color_cycling": False,
                })

            page_image = highlighter_service.render_preview(
                page_index=page_index,
                temporary_highlights=temp_highlights_for_page,
                scale=scale,
                width=width,
                labels=labels,
                legend_position=legend_position,
                **kwargs
            )
            if page_image:
                output_page_images.append(page_image)

        if not output_page_images:
            logger.info("FlowElementCollection.show() produced no page images to concatenate.")
            return None

        if len(output_page_images) == 1:
            return output_page_images[0]

        return self._stack_page_images(
            output_page_images, stack_direction, stack_gap, stack_background_color
        )
|
240
|
+
|
241
|
+
|
242
|
+
class FlowRegionCollection(MutableSequence[T_FRC]):
    """
    A collection of FlowRegion objects, typically the result of directional
    operations on a FlowElementCollection.

    Implements the MutableSequence protocol on top of an internal list and
    provides methods for querying and visualizing the aggregated content.
    """

    def __init__(self, flow_regions: List["FlowRegion"]):
        """
        Args:
            flow_regions: The FlowRegions to wrap. The list object is stored
                as-is (not copied); None is treated as an empty list.
        """
        self._flow_regions: List["FlowRegion"] = flow_regions if flow_regions is not None else []

    # --- MutableSequence protocol (delegates to the internal list) ---

    def __getitem__(self, index: int) -> "FlowRegion":
        """Return the item stored at `index` in the internal list."""
        return self._flow_regions[index]

    def __setitem__(self, index: int, value: "FlowRegion") -> None:
        """Replace the item stored at `index` with `value`."""
        self._flow_regions[index] = value

    def __delitem__(self, index: int) -> None:
        """Remove the item stored at `index`."""
        del self._flow_regions[index]

    def __len__(self) -> int:
        """Return the number of FlowRegions in the collection."""
        return len(self._flow_regions)

    def insert(self, index: int, value: "FlowRegion") -> None:
        """Insert `value` before position `index`."""
        self._flow_regions.insert(index, value)

    def __repr__(self) -> str:
        return f"<FlowRegionCollection(count={len(self)})>"

    def __add__(self, other: "FlowRegionCollection") -> "FlowRegionCollection":
        """Return a new collection holding this collection's regions followed by `other`'s."""
        if not isinstance(other, FlowRegionCollection):
            return NotImplemented
        return FlowRegionCollection(self._flow_regions + other._flow_regions)

    @property
    def flow_regions(self) -> List["FlowRegion"]:
        """The internal list of FlowRegions (the live list, not a copy)."""
        return self._flow_regions

    @property
    def first(self) -> Optional["FlowRegion"]:
        """The first FlowRegion, or None when the collection is empty."""
        return self._flow_regions[0] if self._flow_regions else None

    @property
    def last(self) -> Optional["FlowRegion"]:
        """The last FlowRegion, or None when the collection is empty."""
        return self._flow_regions[-1] if self._flow_regions else None

    @property
    def is_empty(self) -> bool:
        """True when there are no FlowRegions or every FlowRegion reports `is_empty`."""
        # all() over an empty list is True, which covers the "no regions" case.
        return all(fr.is_empty for fr in self._flow_regions)

    # --- Functional helpers ---

    def filter(self, func: Callable[["FlowRegion"], bool]) -> "FlowRegionCollection":
        """Return a new collection with the FlowRegions for which `func` is truthy."""
        return FlowRegionCollection([fr for fr in self._flow_regions if func(fr)])

    def sort(self, key: Optional[Callable[["FlowRegion"], Any]] = None, reverse: bool = False) -> "FlowRegionCollection":
        """
        Sorts the collection in-place and returns it.

        Args:
            key: Sort key. When None, regions are ordered by the page index,
                `top` and `x0` of their first constituent region; regions
                without constituent regions sort last (before `reverse` is
                applied).
            reverse: Passed to list.sort().

        Returns:
            This collection (to allow chaining).
        """
        if key is None:
            def key(fr: "FlowRegion"):
                if fr.constituent_regions:
                    first_constituent = fr.constituent_regions[0]
                    # A constituent without a page is given page index -1.
                    page_idx = first_constituent.page.index if first_constituent.page else -1
                    return (page_idx, first_constituent.top, first_constituent.x0)
                return (float('inf'), float('inf'), float('inf'))  # Push empty ones to the end

        self._flow_regions.sort(key=key, reverse=reverse)
        return self

    def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
        """Return `[func(fr) for each FlowRegion]`, in collection order."""
        return [func(fr) for fr in self._flow_regions]

    # --- Content queries ---

    def extract_text(self, separator: str = "\n", apply_exclusions: bool = True, **kwargs) -> str:
        """
        Join the text extracted from every FlowRegion with `separator`.

        Falsy results (e.g. empty strings) are left out of the join.
        `apply_exclusions` and `**kwargs` are forwarded to each
        FlowRegion.extract_text().
        """
        texts = [
            fr.extract_text(apply_exclusions=apply_exclusions, **kwargs)
            for fr in self._flow_regions
        ]
        return separator.join(t for t in texts if t)

    def extract_each_text(self, apply_exclusions: bool = True, **kwargs) -> List[str]:
        """Return the result of extract_text() for each FlowRegion, one entry per region."""
        return [
            fr.extract_text(apply_exclusions=apply_exclusions, **kwargs)
            for fr in self._flow_regions
        ]

    def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:
        """
        Return the first match found by calling find() on each FlowRegion in order.

        Returns:
            The first non-None result, or None when no FlowRegion yields one.
        """
        for fr in self._flow_regions:
            found = fr.find(selector=selector, text=text, **kwargs)
            # Compare against None so that the result's own truthiness cannot
            # cause a real match to be skipped.
            if found is not None:
                return found
        return None

    def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":
        """
        Collect the matches of find_all() from every FlowRegion into one ElementCollection.

        Elements returned by more than one FlowRegion are kept once, at the
        position of their first occurrence (elements must be hashable).
        """
        from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection  # Runtime import

        # dict keys preserve insertion order, giving an ordered de-duplication.
        unique_elements: dict = {}
        for fr in self._flow_regions:
            elements_in_fr = fr.find_all(selector=selector, text=text, **kwargs)
            if elements_in_fr:
                unique_elements.update(dict.fromkeys(elements_in_fr.elements))
        return RuntimeElementCollection(list(unique_elements))

    # --- Visualization ---

    def highlight(self, label_prefix: Optional[str] = "FRC", color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegionCollection":
        """
        Call highlight() on every FlowRegion and return this collection.

        Args:
            label_prefix: Label for the highlights. With more than one
                FlowRegion, "_<n>" (1-based) is appended; a falsy prefix
                results in `label=None`.
            color: Forwarded to each FlowRegion.highlight().
            **kwargs: Forwarded to each FlowRegion.highlight().
        """
        number_labels = len(self._flow_regions) > 1
        for position, fr in enumerate(self._flow_regions, start=1):
            current_label = None
            if label_prefix:
                current_label = f"{label_prefix}_{position}" if number_labels else label_prefix
            fr.highlight(label=current_label, color=color, **kwargs)
        return self

    @staticmethod
    def _page_index(page_obj):
        """Return `page_obj.index` when present, else `page_number - 1` (page_number defaults to 1)."""
        if hasattr(page_obj, "index"):
            return page_obj.index
        return getattr(page_obj, "page_number", 1) - 1

    @staticmethod
    def _stack_images(images: List["Image.Image"], vertical: bool, gap: int,
                      background_color, center: bool) -> Optional["Image.Image"]:
        """
        Paste `images` one after another onto a new RGB canvas.

        Args:
            images: Images to stack, in order.
            vertical: Stack top-to-bottom when True, left-to-right otherwise.
            gap: Pixels left between consecutive images.
            background_color: Fill color of the canvas.
            center: Center each image on the cross axis when True; otherwise
                align it to the left/top edge.

        Returns:
            The stacked image, or None when the canvas would have zero width
            or height.
        """
        total_gap = (len(images) - 1) * gap
        if vertical:
            final_width = max(img.width for img in images)
            final_height = sum(img.height for img in images) + total_gap
        else:
            final_width = sum(img.width for img in images) + total_gap
            final_height = max(img.height for img in images)
        if final_width == 0 or final_height == 0:
            return None

        canvas = Image.new("RGB", (final_width, final_height), background_color)
        offset = 0  # running position along the stacking axis
        for img in images:
            if vertical:
                cross = (final_width - img.width) // 2 if center else 0
                canvas.paste(img, (cross, offset))
                offset += img.height + gap
            else:
                cross = (final_height - img.height) // 2 if center else 0
                canvas.paste(img, (offset, cross))
                offset += img.width + gap
        return canvas

    def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
             default_color: Optional[Union[Tuple, str]] = "darkviolet",  # A distinct color for FRC show
             label_prefix: Optional[str] = "FRC_Part", width: Optional[int] = None,
             stack_direction: str = "vertical",  # "vertical" or "horizontal"
             stack_gap: int = 5,  # Gap between stacked page images
             stack_background_color: Tuple[int, int, int] = (255, 255, 255),  # Background for stacking
             **kwargs) -> Optional[Image.Image]:
        """
        Render the constituent regions of every FlowRegion as highlights on their pages.

        One preview image is rendered per page; several page images are stacked
        into a single image, centered on the cross axis.

        Args:
            scale: Forwarded to the highlighter's render_preview().
            labels: Forwarded to render_preview().
            legend_position: Forwarded to render_preview().
            default_color: Color assigned to every highlight.
            label_prefix: When truthy, each highlight is labelled
                "<label_prefix>_<n>", where n counts the highlighted
                constituent regions starting at 0.
            width: Forwarded to render_preview().
            stack_direction: "vertical" or "horizontal"; only consulted when
                more than one page image was rendered.
            stack_gap: Gap in pixels between stacked page images.
            stack_background_color: Fill color of the stacked canvas.
            **kwargs: Forwarded to render_preview().

        Returns:
            The single page image, the stacked image, or None. None is returned
            (with a log message) for an empty collection, when no highlighter
            can be obtained, when no page image was rendered, for an invalid
            `stack_direction`, or when the stacked canvas would have zero size.
        """
        if not self._flow_regions:
            logger.info("FlowRegionCollection.show() called on an empty collection.")
            return None

        # The highlighter is taken from the page of the first segment of the
        # first FlowRegion's flow.
        first_flow_region = self._flow_regions[0]
        highlighter_service = None
        if first_flow_region and first_flow_region.flow and first_flow_region.flow.segments:
            first_segment_page = first_flow_region.flow.segments[0].page
            if first_segment_page and hasattr(first_segment_page, '_highlighter'):
                highlighter_service = first_segment_page._highlighter

        if not highlighter_service:
            logger.error("Cannot get highlighter service for FlowRegionCollection.show().")
            return None

        # Build the highlight descriptions, grouped by page.
        regions_by_page: dict["PhysicalPage", List[dict[str, Any]]] = {}
        constituent_idx = 0  # only advances for regions that are actually highlighted
        for fr in self._flow_regions:
            for constituent_region in fr.constituent_regions:
                page_obj = constituent_region.page
                if not page_obj:
                    logger.warning(f"Constituent region {constituent_region.bbox} has no page. Skipping in show().")
                    continue

                part_label = f"{label_prefix}_{constituent_idx}" if label_prefix else None
                regions_by_page.setdefault(page_obj, []).append({
                    "page_index": self._page_index(page_obj),
                    "bbox": constituent_region.bbox,
                    "polygon": constituent_region.polygon if constituent_region.has_polygon else None,
                    "color": default_color,
                    "label": part_label,
                    "use_color_cycling": False,
                })
                constituent_idx += 1

        sorted_pages = sorted(
            regions_by_page.keys(),
            key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0),
        )

        output_page_images: List[Image.Image] = []
        for page_obj in sorted_pages:
            page_image = highlighter_service.render_preview(
                page_index=self._page_index(page_obj),
                temporary_highlights=regions_by_page[page_obj],
                scale=scale,
                width=width,
                labels=labels,
                legend_position=legend_position,
                **kwargs
            )
            if page_image:
                output_page_images.append(page_image)

        if not output_page_images:
            logger.info("FlowRegionCollection.show() produced no page images to concatenate.")
            return None

        if len(output_page_images) == 1:
            return output_page_images[0]

        if stack_direction not in ("vertical", "horizontal"):
            logger.error(f"Invalid stack_direction '{stack_direction}' for FlowRegionCollection.show(). Must be 'vertical' or 'horizontal'.")
            return None

        concatenated_image = self._stack_images(
            output_page_images,
            vertical=stack_direction == "vertical",
            gap=stack_gap,
            background_color=stack_background_color,
            center=True,
        )
        if concatenated_image is None:
            logger.warning("Cannot create concatenated image with zero width or height.")
        return concatenated_image

    def to_images(self, resolution: float = 150, **kwargs) -> List[Image.Image]:
        """Returns a flat list of the images produced by each FlowRegion's to_images()."""
        all_cropped_images: List[Image.Image] = []
        for fr in self._flow_regions:
            all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
        return all_cropped_images

    def to_image(self, stack_direction: str = "vertical", background_color=(255,255,255), gap: int = 5, **kwargs_for_constituent_to_image) -> Optional[Image.Image]:
        """
        Creates a single composite image by stacking the composite images of each FlowRegion.
        Each FlowRegion's composite is generated by its own .to_image() method.
        These are then stacked, aligned to the left (vertical) or top (horizontal).

        Args:
            stack_direction: "vertical" or "horizontal".
            background_color: Background for the final composite; also passed
                to each FlowRegion.to_image().
            gap: Gap in pixels between stacked FlowRegion images.
            **kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().

        Returns:
            The only composite when exactly one FlowRegion produced an image,
            the stacked image for several, or None when there is nothing to
            stack, `stack_direction` is invalid, or the canvas would have zero
            width or height.
        """
        region_composites: List[Image.Image] = []
        for fr in self._flow_regions:
            img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
            if img:
                region_composites.append(img)

        if not region_composites:
            return None
        if len(region_composites) == 1:
            return region_composites[0]

        if stack_direction not in ("vertical", "horizontal"):
            logger.warning(f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'.")
            return None

        return self._stack_images(
            region_composites,
            vertical=stack_direction == "vertical",
            gap=gap,
            background_color=background_color,
            center=False,
        )
|