natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,8 +3,9 @@ import os
3
3
  import random
4
4
  import shutil
5
5
  from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
6
+ import collections
6
7
 
7
- from tqdm import tqdm
8
+ from tqdm.auto import tqdm
8
9
 
9
10
  from natural_pdf.exporters.base import FinetuneExporter
10
11
 
@@ -33,19 +34,20 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
33
34
  def __init__(
34
35
  self,
35
36
  resolution: int = 150,
36
- padding: int = 2,
37
+ padding: int = 0,
37
38
  selector: Optional[str] = None,
38
39
  corrected_only: bool = False,
39
40
  split_ratio: Optional[float] = 0.9,
40
41
  include_guide: bool = True,
41
42
  random_seed: Optional[int] = 42,
43
+ min_char_freq: int = 3,
42
44
  ):
43
45
  """
44
46
  Initialize the PaddleOCR Recognition Exporter.
45
47
 
46
48
  Args:
47
49
  resolution: DPI resolution for rendering text region images (default: 150).
48
- padding: Padding (in points) to add around text element bbox before cropping (default: 2).
50
+ padding: Padding (in points) to add around text element bbox before cropping (default: 0).
49
51
  selector: CSS-like selector to filter which TextElements to export.
50
52
  If None and corrected_only is False, all 'text' elements are considered.
51
53
  corrected_only: If True, overrides selector and exports only elements likely
@@ -57,6 +59,9 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
57
59
  in the output directory (default: True).
58
60
  random_seed: Seed for the random number generator used for train/val split shuffling,
59
61
  ensuring reproducibility (default: 42).
62
+ min_char_freq: Minimum frequency for a character to be included in the dictionary.
63
+ Text elements containing characters below this frequency will be removed.
64
+ (default: 3; a value of 1 or less disables frequency filtering).
60
65
  """
61
66
  if corrected_only and selector:
62
67
  logger.warning(
@@ -76,10 +81,12 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
76
81
  self.split_ratio = split_ratio
77
82
  self.include_guide = include_guide
78
83
  self.random_seed = random_seed
84
+ self.min_char_freq = min_char_freq
79
85
 
80
86
  logger.info(
81
87
  f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
82
- f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
88
+ f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}, "
89
+ f"min_char_freq={min_char_freq}"
83
90
  )
84
91
 
85
92
  def export(
@@ -114,7 +121,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
114
121
 
115
122
  # --- 2. Collect Elements and Render Images ---
116
123
  labels: List[Tuple[str, str]] = [] # List of (relative_image_path, text_label)
117
- char_set: Set[str] = set()
124
+ char_counts: collections.Counter = collections.Counter()
118
125
  elements_processed = 0
119
126
  elements_skipped = 0
120
127
 
@@ -200,7 +207,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
200
207
  labels.append(
201
208
  (relative_image_path.replace(os.path.sep, "/"), element_text)
202
209
  ) # Use forward slashes for labels
203
- char_set.update(element_text)
210
+ char_counts.update(element_text)
204
211
  elements_processed += 1
205
212
 
206
213
  except Exception as e:
@@ -226,15 +233,48 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
226
233
 
227
234
  logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
228
235
 
236
+ # --- 2.5 Filter based on character frequency ---
237
+ if self.min_char_freq > 1:
238
+ logger.info(f"Filtering elements based on min_char_freq: {self.min_char_freq}")
239
+ original_label_count = len(labels)
240
+ rare_chars = {char for char, count in char_counts.items() if count < self.min_char_freq}
241
+ if rare_chars:
242
+ logger.info(f"Identified {len(rare_chars)} rare characters: {rare_chars}")
243
+ filtered_labels = []
244
+ for img_path, text in labels:
245
+ if any(char in rare_chars for char in text):
246
+ elements_skipped += 1 # Count these as skipped due to rare chars
247
+ elements_processed -=1 # Decrement from processed as it's now being skipped
248
+ else:
249
+ filtered_labels.append((img_path, text))
250
+
251
+ labels_removed_count = original_label_count - len(filtered_labels)
252
+ if labels_removed_count > 0:
253
+ logger.info(f"Removed {labels_removed_count} elements containing rare characters.")
254
+ labels = filtered_labels
255
+
256
+ # Recalculate char_counts based on filtered_labels to update the dictionary
257
+ char_counts.clear()
258
+ for _, text in labels:
259
+ char_counts.update(text)
260
+
261
+ if not labels:
262
+ logger.error(
263
+ "All elements were removed after character frequency filtering. Aborting."
264
+ )
265
+ return
266
+ else:
267
+ logger.info("No rare characters found below the frequency threshold.")
268
+
269
+
229
270
  # --- 3. Generate Dictionary File (`dict.txt`) ---
230
271
  dict_path = os.path.join(output_dir, "dict.txt")
231
272
  try:
232
273
  # Log the character set before sorting/writing
233
- logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
234
- # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
235
- # for recognition models, but this might depend on the specific base model.
236
- # Start with just the characters found.
237
- sorted_chars = sorted(list(char_set), reverse=True)
274
+ final_chars_for_dict = set(char_counts.keys()) # Use keys from potentially filtered char_counts
275
+ logger.debug(f"Exporter final char_set for dict: {repr(final_chars_for_dict)}")
276
+
277
+ sorted_chars = sorted(list(final_chars_for_dict)) # No specific sorting order needed, just make it consistent
238
278
  with open(dict_path, "w", encoding="utf-8") as f_dict:
239
279
  for char in sorted_chars:
240
280
  # Ensure we don't write empty strings or just newlines as dictionary entries
@@ -0,0 +1,12 @@
1
+ from .flow import Flow
2
+ from .element import FlowElement
3
+ from .region import FlowRegion
4
+ from .collections import FlowElementCollection, FlowRegionCollection
5
+
6
+ __all__ = [
7
+ "Flow",
8
+ "FlowElement",
9
+ "FlowRegion",
10
+ "FlowElementCollection",
11
+ "FlowRegionCollection",
12
+ ]
@@ -0,0 +1,533 @@
1
+ import logging
2
+ from collections.abc import MutableSequence
3
+ from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union
4
+
5
+ from PIL import Image # Single import for PIL.Image module
6
+
7
+ if TYPE_CHECKING:
8
+ # from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
9
+ from natural_pdf.elements.base import Element as PhysicalElement
10
+ from natural_pdf.elements.collections import ElementCollection
11
+ from natural_pdf.core.page import Page as PhysicalPage
12
+ from .element import FlowElement
13
+ from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
14
+ from .region import FlowRegion
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ T_FEC = TypeVar("T_FEC", bound="FlowElement")
20
+ T_FRC = TypeVar("T_FRC", bound="FlowRegion")
21
+
22
+
23
+ class FlowElementCollection(MutableSequence[T_FEC]):
24
+ """
25
+ A collection of FlowElement objects, typically the result of Flow.find_all().
26
+ Provides directional methods that operate on its contained FlowElements and
27
+ return FlowRegionCollection objects.
28
+ """
29
+ def __init__(self, flow_elements: List["FlowElement"]):
30
+ self._flow_elements: List["FlowElement"] = flow_elements if flow_elements is not None else []
31
+
32
+ def __getitem__(self, index: int) -> "FlowElement":
33
+ return self._flow_elements[index]
34
+
35
+ def __setitem__(self, index: int, value: "FlowElement") -> None:
36
+ self._flow_elements[index] = value
37
+
38
+ def __delitem__(self, index: int) -> None:
39
+ del self._flow_elements[index]
40
+
41
+ def __len__(self) -> int:
42
+ return len(self._flow_elements)
43
+
44
+ def insert(self, index: int, value: "FlowElement") -> None:
45
+ self._flow_elements.insert(index, value)
46
+
47
+ @property
48
+ def flow_elements(self) -> List["FlowElement"]:
49
+ return self._flow_elements
50
+
51
+ @property
52
+ def first(self) -> Optional["FlowElement"]:
53
+ return self._flow_elements[0] if self._flow_elements else None
54
+
55
+ @property
56
+ def last(self) -> Optional["FlowElement"]:
57
+ return self._flow_elements[-1] if self._flow_elements else None
58
+
59
+ def __repr__(self) -> str:
60
+ return f"<FlowElementCollection(count={len(self)})>"
61
+
62
+ def _execute_directional_on_all(self, method_name: str, **kwargs) -> "FlowRegionCollection":
63
+ results: List["FlowRegion"] = []
64
+ if not self._flow_elements:
65
+ return FlowRegionCollection([]) # Return empty FlowRegionCollection
66
+
67
+ # Assuming all flow_elements share the same flow context
68
+ # (which should be true if they came from the same Flow.find_all())
69
+
70
+ for fe in self._flow_elements:
71
+ method_to_call = getattr(fe, method_name)
72
+ flow_region_result: "FlowRegion" = method_to_call(**kwargs)
73
+ # FlowElement directional methods always return a FlowRegion (even if empty)
74
+ results.append(flow_region_result)
75
+ return FlowRegionCollection(results)
76
+
77
+ def above(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
78
+ width_absolute: Optional[float] = None, width_alignment: str = "center",
79
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
80
+ return self._execute_directional_on_all(
81
+ "above", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
82
+ width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
83
+ )
84
+
85
+ def below(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
86
+ width_absolute: Optional[float] = None, width_alignment: str = "center",
87
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
88
+ return self._execute_directional_on_all(
89
+ "below", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
90
+ width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
91
+ )
92
+
93
+ def left(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
94
+ height_absolute: Optional[float] = None, height_alignment: str = "center",
95
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
96
+ return self._execute_directional_on_all(
97
+ "left", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
98
+ height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
99
+ )
100
+
101
+ def right(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
102
+ height_absolute: Optional[float] = None, height_alignment: str = "center",
103
+ until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
104
+ return self._execute_directional_on_all(
105
+ "right", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
106
+ height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
107
+ )
108
+
109
+ def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
110
+ default_color: Optional[Union[Tuple, str]] = "orange", # A distinct color for FEC show
111
+ label_prefix: Optional[str] = "FEC_Element", width: Optional[int] = None,
112
+ stack_direction: str = "vertical", # "vertical" or "horizontal"
113
+ stack_gap: int = 5, # Gap between stacked page images
114
+ stack_background_color: Tuple[int, int, int] = (255, 255, 255), # Background for stacking
115
+ **kwargs) -> Optional[Image.Image]:
116
+ """
117
+ Shows all FlowElements in this collection by highlighting them on their respective pages.
118
+ If multiple pages are involved, they are stacked into a single image.
119
+ """
120
+ if not self._flow_elements:
121
+ logger.info("FlowElementCollection.show() called on an empty collection.")
122
+ return None
123
+
124
+ # Group flow elements by their physical page
125
+ elements_by_page: dict["PhysicalPage", List["FlowElement"]] = {}
126
+ for flow_element in self._flow_elements:
127
+ page_obj = flow_element.page
128
+ if page_obj:
129
+ if page_obj not in elements_by_page:
130
+ elements_by_page[page_obj] = []
131
+ elements_by_page[page_obj].append(flow_element)
132
+ else:
133
+ raise ValueError(f"FlowElement {flow_element} has no page.")
134
+
135
+ if not elements_by_page:
136
+ logger.info("FlowElementCollection.show() found no flow elements with associated pages.")
137
+ return None
138
+
139
+ # Get a highlighter service from the first page
140
+ first_page_with_elements = next(iter(elements_by_page.keys()), None)
141
+ highlighter_service = None
142
+ if first_page_with_elements and hasattr(first_page_with_elements, '_highlighter'):
143
+ highlighter_service = first_page_with_elements._highlighter
144
+
145
+ if not highlighter_service:
146
+ raise ValueError(
147
+ "Cannot get highlighter service for FlowElementCollection.show(). "
148
+ "Ensure flow elements' pages are initialized with a highlighter."
149
+ )
150
+
151
+ output_page_images: List[Image.Image] = []
152
+
153
+ # Sort pages by index for consistent output order
154
+ sorted_pages = sorted(elements_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
155
+
156
+ # Render each page with its relevant flow elements highlighted
157
+ for page_idx, page_obj in enumerate(sorted_pages):
158
+ flow_elements_on_this_page = elements_by_page[page_obj]
159
+ if not flow_elements_on_this_page:
160
+ continue
161
+
162
+ temp_highlights_for_page = []
163
+ for i, flow_element in enumerate(flow_elements_on_this_page):
164
+ element_label = None
165
+ if labels and label_prefix:
166
+ count_indicator = ""
167
+ if len(self._flow_elements) > 1:
168
+ # Find global index of this flow_element in self._flow_elements
169
+ try:
170
+ global_idx = self._flow_elements.index(flow_element)
171
+ count_indicator = f"_{global_idx + 1}"
172
+ except ValueError:
173
+ count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
174
+ elif len(flow_elements_on_this_page) > 1:
175
+ count_indicator = f"_{i+1}"
176
+
177
+ element_label = f"{label_prefix}{count_indicator}" if label_prefix else None
178
+
179
+ temp_highlights_for_page.append({
180
+ "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
181
+ "bbox": flow_element.bbox,
182
+ "polygon": getattr(flow_element.physical_object, 'polygon', None) if hasattr(flow_element.physical_object, 'has_polygon') and flow_element.physical_object.has_polygon else None,
183
+ "color": default_color,
184
+ "label": element_label,
185
+ "use_color_cycling": False,
186
+ })
187
+
188
+ if not temp_highlights_for_page:
189
+ continue
190
+
191
+ page_image = highlighter_service.render_preview(
192
+ page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
193
+ temporary_highlights=temp_highlights_for_page,
194
+ scale=scale,
195
+ width=width,
196
+ labels=labels,
197
+ legend_position=legend_position,
198
+ **kwargs
199
+ )
200
+ if page_image:
201
+ output_page_images.append(page_image)
202
+
203
+ # Stack the generated page images if multiple
204
+ if not output_page_images:
205
+ logger.info("FlowElementCollection.show() produced no page images to concatenate.")
206
+ return None
207
+
208
+ if len(output_page_images) == 1:
209
+ return output_page_images[0]
210
+
211
+ # Stacking logic (same as in FlowRegionCollection.show)
212
+ if stack_direction == "vertical":
213
+ final_width = max(img.width for img in output_page_images)
214
+ final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
215
+ if final_width == 0 or final_height == 0:
216
+ raise ValueError("Cannot create concatenated image with zero width or height.")
217
+
218
+ concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
219
+ current_y = 0
220
+ for img in output_page_images:
221
+ paste_x = (final_width - img.width) // 2
222
+ concatenated_image.paste(img, (paste_x, current_y))
223
+ current_y += img.height + stack_gap
224
+ return concatenated_image
225
+ elif stack_direction == "horizontal":
226
+ final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
227
+ final_height = max(img.height for img in output_page_images)
228
+ if final_width == 0 or final_height == 0:
229
+ raise ValueError("Cannot create concatenated image with zero width or height.")
230
+
231
+ concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
232
+ current_x = 0
233
+ for img in output_page_images:
234
+ paste_y = (final_height - img.height) // 2
235
+ concatenated_image.paste(img, (current_x, paste_y))
236
+ current_x += img.width + stack_gap
237
+ return concatenated_image
238
+ else:
239
+ raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowElementCollection.show(). Must be 'vertical' or 'horizontal'.")
240
+
241
+
242
+ class FlowRegionCollection(MutableSequence[T_FRC]):
243
+ """
244
+ A collection of FlowRegion objects, typically the result of directional
245
+ operations on a FlowElementCollection.
246
+ Provides methods for querying and visualizing the aggregated content.
247
+ """
248
+ def __init__(self, flow_regions: List["FlowRegion"]):
249
+ self._flow_regions: List["FlowRegion"] = flow_regions if flow_regions is not None else []
250
+
251
+ def __getitem__(self, index: int) -> "FlowRegion":
252
+ return self._flow_regions[index]
253
+
254
+ def __setitem__(self, index: int, value: "FlowRegion") -> None:
255
+ self._flow_regions[index] = value
256
+
257
+ def __delitem__(self, index: int) -> None:
258
+ del self._flow_regions[index]
259
+
260
+ def __len__(self) -> int:
261
+ return len(self._flow_regions)
262
+
263
+ def insert(self, index: int, value: "FlowRegion") -> None:
264
+ self._flow_regions.insert(index, value)
265
+
266
+ def __repr__(self) -> str:
267
+ return f"<FlowRegionCollection(count={len(self)})>"
268
+
269
+ def __add__(self, other: "FlowRegionCollection") -> "FlowRegionCollection":
270
+ if not isinstance(other, FlowRegionCollection):
271
+ return NotImplemented
272
+ return FlowRegionCollection(self._flow_regions + other._flow_regions)
273
+
274
+ @property
275
+ def flow_regions(self) -> List["FlowRegion"]:
276
+ return self._flow_regions
277
+
278
+ @property
279
+ def first(self) -> Optional["FlowRegion"]:
280
+ return self._flow_regions[0] if self._flow_regions else None
281
+
282
+ @property
283
+ def last(self) -> Optional["FlowRegion"]:
284
+ return self._flow_regions[-1] if self._flow_regions else None
285
+
286
+ @property
287
+ def is_empty(self) -> bool:
288
+ if not self._flow_regions:
289
+ return True
290
+ return all(fr.is_empty for fr in self._flow_regions)
291
+
292
+ def filter(self, func: Callable[["FlowRegion"], bool]) -> "FlowRegionCollection":
293
+ return FlowRegionCollection([fr for fr in self._flow_regions if func(fr)])
294
+
295
+ def sort(self, key: Optional[Callable[["FlowRegion"], Any]] = None, reverse: bool = False) -> "FlowRegionCollection":
296
+ """Sorts the collection in-place. Default sort is by flow order if possible."""
297
+ # A default key could try to sort by first constituent region's page then top/left,
298
+ # but FlowRegions can be complex. For now, require explicit key or rely on list.sort default.
299
+ if key is None:
300
+ # Attempt a sensible default sort: by page of first constituent, then its top, then its x0
301
+ def default_sort_key(fr: "FlowRegion"):
302
+ if fr.constituent_regions:
303
+ first_constituent = fr.constituent_regions[0]
304
+ page_idx = first_constituent.page.index if first_constituent.page else -1
305
+ return (page_idx, first_constituent.top, first_constituent.x0)
306
+ return (float('inf'), float('inf'), float('inf')) # Push empty ones to the end
307
+ self._flow_regions.sort(key=default_sort_key, reverse=reverse)
308
+ else:
309
+ self._flow_regions.sort(key=key, reverse=reverse)
310
+ return self
311
+
312
+ def extract_text(self, separator: str = "\n", apply_exclusions: bool = True, **kwargs) -> str:
313
+ texts = [
314
+ fr.extract_text(apply_exclusions=apply_exclusions, **kwargs)
315
+ for fr in self._flow_regions
316
+ ]
317
+ return separator.join(t for t in texts if t) # Filter out empty strings from concatenation
318
+
319
+ def extract_each_text(self, apply_exclusions: bool = True, **kwargs) -> List[str]:
320
+ return [
321
+ fr.extract_text(apply_exclusions=apply_exclusions, **kwargs)
322
+ for fr in self._flow_regions
323
+ ]
324
+
325
+ def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:
326
+ from natural_pdf.elements.base import Element as PhysicalElement # Runtime import
327
+ for fr in self._flow_regions:
328
+ found = fr.find(selector=selector, text=text, **kwargs)
329
+ if found:
330
+ return found
331
+ return None
332
+
333
+ def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":
334
+ from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection # Runtime import
335
+
336
+ all_physical_elements: List["PhysicalElement"] = []
337
+ for fr in self._flow_regions:
338
+ # FlowRegion.find_all returns an ElementCollection
339
+ elements_in_fr: "RuntimeElementCollection" = fr.find_all(selector=selector, text=text, **kwargs)
340
+ if elements_in_fr: # ElementCollection has boolean True if not empty
341
+ all_physical_elements.extend(elements_in_fr.elements) # Access .elements to get list
342
+
343
+ # Deduplicate while preserving order as much as possible (simple set doesn't preserve order)
344
+ seen = set()
345
+ unique_elements = []
346
+ for el in all_physical_elements:
347
+ if el not in seen:
348
+ unique_elements.append(el)
349
+ seen.add(el)
350
+ return RuntimeElementCollection(unique_elements)
351
+
352
+ def highlight(self, label_prefix: Optional[str] = "FRC", color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegionCollection":
353
+ if not self._flow_regions:
354
+ return self
355
+
356
+ num_flow_regions = len(self._flow_regions)
357
+ for i, fr in enumerate(self._flow_regions):
358
+ current_label = None
359
+ if label_prefix:
360
+ current_label = f"{label_prefix}_{i+1}" if num_flow_regions > 1 else label_prefix
361
+
362
+ # Pass the specific color to each FlowRegion's highlight method.
363
+ # FlowRegion.highlight will then pass it to its constituent regions.
364
+ fr.highlight(label=current_label, color=color, **kwargs)
365
+ return self
366
+
367
+ def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
368
+ default_color: Optional[Union[Tuple, str]] = "darkviolet", # A distinct color for FRC show
369
+ label_prefix: Optional[str] = "FRC_Part", width: Optional[int] = None,
370
+ stack_direction: str = "vertical", # New: "vertical" or "horizontal"
371
+ stack_gap: int = 5, # New: Gap between stacked page images
372
+ stack_background_color: Tuple[int, int, int] = (255, 255, 255), # New: Background for stacking
373
+ **kwargs) -> Optional[Image.Image]: # Return type changed
374
+ if not self._flow_regions:
375
+ logger.info("FlowRegionCollection.show() called on an empty collection.")
376
+ return None # Changed from []
377
+
378
+ regions_by_page: dict["PhysicalPage", List[dict[str, Any]]] = {}
379
+
380
+ first_flow_region = self._flow_regions[0]
381
+ highlighter_service = None
382
+ if first_flow_region and first_flow_region.flow and first_flow_region.flow.segments:
383
+ first_segment_page = first_flow_region.flow.segments[0].page
384
+ if first_segment_page and hasattr(first_segment_page, '_highlighter'):
385
+ highlighter_service = first_segment_page._highlighter
386
+
387
+ if not highlighter_service:
388
+ logger.error("Cannot get highlighter service for FlowRegionCollection.show().")
389
+ return None # Changed from []
390
+
391
+ constituent_idx = 0
392
+ for fr_idx, fr in enumerate(self._flow_regions):
393
+ for constituent_region in fr.constituent_regions:
394
+ page_obj = constituent_region.page
395
+ if not page_obj:
396
+ logger.warning(f"Constituent region {constituent_region.bbox} has no page. Skipping in show().")
397
+ continue
398
+
399
+ if page_obj not in regions_by_page:
400
+ regions_by_page[page_obj] = []
401
+
402
+ part_label = None
403
+ if label_prefix:
404
+ part_label = f"{label_prefix}_{constituent_idx}"
405
+
406
+ regions_by_page[page_obj].append({
407
+ "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
408
+ "bbox": constituent_region.bbox,
409
+ "polygon": constituent_region.polygon if constituent_region.has_polygon else None,
410
+ "color": default_color,
411
+ "label": part_label,
412
+ "use_color_cycling": False,
413
+ })
414
+ constituent_idx += 1
415
+
416
+ output_page_images: List[Image.Image] = []
417
+ sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
418
+
419
+ for page_obj in sorted_pages:
420
+ temp_highlights_for_page = regions_by_page[page_obj]
421
+ if not temp_highlights_for_page: continue
422
+
423
+ page_image = highlighter_service.render_preview(
424
+ page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
425
+ temporary_highlights=temp_highlights_for_page,
426
+ scale=scale,
427
+ width=width,
428
+ labels=labels,
429
+ legend_position=legend_position,
430
+ **kwargs
431
+ )
432
+ if page_image:
433
+ output_page_images.append(page_image)
434
+
435
+ if not output_page_images:
436
+ logger.info("FlowRegionCollection.show() produced no page images to concatenate.")
437
+ return None
438
+
439
+ if len(output_page_images) == 1:
440
+ return output_page_images[0]
441
+
442
+ if stack_direction == "vertical":
443
+ final_width = max(img.width for img in output_page_images)
444
+ final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
445
+ if final_width == 0 or final_height == 0:
446
+ logger.warning("Cannot create concatenated image with zero width or height.")
447
+ return None
448
+
449
+ concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
450
+ current_y = 0
451
+ for img in output_page_images:
452
+ paste_x = (final_width - img.width) // 2 # Center horizontally
453
+ concatenated_image.paste(img, (paste_x, current_y))
454
+ current_y += img.height + stack_gap
455
+ return concatenated_image
456
+ elif stack_direction == "horizontal":
457
+ final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
458
+ final_height = max(img.height for img in output_page_images)
459
+ if final_width == 0 or final_height == 0:
460
+ logger.warning("Cannot create concatenated image with zero width or height.")
461
+ return None
462
+
463
+ concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
464
+ current_x = 0
465
+ for img in output_page_images:
466
+ paste_y = (final_height - img.height) // 2 # Center vertically
467
+ concatenated_image.paste(img, (current_x, paste_y))
468
+ current_x += img.width + stack_gap
469
+ return concatenated_image
470
+ else:
471
+ logger.error(f"Invalid stack_direction '{stack_direction}' for FlowRegionCollection.show(). Must be 'vertical' or 'horizontal'.")
472
+ return None
473
+
474
+ def to_images(self, resolution: float = 150, **kwargs) -> List[Image.Image]:
475
+ """Returns a flat list of cropped images of all constituent physical regions."""
476
+ all_cropped_images: List[Image.Image] = []
477
+ for fr in self._flow_regions:
478
+ all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
479
+ return all_cropped_images
480
+
481
+ def to_image(self, stack_direction: str = "vertical", background_color=(255,255,255), gap: int = 5, **kwargs_for_constituent_to_image) -> Optional[Image.Image]:
482
+ """
483
+ Creates a single composite image by stacking the composite images of each FlowRegion.
484
+ Each FlowRegion's composite is generated by its own .to_image() method.
485
+ These are then stacked.
486
+
487
+ Args:
488
+ stack_direction: "vertical" or "horizontal".
489
+ background_color: Background for the final composite.
490
+ gap: Gap in pixels between stacked FlowRegion images.
491
+ **kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().
492
+ """
493
+ if not self._flow_regions: return None
494
+
495
+ region_composites: List[Image.Image] = []
496
+ for fr in self._flow_regions:
497
+ img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
498
+ if img:
499
+ region_composites.append(img)
500
+
501
+ if not region_composites: return None
502
+ if len(region_composites) == 1: return region_composites[0]
503
+
504
+ if stack_direction == "vertical":
505
+ final_width = max(img.width for img in region_composites)
506
+ final_height = sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
507
+ if final_width == 0 or final_height == 0: return None
508
+
509
+ new_image = Image.new("RGB", (final_width, final_height), background_color)
510
+ current_y = 0
511
+ for img in region_composites:
512
+ # Align to left for vertical stacking
513
+ new_image.paste(img, (0, current_y))
514
+ current_y += img.height + gap
515
+ return new_image
516
+ elif stack_direction == "horizontal":
517
+ final_width = sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
518
+ final_height = max(img.height for img in region_composites)
519
+ if final_width == 0 or final_height == 0: return None
520
+
521
+ new_image = Image.new("RGB", (final_width, final_height), background_color)
522
+ current_x = 0
523
+ for img in region_composites:
524
+ # Align to top for horizontal stacking
525
+ new_image.paste(img, (current_x, 0))
526
+ current_x += img.width + gap
527
+ return new_image
528
+ else:
529
+ logger.warning(f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'.")
530
+ return None # Or perhaps return the list of images?
531
+
532
+ def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
533
+ return [func(fr) for fr in self._flow_regions]