natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -2,15 +2,16 @@ import logging
2
2
  from collections.abc import MutableSequence
3
3
  from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union
4
4
 
5
- from PIL import Image # Single import for PIL.Image module
5
+ from PIL import Image # Single import for PIL.Image module
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  # from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
9
+ from natural_pdf.core.page import Page as PhysicalPage
9
10
  from natural_pdf.elements.base import Element as PhysicalElement
10
11
  from natural_pdf.elements.collections import ElementCollection
11
- from natural_pdf.core.page import Page as PhysicalPage
12
+
12
13
  from .element import FlowElement
13
- from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
14
+ from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
14
15
  from .region import FlowRegion
15
16
 
16
17
 
@@ -26,8 +27,11 @@ class FlowElementCollection(MutableSequence[T_FEC]):
26
27
  Provides directional methods that operate on its contained FlowElements and
27
28
  return FlowRegionCollection objects.
28
29
  """
30
+
29
31
  def __init__(self, flow_elements: List["FlowElement"]):
30
- self._flow_elements: List["FlowElement"] = flow_elements if flow_elements is not None else []
32
+ self._flow_elements: List["FlowElement"] = (
33
+ flow_elements if flow_elements is not None else []
34
+ )
31
35
 
32
36
  def __getitem__(self, index: int) -> "FlowElement":
33
37
  return self._flow_elements[index]
@@ -62,11 +66,11 @@ class FlowElementCollection(MutableSequence[T_FEC]):
62
66
  def _execute_directional_on_all(self, method_name: str, **kwargs) -> "FlowRegionCollection":
63
67
  results: List["FlowRegion"] = []
64
68
  if not self._flow_elements:
65
- return FlowRegionCollection([]) # Return empty FlowRegionCollection
69
+ return FlowRegionCollection([]) # Return empty FlowRegionCollection
66
70
 
67
71
  # Assuming all flow_elements share the same flow context
68
72
  # (which should be true if they came from the same Flow.find_all())
69
-
73
+
70
74
  for fe in self._flow_elements:
71
75
  method_to_call = getattr(fe, method_name)
72
76
  flow_region_result: "FlowRegion" = method_to_call(**kwargs)
@@ -74,45 +78,103 @@ class FlowElementCollection(MutableSequence[T_FEC]):
74
78
  results.append(flow_region_result)
75
79
  return FlowRegionCollection(results)
76
80
 
77
- def above(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
78
- width_absolute: Optional[float] = None, width_alignment: str = "center",
79
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
81
+ def above(
82
+ self,
83
+ height: Optional[float] = None,
84
+ width_ratio: Optional[float] = None,
85
+ width_absolute: Optional[float] = None,
86
+ width_alignment: str = "center",
87
+ until: Optional[str] = None,
88
+ include_endpoint: bool = True,
89
+ **kwargs,
90
+ ) -> "FlowRegionCollection":
80
91
  return self._execute_directional_on_all(
81
- "above", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
82
- width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
92
+ "above",
93
+ height=height,
94
+ width_ratio=width_ratio,
95
+ width_absolute=width_absolute,
96
+ width_alignment=width_alignment,
97
+ until=until,
98
+ include_endpoint=include_endpoint,
99
+ **kwargs,
83
100
  )
84
101
 
85
- def below(self, height: Optional[float] = None, width_ratio: Optional[float] = None,
86
- width_absolute: Optional[float] = None, width_alignment: str = "center",
87
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
102
+ def below(
103
+ self,
104
+ height: Optional[float] = None,
105
+ width_ratio: Optional[float] = None,
106
+ width_absolute: Optional[float] = None,
107
+ width_alignment: str = "center",
108
+ until: Optional[str] = None,
109
+ include_endpoint: bool = True,
110
+ **kwargs,
111
+ ) -> "FlowRegionCollection":
88
112
  return self._execute_directional_on_all(
89
- "below", height=height, width_ratio=width_ratio, width_absolute=width_absolute,
90
- width_alignment=width_alignment, until=until, include_endpoint=include_endpoint, **kwargs
113
+ "below",
114
+ height=height,
115
+ width_ratio=width_ratio,
116
+ width_absolute=width_absolute,
117
+ width_alignment=width_alignment,
118
+ until=until,
119
+ include_endpoint=include_endpoint,
120
+ **kwargs,
91
121
  )
92
122
 
93
- def left(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
94
- height_absolute: Optional[float] = None, height_alignment: str = "center",
95
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
123
+ def left(
124
+ self,
125
+ width: Optional[float] = None,
126
+ height_ratio: Optional[float] = None,
127
+ height_absolute: Optional[float] = None,
128
+ height_alignment: str = "center",
129
+ until: Optional[str] = None,
130
+ include_endpoint: bool = True,
131
+ **kwargs,
132
+ ) -> "FlowRegionCollection":
96
133
  return self._execute_directional_on_all(
97
- "left", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
98
- height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
134
+ "left",
135
+ width=width,
136
+ height_ratio=height_ratio,
137
+ height_absolute=height_absolute,
138
+ height_alignment=height_alignment,
139
+ until=until,
140
+ include_endpoint=include_endpoint,
141
+ **kwargs,
99
142
  )
100
143
 
101
- def right(self, width: Optional[float] = None, height_ratio: Optional[float] = None,
102
- height_absolute: Optional[float] = None, height_alignment: str = "center",
103
- until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> "FlowRegionCollection":
144
+ def right(
145
+ self,
146
+ width: Optional[float] = None,
147
+ height_ratio: Optional[float] = None,
148
+ height_absolute: Optional[float] = None,
149
+ height_alignment: str = "center",
150
+ until: Optional[str] = None,
151
+ include_endpoint: bool = True,
152
+ **kwargs,
153
+ ) -> "FlowRegionCollection":
104
154
  return self._execute_directional_on_all(
105
- "right", width=width, height_ratio=height_ratio, height_absolute=height_absolute,
106
- height_alignment=height_alignment, until=until, include_endpoint=include_endpoint, **kwargs
155
+ "right",
156
+ width=width,
157
+ height_ratio=height_ratio,
158
+ height_absolute=height_absolute,
159
+ height_alignment=height_alignment,
160
+ until=until,
161
+ include_endpoint=include_endpoint,
162
+ **kwargs,
107
163
  )
108
164
 
109
- def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
110
- default_color: Optional[Union[Tuple, str]] = "orange", # A distinct color for FEC show
111
- label_prefix: Optional[str] = "FEC_Element", width: Optional[int] = None,
112
- stack_direction: str = "vertical", # "vertical" or "horizontal"
113
- stack_gap: int = 5, # Gap between stacked page images
114
- stack_background_color: Tuple[int, int, int] = (255, 255, 255), # Background for stacking
115
- **kwargs) -> Optional[Image.Image]:
165
+ def show(
166
+ self,
167
+ scale: float = 2.0,
168
+ labels: bool = True,
169
+ legend_position: str = "right",
170
+ default_color: Optional[Union[Tuple, str]] = "orange", # A distinct color for FEC show
171
+ label_prefix: Optional[str] = "FEC_Element",
172
+ width: Optional[int] = None,
173
+ stack_direction: str = "vertical", # "vertical" or "horizontal"
174
+ stack_gap: int = 5, # Gap between stacked page images
175
+ stack_background_color: Tuple[int, int, int] = (255, 255, 255), # Background for stacking
176
+ **kwargs,
177
+ ) -> Optional[Image.Image]:
116
178
  """
117
179
  Shows all FlowElements in this collection by highlighting them on their respective pages.
118
180
  If multiple pages are involved, they are stacked into a single image.
@@ -133,15 +195,17 @@ class FlowElementCollection(MutableSequence[T_FEC]):
133
195
  raise ValueError(f"FlowElement {flow_element} has no page.")
134
196
 
135
197
  if not elements_by_page:
136
- logger.info("FlowElementCollection.show() found no flow elements with associated pages.")
198
+ logger.info(
199
+ "FlowElementCollection.show() found no flow elements with associated pages."
200
+ )
137
201
  return None
138
202
 
139
203
  # Get a highlighter service from the first page
140
204
  first_page_with_elements = next(iter(elements_by_page.keys()), None)
141
205
  highlighter_service = None
142
- if first_page_with_elements and hasattr(first_page_with_elements, '_highlighter'):
206
+ if first_page_with_elements and hasattr(first_page_with_elements, "_highlighter"):
143
207
  highlighter_service = first_page_with_elements._highlighter
144
-
208
+
145
209
  if not highlighter_service:
146
210
  raise ValueError(
147
211
  "Cannot get highlighter service for FlowElementCollection.show(). "
@@ -149,9 +213,12 @@ class FlowElementCollection(MutableSequence[T_FEC]):
149
213
  )
150
214
 
151
215
  output_page_images: List[Image.Image] = []
152
-
216
+
153
217
  # Sort pages by index for consistent output order
154
- sorted_pages = sorted(elements_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
218
+ sorted_pages = sorted(
219
+ elements_by_page.keys(),
220
+ key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
221
+ )
155
222
 
156
223
  # Render each page with its relevant flow elements highlighted
157
224
  for page_idx, page_obj in enumerate(sorted_pages):
@@ -170,32 +237,47 @@ class FlowElementCollection(MutableSequence[T_FEC]):
170
237
  global_idx = self._flow_elements.index(flow_element)
171
238
  count_indicator = f"_{global_idx + 1}"
172
239
  except ValueError:
173
- count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
240
+ count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
174
241
  elif len(flow_elements_on_this_page) > 1:
175
- count_indicator = f"_{i+1}"
242
+ count_indicator = f"_{i+1}"
176
243
 
177
244
  element_label = f"{label_prefix}{count_indicator}" if label_prefix else None
178
-
179
- temp_highlights_for_page.append({
180
- "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
181
- "bbox": flow_element.bbox,
182
- "polygon": getattr(flow_element.physical_object, 'polygon', None) if hasattr(flow_element.physical_object, 'has_polygon') and flow_element.physical_object.has_polygon else None,
183
- "color": default_color,
184
- "label": element_label,
185
- "use_color_cycling": False,
186
- })
187
-
245
+
246
+ temp_highlights_for_page.append(
247
+ {
248
+ "page_index": (
249
+ page_obj.index
250
+ if hasattr(page_obj, "index")
251
+ else getattr(page_obj, "page_number", 1) - 1
252
+ ),
253
+ "bbox": flow_element.bbox,
254
+ "polygon": (
255
+ getattr(flow_element.physical_object, "polygon", None)
256
+ if hasattr(flow_element.physical_object, "has_polygon")
257
+ and flow_element.physical_object.has_polygon
258
+ else None
259
+ ),
260
+ "color": default_color,
261
+ "label": element_label,
262
+ "use_color_cycling": False,
263
+ }
264
+ )
265
+
188
266
  if not temp_highlights_for_page:
189
267
  continue
190
268
 
191
269
  page_image = highlighter_service.render_preview(
192
- page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
270
+ page_index=(
271
+ page_obj.index
272
+ if hasattr(page_obj, "index")
273
+ else getattr(page_obj, "page_number", 1) - 1
274
+ ),
193
275
  temporary_highlights=temp_highlights_for_page,
194
276
  scale=scale,
195
277
  width=width,
196
278
  labels=labels,
197
279
  legend_position=legend_position,
198
- **kwargs
280
+ **kwargs,
199
281
  )
200
282
  if page_image:
201
283
  output_page_images.append(page_image)
@@ -204,18 +286,23 @@ class FlowElementCollection(MutableSequence[T_FEC]):
204
286
  if not output_page_images:
205
287
  logger.info("FlowElementCollection.show() produced no page images to concatenate.")
206
288
  return None
207
-
289
+
208
290
  if len(output_page_images) == 1:
209
291
  return output_page_images[0]
210
292
 
211
293
  # Stacking logic (same as in FlowRegionCollection.show)
212
294
  if stack_direction == "vertical":
213
295
  final_width = max(img.width for img in output_page_images)
214
- final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
215
- if final_width == 0 or final_height == 0:
296
+ final_height = (
297
+ sum(img.height for img in output_page_images)
298
+ + (len(output_page_images) - 1) * stack_gap
299
+ )
300
+ if final_width == 0 or final_height == 0:
216
301
  raise ValueError("Cannot create concatenated image with zero width or height.")
217
-
218
- concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
302
+
303
+ concatenated_image = Image.new(
304
+ "RGB", (final_width, final_height), stack_background_color
305
+ )
219
306
  current_y = 0
220
307
  for img in output_page_images:
221
308
  paste_x = (final_width - img.width) // 2
@@ -223,12 +310,17 @@ class FlowElementCollection(MutableSequence[T_FEC]):
223
310
  current_y += img.height + stack_gap
224
311
  return concatenated_image
225
312
  elif stack_direction == "horizontal":
226
- final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
313
+ final_width = (
314
+ sum(img.width for img in output_page_images)
315
+ + (len(output_page_images) - 1) * stack_gap
316
+ )
227
317
  final_height = max(img.height for img in output_page_images)
228
318
  if final_width == 0 or final_height == 0:
229
319
  raise ValueError("Cannot create concatenated image with zero width or height.")
230
320
 
231
- concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
321
+ concatenated_image = Image.new(
322
+ "RGB", (final_width, final_height), stack_background_color
323
+ )
232
324
  current_x = 0
233
325
  for img in output_page_images:
234
326
  paste_y = (final_height - img.height) // 2
@@ -236,7 +328,9 @@ class FlowElementCollection(MutableSequence[T_FEC]):
236
328
  current_x += img.width + stack_gap
237
329
  return concatenated_image
238
330
  else:
239
- raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowElementCollection.show(). Must be 'vertical' or 'horizontal'.")
331
+ raise ValueError(
332
+ f"Invalid stack_direction '{stack_direction}' for FlowElementCollection.show(). Must be 'vertical' or 'horizontal'."
333
+ )
240
334
 
241
335
 
242
336
  class FlowRegionCollection(MutableSequence[T_FRC]):
@@ -245,6 +339,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
245
339
  operations on a FlowElementCollection.
246
340
  Provides methods for querying and visualizing the aggregated content.
247
341
  """
342
+
248
343
  def __init__(self, flow_regions: List["FlowRegion"]):
249
344
  self._flow_regions: List["FlowRegion"] = flow_regions if flow_regions is not None else []
250
345
 
@@ -292,7 +387,9 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
292
387
  def filter(self, func: Callable[["FlowRegion"], bool]) -> "FlowRegionCollection":
293
388
  return FlowRegionCollection([fr for fr in self._flow_regions if func(fr)])
294
389
 
295
- def sort(self, key: Optional[Callable[["FlowRegion"], Any]] = None, reverse: bool = False) -> "FlowRegionCollection":
390
+ def sort(
391
+ self, key: Optional[Callable[["FlowRegion"], Any]] = None, reverse: bool = False
392
+ ) -> "FlowRegionCollection":
296
393
  """Sorts the collection in-place. Default sort is by flow order if possible."""
297
394
  # A default key could try to sort by first constituent region's page then top/left,
298
395
  # but FlowRegions can be complex. For now, require explicit key or rely on list.sort default.
@@ -303,18 +400,19 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
303
400
  first_constituent = fr.constituent_regions[0]
304
401
  page_idx = first_constituent.page.index if first_constituent.page else -1
305
402
  return (page_idx, first_constituent.top, first_constituent.x0)
306
- return (float('inf'), float('inf'), float('inf')) # Push empty ones to the end
403
+ return (float("inf"), float("inf"), float("inf")) # Push empty ones to the end
404
+
307
405
  self._flow_regions.sort(key=default_sort_key, reverse=reverse)
308
406
  else:
309
407
  self._flow_regions.sort(key=key, reverse=reverse)
310
408
  return self
311
-
409
+
312
410
  def extract_text(self, separator: str = "\n", apply_exclusions: bool = True, **kwargs) -> str:
313
411
  texts = [
314
412
  fr.extract_text(apply_exclusions=apply_exclusions, **kwargs)
315
413
  for fr in self._flow_regions
316
414
  ]
317
- return separator.join(t for t in texts if t) # Filter out empty strings from concatenation
415
+ return separator.join(t for t in texts if t) # Filter out empty strings from concatenation
318
416
 
319
417
  def extract_each_text(self, apply_exclusions: bool = True, **kwargs) -> List[str]:
320
418
  return [
@@ -322,24 +420,35 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
322
420
  for fr in self._flow_regions
323
421
  ]
324
422
 
325
- def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]:
326
- from natural_pdf.elements.base import Element as PhysicalElement # Runtime import
423
+ def find(
424
+ self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
425
+ ) -> Optional["PhysicalElement"]:
426
+ from natural_pdf.elements.base import Element as PhysicalElement # Runtime import
427
+
327
428
  for fr in self._flow_regions:
328
429
  found = fr.find(selector=selector, text=text, **kwargs)
329
430
  if found:
330
431
  return found
331
432
  return None
332
433
 
333
- def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection":
334
- from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection # Runtime import
335
-
434
+ def find_all(
435
+ self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
436
+ ) -> "ElementCollection":
437
+ from natural_pdf.elements.collections import (
438
+ ElementCollection as RuntimeElementCollection, # Runtime import
439
+ )
440
+
336
441
  all_physical_elements: List["PhysicalElement"] = []
337
442
  for fr in self._flow_regions:
338
443
  # FlowRegion.find_all returns an ElementCollection
339
- elements_in_fr: "RuntimeElementCollection" = fr.find_all(selector=selector, text=text, **kwargs)
340
- if elements_in_fr: # ElementCollection has boolean True if not empty
341
- all_physical_elements.extend(elements_in_fr.elements) # Access .elements to get list
342
-
444
+ elements_in_fr: "RuntimeElementCollection" = fr.find_all(
445
+ selector=selector, text=text, **kwargs
446
+ )
447
+ if elements_in_fr: # ElementCollection has boolean True if not empty
448
+ all_physical_elements.extend(
449
+ elements_in_fr.elements
450
+ ) # Access .elements to get list
451
+
343
452
  # Deduplicate while preserving order as much as possible (simple set doesn't preserve order)
344
453
  seen = set()
345
454
  unique_elements = []
@@ -349,126 +458,171 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
349
458
  seen.add(el)
350
459
  return RuntimeElementCollection(unique_elements)
351
460
 
352
- def highlight(self, label_prefix: Optional[str] = "FRC", color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegionCollection":
461
+ def highlight(
462
+ self,
463
+ label_prefix: Optional[str] = "FRC",
464
+ color: Optional[Union[Tuple, str]] = None,
465
+ **kwargs,
466
+ ) -> "FlowRegionCollection":
353
467
  if not self._flow_regions:
354
468
  return self
355
-
469
+
356
470
  num_flow_regions = len(self._flow_regions)
357
471
  for i, fr in enumerate(self._flow_regions):
358
472
  current_label = None
359
473
  if label_prefix:
360
474
  current_label = f"{label_prefix}_{i+1}" if num_flow_regions > 1 else label_prefix
361
-
475
+
362
476
  # Pass the specific color to each FlowRegion's highlight method.
363
477
  # FlowRegion.highlight will then pass it to its constituent regions.
364
478
  fr.highlight(label=current_label, color=color, **kwargs)
365
479
  return self
366
480
 
367
- def show(self, scale: float = 2.0, labels: bool = True, legend_position: str = "right",
368
- default_color: Optional[Union[Tuple, str]] = "darkviolet", # A distinct color for FRC show
369
- label_prefix: Optional[str] = "FRC_Part", width: Optional[int] = None,
370
- stack_direction: str = "vertical", # New: "vertical" or "horizontal"
371
- stack_gap: int = 5, # New: Gap between stacked page images
372
- stack_background_color: Tuple[int, int, int] = (255, 255, 255), # New: Background for stacking
373
- **kwargs) -> Optional[Image.Image]: # Return type changed
481
+ def show(
482
+ self,
483
+ scale: float = 2.0,
484
+ labels: bool = True,
485
+ legend_position: str = "right",
486
+ default_color: Optional[Union[Tuple, str]] = "darkviolet", # A distinct color for FRC show
487
+ label_prefix: Optional[str] = "FRC_Part",
488
+ width: Optional[int] = None,
489
+ stack_direction: str = "vertical", # New: "vertical" or "horizontal"
490
+ stack_gap: int = 5, # New: Gap between stacked page images
491
+ stack_background_color: Tuple[int, int, int] = (
492
+ 255,
493
+ 255,
494
+ 255,
495
+ ), # New: Background for stacking
496
+ **kwargs,
497
+ ) -> Optional[Image.Image]: # Return type changed
374
498
  if not self._flow_regions:
375
499
  logger.info("FlowRegionCollection.show() called on an empty collection.")
376
- return None # Changed from []
500
+ return None # Changed from []
377
501
 
378
- regions_by_page: dict["PhysicalPage", List[dict[str, Any]]] = {}
502
+ regions_by_page: dict["PhysicalPage", List[dict[str, Any]]] = {}
379
503
 
380
504
  first_flow_region = self._flow_regions[0]
381
505
  highlighter_service = None
382
506
  if first_flow_region and first_flow_region.flow and first_flow_region.flow.segments:
383
- first_segment_page = first_flow_region.flow.segments[0].page
384
- if first_segment_page and hasattr(first_segment_page, '_highlighter'):
385
- highlighter_service = first_segment_page._highlighter
386
-
507
+ first_segment_page = first_flow_region.flow.segments[0].page
508
+ if first_segment_page and hasattr(first_segment_page, "_highlighter"):
509
+ highlighter_service = first_segment_page._highlighter
510
+
387
511
  if not highlighter_service:
388
512
  logger.error("Cannot get highlighter service for FlowRegionCollection.show().")
389
- return None # Changed from []
513
+ return None # Changed from []
390
514
 
391
515
  constituent_idx = 0
392
516
  for fr_idx, fr in enumerate(self._flow_regions):
393
517
  for constituent_region in fr.constituent_regions:
394
518
  page_obj = constituent_region.page
395
519
  if not page_obj:
396
- logger.warning(f"Constituent region {constituent_region.bbox} has no page. Skipping in show().")
520
+ logger.warning(
521
+ f"Constituent region {constituent_region.bbox} has no page. Skipping in show()."
522
+ )
397
523
  continue
398
524
 
399
525
  if page_obj not in regions_by_page:
400
526
  regions_by_page[page_obj] = []
401
-
527
+
402
528
  part_label = None
403
529
  if label_prefix:
404
530
  part_label = f"{label_prefix}_{constituent_idx}"
405
-
406
- regions_by_page[page_obj].append({
407
- "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
408
- "bbox": constituent_region.bbox,
409
- "polygon": constituent_region.polygon if constituent_region.has_polygon else None,
410
- "color": default_color,
411
- "label": part_label,
412
- "use_color_cycling": False,
413
- })
531
+
532
+ regions_by_page[page_obj].append(
533
+ {
534
+ "page_index": (
535
+ page_obj.index
536
+ if hasattr(page_obj, "index")
537
+ else getattr(page_obj, "page_number", 1) - 1
538
+ ),
539
+ "bbox": constituent_region.bbox,
540
+ "polygon": (
541
+ constituent_region.polygon if constituent_region.has_polygon else None
542
+ ),
543
+ "color": default_color,
544
+ "label": part_label,
545
+ "use_color_cycling": False,
546
+ }
547
+ )
414
548
  constituent_idx += 1
415
-
549
+
416
550
  output_page_images: List[Image.Image] = []
417
- sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
551
+ sorted_pages = sorted(
552
+ regions_by_page.keys(),
553
+ key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
554
+ )
418
555
 
419
556
  for page_obj in sorted_pages:
420
557
  temp_highlights_for_page = regions_by_page[page_obj]
421
- if not temp_highlights_for_page: continue
558
+ if not temp_highlights_for_page:
559
+ continue
422
560
 
423
561
  page_image = highlighter_service.render_preview(
424
- page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
562
+ page_index=(
563
+ page_obj.index
564
+ if hasattr(page_obj, "index")
565
+ else getattr(page_obj, "page_number", 1) - 1
566
+ ),
425
567
  temporary_highlights=temp_highlights_for_page,
426
568
  scale=scale,
427
569
  width=width,
428
570
  labels=labels,
429
571
  legend_position=legend_position,
430
- **kwargs
572
+ **kwargs,
431
573
  )
432
574
  if page_image:
433
575
  output_page_images.append(page_image)
434
-
576
+
435
577
  if not output_page_images:
436
578
  logger.info("FlowRegionCollection.show() produced no page images to concatenate.")
437
579
  return None
438
-
580
+
439
581
  if len(output_page_images) == 1:
440
582
  return output_page_images[0]
441
583
 
442
584
  if stack_direction == "vertical":
443
585
  final_width = max(img.width for img in output_page_images)
444
- final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
445
- if final_width == 0 or final_height == 0:
586
+ final_height = (
587
+ sum(img.height for img in output_page_images)
588
+ + (len(output_page_images) - 1) * stack_gap
589
+ )
590
+ if final_width == 0 or final_height == 0:
446
591
  logger.warning("Cannot create concatenated image with zero width or height.")
447
592
  return None
448
-
449
- concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
593
+
594
+ concatenated_image = Image.new(
595
+ "RGB", (final_width, final_height), stack_background_color
596
+ )
450
597
  current_y = 0
451
598
  for img in output_page_images:
452
- paste_x = (final_width - img.width) // 2 # Center horizontally
599
+ paste_x = (final_width - img.width) // 2 # Center horizontally
453
600
  concatenated_image.paste(img, (paste_x, current_y))
454
601
  current_y += img.height + stack_gap
455
602
  return concatenated_image
456
603
  elif stack_direction == "horizontal":
457
- final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
604
+ final_width = (
605
+ sum(img.width for img in output_page_images)
606
+ + (len(output_page_images) - 1) * stack_gap
607
+ )
458
608
  final_height = max(img.height for img in output_page_images)
459
- if final_width == 0 or final_height == 0:
609
+ if final_width == 0 or final_height == 0:
460
610
  logger.warning("Cannot create concatenated image with zero width or height.")
461
611
  return None
462
612
 
463
- concatenated_image = Image.new("RGB", (final_width, final_height), stack_background_color)
613
+ concatenated_image = Image.new(
614
+ "RGB", (final_width, final_height), stack_background_color
615
+ )
464
616
  current_x = 0
465
617
  for img in output_page_images:
466
- paste_y = (final_height - img.height) // 2 # Center vertically
618
+ paste_y = (final_height - img.height) // 2 # Center vertically
467
619
  concatenated_image.paste(img, (current_x, paste_y))
468
620
  current_x += img.width + stack_gap
469
621
  return concatenated_image
470
622
  else:
471
- logger.error(f"Invalid stack_direction '{stack_direction}' for FlowRegionCollection.show(). Must be 'vertical' or 'horizontal'.")
623
+ logger.error(
624
+ f"Invalid stack_direction '{stack_direction}' for FlowRegionCollection.show(). Must be 'vertical' or 'horizontal'."
625
+ )
472
626
  return None
473
627
 
474
628
  def to_images(self, resolution: float = 150, **kwargs) -> List[Image.Image]:
@@ -477,8 +631,14 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
477
631
  for fr in self._flow_regions:
478
632
  all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
479
633
  return all_cropped_images
480
-
481
- def to_image(self, stack_direction: str = "vertical", background_color=(255,255,255), gap: int = 5, **kwargs_for_constituent_to_image) -> Optional[Image.Image]:
634
+
635
+ def to_image(
636
+ self,
637
+ stack_direction: str = "vertical",
638
+ background_color=(255, 255, 255),
639
+ gap: int = 5,
640
+ **kwargs_for_constituent_to_image,
641
+ ) -> Optional[Image.Image]:
482
642
  """
483
643
  Creates a single composite image by stacking the composite images of each FlowRegion.
484
644
  Each FlowRegion's composite is generated by its own .to_image() method.
@@ -490,22 +650,28 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
490
650
  gap: Gap in pixels between stacked FlowRegion images.
491
651
  **kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().
492
652
  """
493
- if not self._flow_regions: return None
653
+ if not self._flow_regions:
654
+ return None
494
655
 
495
656
  region_composites: List[Image.Image] = []
496
657
  for fr in self._flow_regions:
497
658
  img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
498
659
  if img:
499
660
  region_composites.append(img)
500
-
501
- if not region_composites: return None
502
- if len(region_composites) == 1: return region_composites[0]
661
+
662
+ if not region_composites:
663
+ return None
664
+ if len(region_composites) == 1:
665
+ return region_composites[0]
503
666
 
504
667
  if stack_direction == "vertical":
505
668
  final_width = max(img.width for img in region_composites)
506
- final_height = sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
507
- if final_width == 0 or final_height == 0: return None
508
-
669
+ final_height = (
670
+ sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
671
+ )
672
+ if final_width == 0 or final_height == 0:
673
+ return None
674
+
509
675
  new_image = Image.new("RGB", (final_width, final_height), background_color)
510
676
  current_y = 0
511
677
  for img in region_composites:
@@ -514,9 +680,12 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
514
680
  current_y += img.height + gap
515
681
  return new_image
516
682
  elif stack_direction == "horizontal":
517
- final_width = sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
683
+ final_width = (
684
+ sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
685
+ )
518
686
  final_height = max(img.height for img in region_composites)
519
- if final_width == 0 or final_height == 0: return None
687
+ if final_width == 0 or final_height == 0:
688
+ return None
520
689
 
521
690
  new_image = Image.new("RGB", (final_width, final_height), background_color)
522
691
  current_x = 0
@@ -526,8 +695,10 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
526
695
  current_x += img.width + gap
527
696
  return new_image
528
697
  else:
529
- logger.warning(f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'.")
530
- return None # Or perhaps return the list of images?
698
+ logger.warning(
699
+ f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'."
700
+ )
701
+ return None # Or perhaps return the list of images?
531
702
 
532
703
  def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
533
- return [func(fr) for fr in self._flow_regions]
704
+ return [func(fr) for fr in self._flow_regions]