natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +226 -70
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +320 -113
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py CHANGED
@@ -3,11 +3,12 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
 
  if TYPE_CHECKING:
  from natural_pdf.core.page import Page
- from natural_pdf.elements.region import Region as PhysicalRegion
  from natural_pdf.elements.base import Element as PhysicalElement
  from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
- from .element import FlowElement
+ from natural_pdf.elements.region import Region as PhysicalRegion
+
  from .collections import FlowElementCollection
+ from .element import FlowElement
 
  logger = logging.getLogger(__name__)
 
@@ -53,14 +54,18 @@ class Flow:
 
  self.segments: List["PhysicalRegion"] = self._normalize_segments(segments)
  self.arrangement: Literal["vertical", "horizontal"] = arrangement
- self.alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = alignment
+ self.alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = (
+ alignment
+ )
  self.segment_gap: float = segment_gap
 
  self._validate_alignment()
 
  # TODO: Pre-calculate segment offsets for faster lookups if needed
 
- def _normalize_segments(self, segments: List[Union["Page", "PhysicalRegion"]]) -> List["PhysicalRegion"]:
+ def _normalize_segments(
+ self, segments: List[Union["Page", "PhysicalRegion"]]
+ ) -> List["PhysicalRegion"]:
  """Converts all Page segments to full-page Region objects for uniform processing."""
  normalized = []
  from natural_pdf.core.page import Page as CorePage
@@ -71,13 +76,17 @@ class Flow:
  normalized.append(segment.region(0, 0, segment.width, segment.height))
  elif isinstance(segment, ElementsRegion):
  normalized.append(segment)
- elif hasattr(segment, 'object_type') and segment.object_type == "page":
+ elif hasattr(segment, "object_type") and segment.object_type == "page":
  if not isinstance(segment, CorePage):
- raise TypeError(f"Segment {i} has object_type 'page' but is not an instance of natural_pdf.core.page.Page. Got {type(segment)}")
+ raise TypeError(
+ f"Segment {i} has object_type 'page' but is not an instance of natural_pdf.core.page.Page. Got {type(segment)}"
+ )
  normalized.append(segment.region(0, 0, segment.width, segment.height))
- elif hasattr(segment, 'object_type') and segment.object_type == "region":
+ elif hasattr(segment, "object_type") and segment.object_type == "region":
  if not isinstance(segment, ElementsRegion):
- raise TypeError(f"Segment {i} has object_type 'region' but is not an instance of natural_pdf.elements.region.Region. Got {type(segment)}")
+ raise TypeError(
+ f"Segment {i} has object_type 'region' but is not an instance of natural_pdf.elements.region.Region. Got {type(segment)}"
+ )
  normalized.append(segment)
  else:
  raise TypeError(
@@ -129,7 +138,7 @@ class Flow:
  apply_exclusions=apply_exclusions,
  regex=regex,
  case=case,
- **kwargs
+ **kwargs,
  )
  return results.first if results else None
 
@@ -172,7 +181,7 @@ class Flow:
  # This preserves the order from matches_in_segment.elements
  for phys_elem in matches_in_segment.elements:
  all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
-
+
  # The global sort that was here previously has been removed.
  # The order is now determined by segment sequence, then by local order within each segment.
 
@@ -184,10 +193,12 @@ class Flow:
  f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
  )
 
- # --- Helper methods for coordinate transformations and segment iteration ---
+ # --- Helper methods for coordinate transformations and segment iteration ---
  # These will be crucial for FlowElement's directional methods.
 
- def get_segment_bounding_box_in_flow(self, segment_index: int) -> Optional[tuple[float, float, float, float]]:
+ def get_segment_bounding_box_in_flow(
+ self, segment_index: int
+ ) -> Optional[tuple[float, float, float, float]]:
  """
  Calculates the conceptual bounding box of a segment within the flow's coordinate system.
  This considers arrangement, alignment, and segment gaps.
@@ -196,15 +207,19 @@ class Flow:
  """
  if segment_index < 0 or segment_index >= len(self.segments):
  return None
-
+
  # This is a simplified version. A full implementation would calculate offsets.
  # For now, we assume FlowElement directional logic handles segment traversal and uses physical coords.
  # If we were to *draw* the flow or get a FlowRegion bbox that spans gaps, this would be critical.
  # physical_segment = self.segments[segment_index]
  # return physical_segment.bbox
- raise NotImplementedError("Calculating a segment's bbox *within the flow's virtual coordinate system* is not yet fully implemented.")
+ raise NotImplementedError(
+ "Calculating a segment's bbox *within the flow's virtual coordinate system* is not yet fully implemented."
+ )
 
- def get_element_flow_coordinates(self, physical_element: "PhysicalElement") -> Optional[tuple[float, float, float, float]]:
+ def get_element_flow_coordinates(
+ self, physical_element: "PhysicalElement"
+ ) -> Optional[tuple[float, float, float, float]]:
  """
  Translates a physical element's coordinates into the flow's virtual coordinate system.
  (Placeholder - very complex if segment_gap > 0 or complex alignments)
@@ -213,4 +228,6 @@ class Flow:
  # if FlowRegion.bbox or other operations needed to present a unified coordinate space.
  # As per our discussion, elements *within* a FlowRegion retain original physical coordinates.
  # So, this might not be strictly necessary for the current design's core functionality.
- raise NotImplementedError("Translating element coordinates to a unified flow coordinate system is not yet implemented.")
+ raise NotImplementedError(
+ "Translating element coordinates to a unified flow coordinate system is not yet implemented."
+ )
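
Note: the flow.py changes above are Black-style reformatting and import reordering; the Flow constructor's parameters (segments, arrangement, alignment, segment_gap) are unchanged. A minimal usage sketch, assuming Flow is importable from natural_pdf.flows as flows/__init__.py suggests; the file name and the text keyword value are illustrative only:

from natural_pdf import PDF
from natural_pdf.flows import Flow

pdf = PDF("report.pdf")  # illustrative file name
flow = Flow(
    segments=pdf.pages[0:2],   # Page segments are normalized to full-page Regions
    arrangement="vertical",    # stack segments top-to-bottom
    alignment="start",
    segment_gap=0.0,
)
first_match = flow.find(text="Total")  # keyword mirrors the find() signature shown above
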
natural_pdf/flows/region.py CHANGED
@@ -1,19 +1,21 @@
  import logging
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
- from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
+ from pdfplumber.utils.geometry import objects_to_bbox  # For calculating combined bbox
 
  # For runtime image manipulation
  from PIL import Image as PIL_Image_Runtime
 
  if TYPE_CHECKING:
- from PIL.Image import Image as PIL_Image # For type hints
+ from PIL.Image import Image as PIL_Image  # For type hints
+
+ from natural_pdf.core.page import Page as PhysicalPage
  from natural_pdf.elements.base import Element as PhysicalElement
- from natural_pdf.elements.region import Region as PhysicalRegion
  from natural_pdf.elements.collections import ElementCollection
- from natural_pdf.core.page import Page as PhysicalPage
- from .flow import Flow
+ from natural_pdf.elements.region import Region as PhysicalRegion
+
  from .element import FlowElement
+ from .flow import Flow
 
  logger = logging.getLogger(__name__)
 
@@ -53,7 +55,7 @@ class FlowRegion:
 
  # Cache for expensive operations
  self._cached_text: Optional[str] = None
- self._cached_elements: Optional["ElementCollection"] = None # Stringized
+ self._cached_elements: Optional["ElementCollection"] = None  # Stringized
  self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
 
  @property
@@ -68,7 +70,7 @@ class FlowRegion:
  return self._cached_bbox
  if not self.constituent_regions:
  return None
-
+
  # Use objects_to_bbox from pdfplumber.utils.geometry to merge bboxes
  # This helper expects a list of objects that have .x0, .top, .x1, .bottom attributes.
  # Our PhysicalRegion objects satisfy this.
@@ -113,7 +115,9 @@ class FlowRegion:
  Returns:
  The combined text content as a string.
  """
- if self._cached_text is not None and apply_exclusions: # Simple cache check, might need refinement if kwargs change behavior
+ if (
+ self._cached_text is not None and apply_exclusions
+ ): # Simple cache check, might need refinement if kwargs change behavior
  return self._cached_text
 
  if not self.constituent_regions:
@@ -124,17 +128,19 @@ class FlowRegion:
  # The FlowElement._flow_direction method is responsible for ordering constituent_regions correctly.
  for region in self.constituent_regions:
  texts.append(region.extract_text(apply_exclusions=apply_exclusions, **kwargs))
-
+
  # Join based on flow arrangement (e.g., newline for vertical, space for horizontal)
  # This is a simplification; true layout-aware joining would be more complex.
- joiner = "\n" if self.flow.arrangement == "vertical" else " " # TODO: Make this smarter, consider segment_gap
+ joiner = (
+ "\n" if self.flow.arrangement == "vertical" else " "
+ ) # TODO: Make this smarter, consider segment_gap
  extracted = joiner.join(t for t in texts if t)
-
- if apply_exclusions: # Only cache if standard exclusion behavior
- self._cached_text = extracted
+
+ if apply_exclusions: # Only cache if standard exclusion behavior
+ self._cached_text = extracted
  return extracted
 
- def elements(self, apply_exclusions: bool = True) -> "ElementCollection": # Stringized return
+ def elements(self, apply_exclusions: bool = True) -> "ElementCollection": # Stringized return
  """
  Collects all unique physical elements from all constituent physical regions.
 
@@ -145,36 +151,44 @@ class FlowRegion:
  Returns:
  An ElementCollection containing all unique elements.
  """
- from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection # Local import
+ from natural_pdf.elements.collections import (
+ ElementCollection as RuntimeElementCollection, # Local import
+ )
 
- if self._cached_elements is not None and apply_exclusions: # Simple cache check
+ if self._cached_elements is not None and apply_exclusions: # Simple cache check
  return self._cached_elements
 
  if not self.constituent_regions:
  return RuntimeElementCollection([])
 
- all_physical_elements: List["PhysicalElement"] = [] # Stringized item type
- seen_elements = set() # To ensure uniqueness if elements are shared or duplicated by region definitions
+ all_physical_elements: List["PhysicalElement"] = []  # Stringized item type
+ seen_elements = (
+ set()
+ ) # To ensure uniqueness if elements are shared or duplicated by region definitions
 
  for region in self.constituent_regions:
  # Region.get_elements() returns a list, not ElementCollection
- elements_in_region: List["PhysicalElement"] = region.get_elements(apply_exclusions=apply_exclusions)
+ elements_in_region: List["PhysicalElement"] = region.get_elements(
+ apply_exclusions=apply_exclusions
+ )
  for elem in elements_in_region:
- if elem not in seen_elements: # Check for uniqueness based on object identity
+ if elem not in seen_elements: # Check for uniqueness based on object identity
  all_physical_elements.append(elem)
  seen_elements.add(elem)
 
  # Basic reading order sort based on original page and coordinates.
- def get_sort_key(phys_elem: "PhysicalElement"): # Stringized param type
+ def get_sort_key(phys_elem: "PhysicalElement"): # Stringized param type
  page_idx = -1
- if hasattr(phys_elem, 'page') and hasattr(phys_elem.page, 'index'):
+ if hasattr(phys_elem, "page") and hasattr(phys_elem.page, "index"):
  page_idx = phys_elem.page.index
  return (page_idx, phys_elem.top, phys_elem.x0)
 
  try:
  sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
  except AttributeError:
- logger.warning("Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes.")
+ logger.warning(
+ "Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes."
+ )
  sorted_physical_elements = all_physical_elements
 
  result_collection = RuntimeElementCollection(sorted_physical_elements)
@@ -182,22 +196,30 @@ class FlowRegion:
  self._cached_elements = result_collection
  return result_collection
 
- def find(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> Optional["PhysicalElement"]: # Stringized
+ def find(
+ self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
+ ) -> Optional["PhysicalElement"]: # Stringized
  """
  Finds the first physical element within this FlowRegion that matches the selector or text.
  """
  # Uses self.elements() which respects exclusions if apply_exclusions=True by default
  all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
- return all_elems.find(selector=selector, text=text, **kwargs) # ElementCollection.find
+ return all_elems.find(selector=selector, text=text, **kwargs)  # ElementCollection.find
 
- def find_all(self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs) -> "ElementCollection": # Stringized
+ def find_all(
+ self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
+ ) -> "ElementCollection": # Stringized
  """
  Finds all physical elements within this FlowRegion that match the selector or text.
  """
  all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
- return all_elems.find_all(selector=selector, text=text, **kwargs) # ElementCollection.find_all
+ return all_elems.find_all(
+ selector=selector, text=text, **kwargs
+ ) # ElementCollection.find_all
 
- def highlight(self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs) -> "FlowRegion": # Stringized
+ def highlight(
+ self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
+ ) -> "FlowRegion": # Stringized
  """
  Highlights all constituent physical regions on their respective pages.
 
@@ -214,7 +236,9 @@ class FlowRegion:
 
  base_label = label if label else "FlowRegionPart"
  for i, region in enumerate(self.constituent_regions):
- current_label = f"{base_label}_{i+1}" if len(self.constituent_regions) > 1 else base_label
+ current_label = (
+ f"{base_label}_{i+1}" if len(self.constituent_regions) > 1 else base_label
+ )
  region.highlight(label=current_label, color=color, **kwargs)
  return self
 
@@ -229,7 +253,7 @@ class FlowRegion:
  stack_direction: str = "vertical",
  stack_gap: int = 5,
  stack_background_color: Tuple[int, int, int] = (255, 255, 255),
- **kwargs
+ **kwargs,
  ) -> Optional["PIL_Image"]:
  """
  Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
@@ -256,9 +280,9 @@ class FlowRegion:
  # 2. Get a highlighter service (e.g., from the first page involved)
  first_page_with_regions = next(iter(regions_by_page.keys()), None)
  highlighter_service = None
- if first_page_with_regions and hasattr(first_page_with_regions, '_highlighter'):
+ if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
  highlighter_service = first_page_with_regions._highlighter
-
+
  if not highlighter_service:
  raise ValueError(
  "Cannot get highlighter service for FlowRegion.show(). "
@@ -266,9 +290,12 @@ class FlowRegion:
  )
 
  output_page_images: List["PIL_Image_Runtime"] = []
-
+
  # Sort pages by index for consistent output order
- sorted_pages = sorted(regions_by_page.keys(), key=lambda p: p.index if hasattr(p, 'index') else getattr(p, 'page_number', 0))
+ sorted_pages = sorted(
+ regions_by_page.keys(),
+ key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
+ )
 
  # 3. Render each page with its relevant constituent regions highlighted
  for page_idx, page_obj in enumerate(sorted_pages):
@@ -279,41 +306,55 @@ class FlowRegion:
  temp_highlights_for_page = []
  for i, region_part in enumerate(constituent_regions_on_this_page):
  part_label = None
- if labels and label_prefix: # Ensure labels is True for label_prefix to apply
+ if labels and label_prefix: # Ensure labels is True for label_prefix to apply
  # If FlowRegion consists of multiple parts on this page, or overall
  count_indicator = ""
- if len(self.constituent_regions) > 1 : # If flow region has multiple parts overall
+ if (
+ len(self.constituent_regions) > 1
+ ): # If flow region has multiple parts overall
  # Find global index of this region_part in self.constituent_regions
  try:
  global_idx = self.constituent_regions.index(region_part)
  count_indicator = f"_{global_idx + 1}"
- except ValueError: # Should not happen if region_part is from the list
- count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
- elif len(constituent_regions_on_this_page) > 1 : # If multiple parts on *this* page, but FR is single part overall
- count_indicator = f"_{i+1}"
+ except ValueError: # Should not happen if region_part is from the list
+ count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
+ elif (
+ len(constituent_regions_on_this_page) > 1
+ ): # If multiple parts on *this* page, but FR is single part overall
+ count_indicator = f"_{i+1}"
 
  part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
-
- temp_highlights_for_page.append({
- "page_index": page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
- "bbox": region_part.bbox,
- "polygon": region_part.polygon if region_part.has_polygon else None,
- "color": color, # Use the passed color
- "label": part_label,
- "use_color_cycling": False, # Keep specific color
- })
-
+
+ temp_highlights_for_page.append(
+ {
+ "page_index": (
+ page_obj.index
+ if hasattr(page_obj, "index")
+ else getattr(page_obj, "page_number", 1) - 1
+ ),
+ "bbox": region_part.bbox,
+ "polygon": region_part.polygon if region_part.has_polygon else None,
+ "color": color, # Use the passed color
+ "label": part_label,
+ "use_color_cycling": False, # Keep specific color
+ }
+ )
+
  if not temp_highlights_for_page:
  continue
 
  page_image = highlighter_service.render_preview(
- page_index=page_obj.index if hasattr(page_obj, 'index') else getattr(page_obj, 'page_number', 1) -1,
+ page_index=(
+ page_obj.index
+ if hasattr(page_obj, "index")
+ else getattr(page_obj, "page_number", 1) - 1
+ ),
  temporary_highlights=temp_highlights_for_page,
  scale=scale,
  width=width,
- labels=labels, # Pass through labels
+ labels=labels, # Pass through labels
  legend_position=legend_position,
- **kwargs
+ **kwargs,
  )
  if page_image:
  output_page_images.append(page_image)
@@ -322,18 +363,23 @@ class FlowRegion:
  if not output_page_images:
  logger.info("FlowRegion.show() produced no page images to concatenate.")
  return None
-
+
  if len(output_page_images) == 1:
  return output_page_images[0]
 
  # Stacking logic (same as in FlowRegionCollection.show)
  if stack_direction == "vertical":
  final_width = max(img.width for img in output_page_images)
- final_height = sum(img.height for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
- if final_width == 0 or final_height == 0:
+ final_height = (
+ sum(img.height for img in output_page_images)
+ + (len(output_page_images) - 1) * stack_gap
+ )
+ if final_width == 0 or final_height == 0:
  raise ValueError("Cannot create concatenated image with zero width or height.")
-
- concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
+
+ concatenated_image = PIL_Image_Runtime.new(
+ "RGB", (final_width, final_height), stack_background_color
+ )
  current_y = 0
  for img in output_page_images:
  paste_x = (final_width - img.width) // 2
@@ -341,12 +387,17 @@ class FlowRegion:
  current_y += img.height + stack_gap
  return concatenated_image
  elif stack_direction == "horizontal":
- final_width = sum(img.width for img in output_page_images) + (len(output_page_images) - 1) * stack_gap
+ final_width = (
+ sum(img.width for img in output_page_images)
+ + (len(output_page_images) - 1) * stack_gap
+ )
  final_height = max(img.height for img in output_page_images)
  if final_width == 0 or final_height == 0:
  raise ValueError("Cannot create concatenated image with zero width or height.")
 
- concatenated_image = PIL_Image_Runtime.new("RGB", (final_width, final_height), stack_background_color)
+ concatenated_image = PIL_Image_Runtime.new(
+ "RGB", (final_width, final_height), stack_background_color
+ )
  current_x = 0
  for img in output_page_images:
  paste_y = (final_height - img.height) // 2
@@ -354,15 +405,17 @@ class FlowRegion:
  current_x += img.width + stack_gap
  return concatenated_image
  else:
- raise ValueError(f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'.")
+ raise ValueError(
+ f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
+ )
 
  def to_images(
  self,
  resolution: float = 150,
- **kwargs,
- ) -> List["PIL_Image"]:
+ **kwargs,
+ ) -> List["PIL_Image"]:
  """
- Generates and returns a list of cropped PIL Images,
+ Generates and returns a list of cropped PIL Images,
  one for each constituent physical region of this FlowRegion.
  """
  if not self.constituent_regions:
@@ -373,19 +426,19 @@ class FlowRegion:
  for region_part in self.constituent_regions:
  try:
  img = region_part.to_image(
- resolution=resolution,
- crop_only=True,
- include_highlights=False,
- **kwargs
+ resolution=resolution, crop_only=True, include_highlights=False, **kwargs
  )
  if img:
  cropped_images.append(img)
  except Exception as e:
- logger.error(f"Error generating image for constituent region {region_part.bbox}: {e}", exc_info=True)
-
+ logger.error(
+ f"Error generating image for constituent region {region_part.bbox}: {e}",
+ exc_info=True,
+ )
+
  return cropped_images
 
- def to_image(self, background_color=(255,255,255), **kwargs) -> Optional["PIL_Image"]:
+ def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
  """
  Creates a single composite image by stacking the images of its constituent regions.
  Stacking direction is based on the Flow's arrangement.
@@ -410,23 +463,29 @@ class FlowRegion:
  # Stack vertically
  composite_width = max(img.width for img in images)
  composite_height = sum(img.height for img in images)
- if composite_width == 0 or composite_height == 0: return None # Avoid zero-size image
+ if composite_width == 0 or composite_height == 0:
+ return None # Avoid zero-size image
 
- new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
+ new_image = PIL_Image_Runtime.new(
+ "RGB", (composite_width, composite_height), background_color
+ )
  current_y = 0
  for img in images:
  # Default to left alignment for vertical stacking
  new_image.paste(img, (0, current_y))
  current_y += img.height
  return new_image
-
+
  elif self.flow.arrangement == "horizontal":
  # Stack horizontally
  composite_width = sum(img.width for img in images)
  composite_height = max(img.height for img in images)
- if composite_width == 0 or composite_height == 0: return None
+ if composite_width == 0 or composite_height == 0:
+ return None
 
- new_image = PIL_Image_Runtime.new("RGB", (composite_width, composite_height), background_color)
+ new_image = PIL_Image_Runtime.new(
+ "RGB", (composite_width, composite_height), background_color
+ )
  current_x = 0
  for img in images:
  # Default to top alignment for horizontal stacking
@@ -435,7 +494,9 @@ class FlowRegion:
  return new_image
  else:
  # Should not happen if flow.arrangement is validated
- logger.warning(f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images.")
+ logger.warning(
+ f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images."
+ )
  return None
 
  def __repr__(self) -> str:
@@ -453,6 +514,8 @@ class FlowRegion:
  # For now, if it has regions, it's not considered empty by this simple check.
  # User Point 4: FlowRegion can be empty (no text, no elements). This implies checking content.
  try:
- return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(self.elements(apply_exclusions=False))
+ return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(
+ self.elements(apply_exclusions=False)
+ )
  except Exception:
- return True # If error during check, assume empty to be safe
+ return True  # If error during check, assume empty to be safe
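
Note: FlowRegion.to_image() and show() above both end by pasting per-region (or per-page) images onto a single canvas, vertically or horizontally. A standalone sketch of the vertical case using only PIL, mirroring the zero-size guard and left alignment seen in the diff (the helper name is ours, not the library's):

from typing import List, Optional

from PIL import Image


def stack_vertically(
    images: List[Image.Image], background=(255, 255, 255), gap: int = 0
) -> Optional[Image.Image]:
    """Paste images top-to-bottom on one canvas, left-aligned, with an optional gap."""
    if not images:
        return None
    width = max(img.width for img in images)
    height = sum(img.height for img in images) + (len(images) - 1) * gap
    if width == 0 or height == 0:
        return None  # avoid a zero-size canvas, as the diff does
    canvas = Image.new("RGB", (width, height), background)
    y = 0
    for img in images:
        canvas.paste(img, (0, y))
        y += img.height + gap
    return canvas
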
natural_pdf/ocr/engine_doctr.py CHANGED
@@ -60,10 +60,22 @@ class DoctrOCREngine(OCREngine):
  # Filter out None values
  predictor_args = {k: v for k, v in predictor_args.items() if v is not None}
 
- self.logger.debug(f"doctr ocr_predictor constructor args: {predictor_args}")
+ # Filter only allowed doctr ocr_predictor args
+ allowed_ocr_args = {
+ "det_arch",
+ "reco_arch",
+ "pretrained",
+ "assume_straight_pages",
+ "export_as_straight_boxes",
+ }
+ filtered_ocr_args = {k: v for k, v in predictor_args.items() if k in allowed_ocr_args}
+ dropped_ocr = set(predictor_args) - allowed_ocr_args
+ if dropped_ocr:
+ self.logger.warning(f"Dropped unsupported doctr ocr_predictor args: {dropped_ocr}")
+
+ self.logger.debug(f"doctr ocr_predictor constructor args: {filtered_ocr_args}")
  try:
- # Create the main OCR predictor (doesn't accept batch_size)
- self._model = doctr.models.ocr_predictor(**predictor_args)
+ self._model = doctr.models.ocr_predictor(**filtered_ocr_args)
 
  # Apply CUDA if available
  if use_cuda:
@@ -81,7 +93,28 @@ class DoctrOCREngine(OCREngine):
  "preserve_aspect_ratio": doctr_opts.preserve_aspect_ratio,
  "batch_size": doctr_opts.batch_size,
  }
- self._detection_model = doctr.models.detection_predictor(**detection_args)
+ # Filter out None values
+ detection_args = {k: v for k, v in detection_args.items() if v is not None}
+ allowed_det_args = {
+ "arch",
+ "pretrained",
+ "assume_straight_pages",
+ "symmetric_pad",
+ "preserve_aspect_ratio",
+ "batch_size",
+ }
+ filtered_det_args = {
+ k: v for k, v in detection_args.items() if k in allowed_det_args
+ }
+ dropped_det = set(detection_args) - allowed_det_args
+ if dropped_det:
+ self.logger.warning(
+ f"Dropped unsupported doctr detection_predictor args: {dropped_det}"
+ )
+ self.logger.debug(
+ f"doctr detection_predictor constructor args: {filtered_det_args}"
+ )
+ self._detection_model = doctr.models.detection_predictor(**filtered_det_args)
 
  # Apply CUDA if available
  if use_cuda:
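
Note: both OCR engine changes (doctr here, EasyOCR below) apply the same defensive pattern: drop None values, keep only the keyword arguments the backend constructor is known to accept, and log whatever was discarded. A generic sketch of that pattern, with a helper name of our own choosing:

import logging
from typing import Any, Dict, Iterable

logger = logging.getLogger(__name__)


def whitelist_kwargs(args: Dict[str, Any], allowed: Iterable[str], label: str) -> Dict[str, Any]:
    """Drop None values, keep only whitelisted keys, and warn about anything dropped."""
    non_none = {k: v for k, v in args.items() if v is not None}
    allowed_set = set(allowed)
    kept = {k: v for k, v in non_none.items() if k in allowed_set}
    dropped = set(non_none) - allowed_set
    if dropped:
        logger.warning(f"Dropped unsupported {label} args: {dropped}")
    return kept
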
natural_pdf/ocr/engine_easyocr.py CHANGED
@@ -59,14 +59,34 @@ class EasyOCREngine(OCREngine):
  "cudnn_benchmark": easy_options.cudnn_benchmark,
  }
 
- # Filter out None values, as EasyOCR expects non-None or default behaviour
+ # Filter out None values
  constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
 
- self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
+ # Filter only allowed EasyOCR args
+ allowed_args = {
+ "lang_list",
+ "gpu",
+ "model_storage_directory",
+ "user_network_directory",
+ "recog_network",
+ "detect_network",
+ "download_enabled",
+ "detector",
+ "recognizer",
+ "verbose",
+ "quantize",
+ "cudnn_benchmark",
+ }
+ filtered_args = {k: v for k, v in constructor_args.items() if k in allowed_args}
+ dropped = set(constructor_args) - allowed_args
+ if dropped:
+ self.logger.warning(f"Dropped unsupported EasyOCR args: {dropped}")
+
+ self.logger.debug(f"EasyOCR Reader constructor args: {filtered_args}")
 
  # Create the reader
  try:
- self._model = easyocr.Reader(**constructor_args)
+ self._model = easyocr.Reader(**filtered_args)
  self.logger.info("EasyOCR reader created successfully")
  except Exception as e:
  self.logger.error(f"Failed to create EasyOCR reader: {e}")
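
Note: with the whitelist in place, an unsupported option degrades to a logged warning instead of a TypeError from the Reader constructor. A minimal usage sketch, assuming the easyocr package is installed; the "bogus_option" key is deliberately made up to show the filtering effect:

import easyocr

requested = {"lang_list": ["en"], "gpu": False, "bogus_option": 1}  # "bogus_option" is hypothetical
allowed = {
    "lang_list", "gpu", "model_storage_directory", "user_network_directory",
    "recog_network", "detect_network", "download_enabled", "detector",
    "recognizer", "verbose", "quantize", "cudnn_benchmark",
}
kept = {k: v for k, v in requested.items() if k in allowed and v is not None}
reader = easyocr.Reader(**kept)  # "bogus_option" never reaches the constructor
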