natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +113 -22
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -18,12 +18,59 @@ logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  class LayoutDetector(ABC):
21
- """
22
- Abstract Base Class for layout detection engines.
23
-
24
- Subclasses should implement is_available, _load_model_from_options, detect,
25
- and override _get_cache_key if model loading depends on options beyond device.
26
- They should also populate the 'supported_classes' set.
21
+ """Abstract base class for layout detection engines.
22
+
23
+ This class defines the standard interface that all layout detection engines
24
+ must implement in natural-pdf. Layout detectors analyze document images to
25
+ identify structural elements like tables, figures, headers, paragraphs, etc.
26
+
27
+ The base class provides common functionality including model caching, result
28
+ standardization, and configuration management, while concrete implementations
29
+ handle engine-specific detection logic for different models (YOLO, TATR, Surya, etc.).
30
+
31
+ Subclasses must implement:
32
+ - detect(): Core layout detection for a single image
33
+ - is_available(): Check if engine dependencies are installed
34
+ - _load_model_from_options(): Load and configure the detection model
35
+ - _get_cache_key(): Generate cache keys for model instances
36
+
37
+ Subclasses should also populate the 'supported_classes' set with the document
38
+ element types they can detect (e.g., 'table', 'figure', 'text', 'title').
39
+
40
+ Attributes:
41
+ logger: Logger instance for the specific detector.
42
+ supported_classes: Set of document element types this detector can identify.
43
+ _model_cache: Dictionary cache for loaded model instances.
44
+
45
+ Example:
46
+ Implementing a custom layout detector:
47
+ ```python
48
+ class MyLayoutDetector(LayoutDetector):
49
+ def __init__(self):
50
+ super().__init__()
51
+ self.supported_classes = {'table', 'figure', 'text'}
52
+
53
+ @classmethod
54
+ def is_available(cls) -> bool:
55
+ try:
56
+ import my_layout_library
57
+ return True
58
+ except ImportError:
59
+ return False
60
+
61
+ def detect(self, image, options):
62
+ # Implement layout detection
63
+ return detection_results
64
+ ```
65
+
66
+ Using a layout detector:
67
+ ```python
68
+ if YOLODetector.is_available():
69
+ detector = YOLODetector()
70
+ results = detector.detect(page_image, options)
71
+ for result in results:
72
+ print(f"Found {result['class']} at {result['bbox']}")
73
+ ```
27
74
  """
28
75
 
29
76
  def __init__(self):
@@ -83,7 +83,9 @@ class LayoutAnalyzer:
83
83
  f" Rendering page {self._page.number} to image for initial layout detection..."
84
84
  )
85
85
  try:
86
- layout_resolution = getattr(self._page._parent, "_config", {}).get("layout_image_resolution", 72)
86
+ layout_resolution = getattr(self._page._parent, "_config", {}).get(
87
+ "layout_image_resolution", 72
88
+ )
87
89
  std_res_page_image = self._page.to_image(
88
90
  resolution=layout_resolution, include_highlights=False
89
91
  )
@@ -5,10 +5,6 @@ from typing import Any, Dict, List, Optional, Type, Union
5
5
 
6
6
  from PIL import Image
7
7
 
8
- # --- Import lightweight components only ---
9
- # Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
10
- # Instead, we provide tiny helper functions that import them lazily **only when needed**.
11
-
12
8
  from .base import LayoutDetector # Lightweight base class
13
9
  from .layout_options import (
14
10
  BaseLayoutOptions,
@@ -21,6 +17,11 @@ from .layout_options import (
21
17
  YOLOLayoutOptions,
22
18
  )
23
19
 
20
+ # --- Import lightweight components only ---
21
+ # Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
22
+ # Instead, we provide tiny helper functions that import them lazily **only when needed**.
23
+
24
+
24
25
  # ------------------ Lazy import helpers ------------------ #
25
26
 
26
27
 
@@ -60,6 +61,7 @@ def _lazy_import_gemini_detector():
60
61
 
61
62
  return GeminiLayoutDetector
62
63
 
64
+
63
65
  # --------------------------------------------------------- #
64
66
 
65
67
  logger = logging.getLogger(__name__)
@@ -205,7 +207,9 @@ class LayoutManager:
205
207
  for name, registry_entry in self.ENGINE_REGISTRY.items():
206
208
  try:
207
209
  engine_class_or_factory = registry_entry["class"]
208
- if callable(engine_class_or_factory) and not isinstance(engine_class_or_factory, type):
210
+ if callable(engine_class_or_factory) and not isinstance(
211
+ engine_class_or_factory, type
212
+ ):
209
213
  # Lazy factory – call it to obtain real class
210
214
  engine_class = engine_class_or_factory()
211
215
  else:
@@ -224,43 +228,43 @@ class LayoutManager:
224
228
  def cleanup_detector(self, detector_name: Optional[str] = None) -> int:
225
229
  """
226
230
  Cleanup layout detector instances to free memory.
227
-
231
+
228
232
  Args:
229
233
  detector_name: Specific detector to cleanup, or None to cleanup all detectors
230
-
234
+
231
235
  Returns:
232
236
  Number of detectors cleaned up
233
237
  """
234
238
  cleaned_count = 0
235
-
239
+
236
240
  if detector_name:
237
241
  # Cleanup specific detector
238
242
  detector_name = detector_name.lower()
239
243
  if detector_name in self._detector_instances:
240
244
  detector = self._detector_instances.pop(detector_name)
241
- if hasattr(detector, 'cleanup'):
245
+ if hasattr(detector, "cleanup"):
242
246
  try:
243
247
  detector.cleanup()
244
248
  except Exception as e:
245
249
  logger.debug(f"Detector {detector_name} cleanup method failed: {e}")
246
-
250
+
247
251
  logger.info(f"Cleaned up layout detector: {detector_name}")
248
252
  cleaned_count = 1
249
253
  else:
250
254
  # Cleanup all detectors
251
255
  for name, detector in list(self._detector_instances.items()):
252
- if hasattr(detector, 'cleanup'):
256
+ if hasattr(detector, "cleanup"):
253
257
  try:
254
258
  detector.cleanup()
255
259
  except Exception as e:
256
260
  logger.debug(f"Detector {name} cleanup method failed: {e}")
257
-
261
+
258
262
  # Clear all caches
259
263
  detector_count = len(self._detector_instances)
260
264
  self._detector_instances.clear()
261
-
265
+
262
266
  if detector_count > 0:
263
267
  logger.info(f"Cleaned up {detector_count} layout detectors")
264
268
  cleaned_count = detector_count
265
-
269
+
266
270
  return cleaned_count
@@ -58,6 +58,7 @@ class PaddleLayoutOptions(BaseLayoutOptions):
58
58
  Options specific to PaddlePaddle PP-StructureV3 layout detection.
59
59
  See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html
60
60
  """
61
+
61
62
  # Model paths and names
62
63
  layout_detection_model_name: Optional[str] = None
63
64
  layout_detection_model_dir: Optional[str] = None
@@ -55,6 +55,7 @@ else:
55
55
 
56
56
  from .table_structure_utils import group_cells_into_rows_and_columns
57
57
 
58
+
58
59
  class PaddleLayoutDetector(LayoutDetector):
59
60
  """Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
60
61
 
@@ -187,8 +188,9 @@ class PaddleLayoutDetector(LayoutDetector):
187
188
  init_args[field_name] = value
188
189
  # Add filtered extra_args (not starting with '_' and in valid set)
189
190
  filtered_extra_args = {
190
- k: v for k, v in options.extra_args.items()
191
- if not k.startswith('_') and k in valid_init_args
191
+ k: v
192
+ for k, v in options.extra_args.items()
193
+ if not k.startswith("_") and k in valid_init_args
192
194
  }
193
195
  init_args.update(filtered_extra_args)
194
196
 
@@ -266,7 +268,7 @@ class PaddleLayoutDetector(LayoutDetector):
266
268
  if options.exclude_classes
267
269
  else set()
268
270
  )
269
-
271
+
270
272
  # Debug counters
271
273
  table_count = 0
272
274
  cell_count = 0
@@ -296,7 +298,9 @@ class PaddleLayoutDetector(LayoutDetector):
296
298
  table_structures = table_res_list or []
297
299
  table_idx = 0 # fallback index if no region_id
298
300
  if table_res_list:
299
- self.logger.debug(f"Found {len(table_res_list)} table structure(s) in table_res_list.")
301
+ self.logger.debug(
302
+ f"Found {len(table_res_list)} table structure(s) in table_res_list."
303
+ )
300
304
 
301
305
  if not layout_res or "boxes" not in layout_res:
302
306
  self.logger.debug("No layout detection boxes found in result.")
@@ -322,9 +326,7 @@ class PaddleLayoutDetector(LayoutDetector):
322
326
 
323
327
  bbox = region.get("coordinate")
324
328
  if not bbox or len(bbox) != 4:
325
- self.logger.warning(
326
- f"Skipping region with invalid bbox: {region}"
327
- )
329
+ self.logger.warning(f"Skipping region with invalid bbox: {region}")
328
330
  continue
329
331
  x_min, y_min, x_max, y_max = map(float, bbox)
330
332
 
@@ -351,10 +353,14 @@ class PaddleLayoutDetector(LayoutDetector):
351
353
 
352
354
  if table_struct:
353
355
  matched_table_structures += 1
354
- self.logger.debug(f"Matched table structure for table_region_id {region_id} or index {table_idx-1}.")
356
+ self.logger.debug(
357
+ f"Matched table structure for table_region_id {region_id} or index {table_idx-1}."
358
+ )
355
359
  # Attach structure info as metadata
356
360
  detection_data["metadata"] = {
357
- k: v for k, v in table_struct.items() if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
361
+ k: v
362
+ for k, v in table_struct.items()
363
+ if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
358
364
  }
359
365
  detection_data["html"] = table_struct.get("pred_html")
360
366
  # Add cell regions
@@ -364,84 +370,116 @@ class PaddleLayoutDetector(LayoutDetector):
364
370
  continue
365
371
  sx0, sy0, sx1, sy1 = map(float, cell_bbox)
366
372
  cell_boxes.append((sx0, sy0, sx1, sy1))
367
- detections.append({
368
- "bbox": (sx0, sy0, sx1, sy1),
369
- "class": "table_cell",
370
- "confidence": confidence_score,
371
- "normalized_class": self._normalize_class_name("table_cell"),
372
- "source": "layout",
373
- "model": "paddle_v3",
374
- "parent_bbox": (x_min, y_min, x_max, y_max),
375
- })
376
- cell_count += 1
377
- self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
378
- # Add row/col regions if not present in Paddle output
379
- if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
380
- row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
381
- for row_bbox in row_boxes:
382
- rx0, ry0, rx1, ry1 = row_bbox
383
- detections.append({
384
- "bbox": (rx0, ry0, rx1, ry1),
385
- "class": "table_row",
373
+ detections.append(
374
+ {
375
+ "bbox": (sx0, sy0, sx1, sy1),
376
+ "class": "table_cell",
386
377
  "confidence": confidence_score,
387
- "normalized_class": self._normalize_class_name("table_row"),
378
+ "normalized_class": self._normalize_class_name(
379
+ "table_cell"
380
+ ),
388
381
  "source": "layout",
389
382
  "model": "paddle_v3",
390
383
  "parent_bbox": (x_min, y_min, x_max, y_max),
391
- })
384
+ }
385
+ )
386
+ cell_count += 1
387
+ self.logger.debug(
388
+ f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}."
389
+ )
390
+ # Add row/col regions if not present in Paddle output
391
+ if not table_struct.get("row_box_list") and not table_struct.get(
392
+ "col_box_list"
393
+ ):
394
+ row_boxes, col_boxes = group_cells_into_rows_and_columns(
395
+ cell_boxes
396
+ )
397
+ for row_bbox in row_boxes:
398
+ rx0, ry0, rx1, ry1 = row_bbox
399
+ detections.append(
400
+ {
401
+ "bbox": (rx0, ry0, rx1, ry1),
402
+ "class": "table_row",
403
+ "confidence": confidence_score,
404
+ "normalized_class": self._normalize_class_name(
405
+ "table_row"
406
+ ),
407
+ "source": "layout",
408
+ "model": "paddle_v3",
409
+ "parent_bbox": (x_min, y_min, x_max, y_max),
410
+ }
411
+ )
392
412
  row_count += 1
393
- self.logger.debug(f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
413
+ self.logger.debug(
414
+ f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
415
+ )
394
416
  for col_bbox in col_boxes:
395
417
  cx0, cy0, cx1, cy1 = col_bbox
396
- detections.append({
397
- "bbox": (cx0, cy0, cx1, cy1),
398
- "class": "table_column",
399
- "confidence": confidence_score,
400
- "normalized_class": self._normalize_class_name("table_column"),
401
- "source": "layout",
402
- "model": "paddle_v3",
403
- "parent_bbox": (x_min, y_min, x_max, y_max),
404
- })
418
+ detections.append(
419
+ {
420
+ "bbox": (cx0, cy0, cx1, cy1),
421
+ "class": "table_column",
422
+ "confidence": confidence_score,
423
+ "normalized_class": self._normalize_class_name(
424
+ "table_column"
425
+ ),
426
+ "source": "layout",
427
+ "model": "paddle_v3",
428
+ "parent_bbox": (x_min, y_min, x_max, y_max),
429
+ }
430
+ )
405
431
  col_count += 1
406
- self.logger.debug(f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
432
+ self.logger.debug(
433
+ f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
434
+ )
407
435
  else:
408
436
  # Add row regions from Paddle output if present
409
437
  for row_bbox in table_struct.get("row_box_list", []):
410
438
  if row_bbox is None or len(row_bbox) != 4:
411
439
  continue
412
440
  rx0, ry0, rx1, ry1 = map(float, row_bbox)
413
- detections.append({
414
- "bbox": (rx0, ry0, rx1, ry1),
415
- "class": "table_row",
416
- "confidence": confidence_score,
417
- "normalized_class": self._normalize_class_name("table_row"),
418
- "source": "layout",
419
- "model": "paddle_v3",
420
- "parent_bbox": (x_min, y_min, x_max, y_max),
421
- })
441
+ detections.append(
442
+ {
443
+ "bbox": (rx0, ry0, rx1, ry1),
444
+ "class": "table_row",
445
+ "confidence": confidence_score,
446
+ "normalized_class": self._normalize_class_name(
447
+ "table_row"
448
+ ),
449
+ "source": "layout",
450
+ "model": "paddle_v3",
451
+ "parent_bbox": (x_min, y_min, x_max, y_max),
452
+ }
453
+ )
422
454
  row_count += 1
423
- self.logger.debug(f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
455
+ self.logger.debug(
456
+ f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
457
+ )
424
458
  # Add column regions from Paddle output if present
425
459
  for col_bbox in table_struct.get("col_box_list", []):
426
460
  if col_bbox is None or len(col_bbox) != 4:
427
461
  continue
428
462
  cx0, cy0, cx1, cy1 = map(float, col_bbox)
429
- detections.append({
430
- "bbox": (cx0, cy0, cx1, cy1),
431
- "class": "table_column",
432
- "confidence": confidence_score,
433
- "normalized_class": self._normalize_class_name("table_column"),
434
- "source": "layout",
435
- "model": "paddle_v3",
436
- "parent_bbox": (x_min, y_min, x_max, y_max),
437
- })
463
+ detections.append(
464
+ {
465
+ "bbox": (cx0, cy0, cx1, cy1),
466
+ "class": "table_column",
467
+ "confidence": confidence_score,
468
+ "normalized_class": self._normalize_class_name(
469
+ "table_column"
470
+ ),
471
+ "source": "layout",
472
+ "model": "paddle_v3",
473
+ "parent_bbox": (x_min, y_min, x_max, y_max),
474
+ }
475
+ )
438
476
  col_count += 1
439
- self.logger.debug(f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
477
+ self.logger.debug(
478
+ f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
479
+ )
440
480
  detections.append(detection_data)
441
481
  except (TypeError, KeyError, IndexError, ValueError) as e:
442
- self.logger.warning(
443
- f"Error processing Paddle region: {region}. Error: {e}"
444
- )
482
+ self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
445
483
  continue
446
484
 
447
485
  self.logger.info(
@@ -1,6 +1,8 @@
1
1
  from typing import List, Tuple
2
+
2
3
  import numpy as np
3
4
 
5
+
4
6
  def group_cells_into_rows_and_columns(
5
7
  cell_boxes: List[Tuple[float, float, float, float]],
6
8
  row_tol: float = None,
@@ -75,4 +77,4 @@ def group_cells_into_rows_and_columns(
75
77
  y1 = float(np.max(boxes[group, 3]))
76
78
  col_boxes.append((x0, y0, x1, y1))
77
79
 
78
- return row_boxes, col_boxes
80
+ return row_boxes, col_boxes
@@ -91,9 +91,7 @@ class YOLODocLayoutDetector(LayoutDetector):
91
91
  def _load_model_from_options(self, options: YOLOLayoutOptions) -> Any:
92
92
  """Load the YOLOv10 model based on options."""
93
93
  if not self.is_available():
94
- raise RuntimeError(
95
- "YOLO dependencies not installed. Please run: npdf install yolo"
96
- )
94
+ raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
97
95
  self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
98
96
  try:
99
97
  model_path = hf_hub_download(repo_id=options.model_repo, filename=options.model_file)
@@ -107,9 +105,7 @@ class YOLODocLayoutDetector(LayoutDetector):
107
105
  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
108
106
  """Detect layout elements in an image using YOLO."""
109
107
  if not self.is_available():
110
- raise RuntimeError(
111
- "YOLO dependencies not installed. Please run: npdf install yolo"
112
- )
108
+ raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
113
109
 
114
110
  # Ensure options are the correct type, falling back to defaults if base type passed
115
111
  if not isinstance(options, YOLOLayoutOptions):
@@ -3,10 +3,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
3
3
 
4
4
  import numpy as np
5
5
  from PIL import Image, ImageDraw
6
- from scipy.ndimage import binary_closing, binary_opening, gaussian_filter1d
6
+ from scipy.ndimage import binary_closing, binary_opening, find_objects, gaussian_filter1d
7
+ from scipy.ndimage import label as nd_label
7
8
  from scipy.signal import find_peaks
8
9
  from sklearn.cluster import MiniBatchKMeans
9
- from scipy.ndimage import label as nd_label, find_objects
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  from natural_pdf.core.page import Page
@@ -1160,10 +1160,13 @@ class ShapeDetectionMixin:
1160
1160
  masking so large painted areas are not cut by text boxes.
1161
1161
  """
1162
1162
  import numpy as np
1163
- from scipy.ndimage import label as nd_label, find_objects
1163
+ from scipy.ndimage import find_objects
1164
+ from scipy.ndimage import label as nd_label
1164
1165
 
1165
1166
  # Acquire raster image & scale info
1166
- cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(resolution)
1167
+ cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(
1168
+ resolution
1169
+ )
1167
1170
  if cv_image is None or page_obj is None:
1168
1171
  return self # nothing to do
1169
1172
  img_arr = cv_image.reshape(-1, 3).astype(np.float32) / 255.0 # normalised
@@ -1246,7 +1249,12 @@ class ShapeDetectionMixin:
1246
1249
 
1247
1250
  # ── optional purge ──
1248
1251
  if replace and hasattr(page_obj, "_element_mgr"):
1249
- old_blobs = [r for r in page_obj._element_mgr.regions if getattr(r, "region_type", None) == "blob" and getattr(r, "source", None) == source_label]
1252
+ old_blobs = [
1253
+ r
1254
+ for r in page_obj._element_mgr.regions
1255
+ if getattr(r, "region_type", None) == "blob"
1256
+ and getattr(r, "source", None) == source_label
1257
+ ]
1250
1258
  for r in old_blobs:
1251
1259
  try:
1252
1260
  page_obj._element_mgr.regions.remove(r)
@@ -1273,7 +1281,7 @@ class ShapeDetectionMixin:
1273
1281
  x0, x1 = sl[1].start, sl[1].stop
1274
1282
  # bbox area in pixels → in pts²
1275
1283
  area_pixels = (y1 - y0) * (x1 - x0)
1276
- area_pts = area_pixels * (scale_factor ** 2)
1284
+ area_pts = area_pixels * (scale_factor**2)
1277
1285
 
1278
1286
  # Skip tiny regions
1279
1287
  if area_pts < min_area_pts:
@@ -1331,6 +1339,7 @@ class ShapeDetectionMixin:
1331
1339
  pdf_x0, pdf_top, pdf_x1, pdf_bottom = region_bbox_pdf
1332
1340
 
1333
1341
  from natural_pdf.elements.region import Region
1342
+
1334
1343
  region = Region(page_obj, (pdf_x0, pdf_top, pdf_x1, pdf_bottom))
1335
1344
  region.region_type = "blob"
1336
1345
  region.normalized_type = "blob"