PyPI - natural-pdf - Versions diffs - 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +131 -45
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +113 -22
natural_pdf/core/pdf.py +477 -75
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +222 -108
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.33.dist-info/RECORD +0 -118
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/analyzers/layout/base.py CHANGED Viewed

@@ -18,12 +18,59 @@ logger = logging.getLogger(__name__)
 class LayoutDetector(ABC):
-    """
-    Abstract Base Class for layout detection engines.
-    Subclasses should implement is_available, _load_model_from_options, detect,
-    and override _get_cache_key if model loading depends on options beyond device.
-    They should also populate the 'supported_classes' set.
+    """Abstract base class for layout detection engines.
+    This class defines the standard interface that all layout detection engines
+    must implement in natural-pdf. Layout detectors analyze document images to
+    identify structural elements like tables, figures, headers, paragraphs, etc.
+    The base class provides common functionality including model caching, result
+    standardization, and configuration management, while concrete implementations
+    handle engine-specific detection logic for different models (YOLO, TATR, Surya, etc.).
+    Subclasses must implement:
+    - detect(): Core layout detection for a single image
+    - is_available(): Check if engine dependencies are installed
+    - _load_model_from_options(): Load and configure the detection model
+    - _get_cache_key(): Generate cache keys for model instances
+    Subclasses should also populate the 'supported_classes' set with the document
+    element types they can detect (e.g., 'table', 'figure', 'text', 'title').
+    Attributes:
+        logger: Logger instance for the specific detector.
+        supported_classes: Set of document element types this detector can identify.
+        _model_cache: Dictionary cache for loaded model instances.
+    Example:
+        Implementing a custom layout detector:
+        ```python
+        class MyLayoutDetector(LayoutDetector):
+            def __init__(self):
+                super().__init__()
+                self.supported_classes = {'table', 'figure', 'text'}
+            @classmethod
+            def is_available(cls) -> bool:
+                try:
+                    import my_layout_library
+                    return True
+                except ImportError:
+                    return False
+            def detect(self, image, options):
+                # Implement layout detection
+                return detection_results
+        ```
+        Using a layout detector:
+        ```python
+        if YOLODetector.is_available():
+            detector = YOLODetector()
+            results = detector.detect(page_image, options)
+            for result in results:
+                print(f"Found {result['class']} at {result['bbox']}")
+        ```
     """
     def __init__(self):

natural_pdf/analyzers/layout/layout_analyzer.py CHANGED Viewed

@@ -83,7 +83,9 @@ class LayoutAnalyzer:
             f"  Rendering page {self._page.number} to image for initial layout detection..."
         )
         try:
-            layout_resolution = getattr(self._page._parent, "_config", {}).get("layout_image_resolution", 72)
+            layout_resolution = getattr(self._page._parent, "_config", {}).get(
+                "layout_image_resolution", 72
+            )
             std_res_page_image = self._page.to_image(
                 resolution=layout_resolution, include_highlights=False
             )

natural_pdf/analyzers/layout/layout_manager.py CHANGED Viewed

@@ -5,10 +5,6 @@ from typing import Any, Dict, List, Optional, Type, Union
 from PIL import Image
-# --- Import lightweight components only ---
-# Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
-# Instead, we provide tiny helper functions that import them lazily **only when needed**.
 from .base import LayoutDetector  # Lightweight base class
 from .layout_options import (
     BaseLayoutOptions,
@@ -21,6 +17,11 @@ from .layout_options import (
     YOLOLayoutOptions,
 )
+# --- Import lightweight components only ---
+# Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
+# Instead, we provide tiny helper functions that import them lazily **only when needed**.
 # ------------------ Lazy import helpers ------------------ #
@@ -60,6 +61,7 @@ def _lazy_import_gemini_detector():
     return GeminiLayoutDetector
 # --------------------------------------------------------- #
 logger = logging.getLogger(__name__)
@@ -205,7 +207,9 @@ class LayoutManager:
         for name, registry_entry in self.ENGINE_REGISTRY.items():
             try:
                 engine_class_or_factory = registry_entry["class"]
-                if callable(engine_class_or_factory) and not isinstance(engine_class_or_factory, type):
+                if callable(engine_class_or_factory) and not isinstance(
+                    engine_class_or_factory, type
+                ):
                     # Lazy factory – call it to obtain real class
                     engine_class = engine_class_or_factory()
                 else:
@@ -224,43 +228,43 @@ class LayoutManager:
     def cleanup_detector(self, detector_name: Optional[str] = None) -> int:
         """
         Cleanup layout detector instances to free memory.
         Args:
             detector_name: Specific detector to cleanup, or None to cleanup all detectors
         Returns:
             Number of detectors cleaned up
         """
         cleaned_count = 0
         if detector_name:
             # Cleanup specific detector
             detector_name = detector_name.lower()
             if detector_name in self._detector_instances:
                 detector = self._detector_instances.pop(detector_name)
-                if hasattr(detector, 'cleanup'):
+                if hasattr(detector, "cleanup"):
                     try:
                         detector.cleanup()
                     except Exception as e:
                         logger.debug(f"Detector {detector_name} cleanup method failed: {e}")
                 logger.info(f"Cleaned up layout detector: {detector_name}")
                 cleaned_count = 1
         else:
             # Cleanup all detectors
             for name, detector in list(self._detector_instances.items()):
-                if hasattr(detector, 'cleanup'):
+                if hasattr(detector, "cleanup"):
                     try:
                         detector.cleanup()
                     except Exception as e:
                         logger.debug(f"Detector {name} cleanup method failed: {e}")
             # Clear all caches
             detector_count = len(self._detector_instances)
             self._detector_instances.clear()
             if detector_count > 0:
                 logger.info(f"Cleaned up {detector_count} layout detectors")
             cleaned_count = detector_count
         return cleaned_count

natural_pdf/analyzers/layout/layout_options.py CHANGED Viewed

@@ -58,6 +58,7 @@ class PaddleLayoutOptions(BaseLayoutOptions):
     Options specific to PaddlePaddle PP-StructureV3 layout detection.
     See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html
     """
     # Model paths and names
     layout_detection_model_name: Optional[str] = None
     layout_detection_model_dir: Optional[str] = None

natural_pdf/analyzers/layout/paddle.py CHANGED Viewed

@@ -55,6 +55,7 @@ else:
 from .table_structure_utils import group_cells_into_rows_and_columns
 class PaddleLayoutDetector(LayoutDetector):
     """Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
@@ -187,8 +188,9 @@ class PaddleLayoutDetector(LayoutDetector):
                     init_args[field_name] = value
         # Add filtered extra_args (not starting with '_' and in valid set)
         filtered_extra_args = {
-            k: v for k, v in options.extra_args.items()
-            if not k.startswith('_') and k in valid_init_args
+            k: v
+            for k, v in options.extra_args.items()
+            if not k.startswith("_") and k in valid_init_args
         }
         init_args.update(filtered_extra_args)
@@ -266,7 +268,7 @@ class PaddleLayoutDetector(LayoutDetector):
             if options.exclude_classes
             else set()
         )
         # Debug counters
         table_count = 0
         cell_count = 0
@@ -296,7 +298,9 @@ class PaddleLayoutDetector(LayoutDetector):
             table_structures = table_res_list or []
             table_idx = 0  # fallback index if no region_id
             if table_res_list:
-                self.logger.debug(f"Found {len(table_res_list)} table structure(s) in table_res_list.")
+                self.logger.debug(
+                    f"Found {len(table_res_list)} table structure(s) in table_res_list."
+                )
             if not layout_res or "boxes" not in layout_res:
                 self.logger.debug("No layout detection boxes found in result.")
@@ -322,9 +326,7 @@ class PaddleLayoutDetector(LayoutDetector):
                         bbox = region.get("coordinate")
                         if not bbox or len(bbox) != 4:
-                            self.logger.warning(
-                                f"Skipping region with invalid bbox: {region}"
-                            )
+                            self.logger.warning(f"Skipping region with invalid bbox: {region}")
                             continue
                         x_min, y_min, x_max, y_max = map(float, bbox)
@@ -351,10 +353,14 @@ class PaddleLayoutDetector(LayoutDetector):
                             if table_struct:
                                 matched_table_structures += 1
-                                self.logger.debug(f"Matched table structure for table_region_id {region_id} or index {table_idx-1}.")
+                                self.logger.debug(
+                                    f"Matched table structure for table_region_id {region_id} or index {table_idx-1}."
+                                )
                                 # Attach structure info as metadata
                                 detection_data["metadata"] = {
-                                    k: v for k, v in table_struct.items() if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
+                                    k: v
+                                    for k, v in table_struct.items()
+                                    if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
                                 }
                                 detection_data["html"] = table_struct.get("pred_html")
                                 # Add cell regions
@@ -364,84 +370,116 @@ class PaddleLayoutDetector(LayoutDetector):
                                         continue
                                     sx0, sy0, sx1, sy1 = map(float, cell_bbox)
                                     cell_boxes.append((sx0, sy0, sx1, sy1))
-                                    detections.append({
-                                        "bbox": (sx0, sy0, sx1, sy1),
-                                        "class": "table_cell",
-                                        "confidence": confidence_score,
-                                        "normalized_class": self._normalize_class_name("table_cell"),
-                                        "source": "layout",
-                                        "model": "paddle_v3",
-                                        "parent_bbox": (x_min, y_min, x_max, y_max),
-                                    })
-                                    cell_count += 1
-                                    self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
-                                # Add row/col regions if not present in Paddle output
-                                if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
-                                    row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
-                                    for row_bbox in row_boxes:
-                                        rx0, ry0, rx1, ry1 = row_bbox
-                                        detections.append({
-                                            "bbox": (rx0, ry0, rx1, ry1),
-                                            "class": "table_row",
+                                    detections.append(
+                                        {
+                                            "bbox": (sx0, sy0, sx1, sy1),
+                                            "class": "table_cell",
                                             "confidence": confidence_score,
-                                            "normalized_class": self._normalize_class_name("table_row"),
+                                            "normalized_class": self._normalize_class_name(
+                                                "table_cell"
+                                            ),
                                             "source": "layout",
                                             "model": "paddle_v3",
                                             "parent_bbox": (x_min, y_min, x_max, y_max),
-                                        })
+                                        }
+                                    )
+                                    cell_count += 1
+                                    self.logger.debug(
+                                        f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}."
+                                    )
+                                # Add row/col regions if not present in Paddle output
+                                if not table_struct.get("row_box_list") and not table_struct.get(
+                                    "col_box_list"
+                                ):
+                                    row_boxes, col_boxes = group_cells_into_rows_and_columns(
+                                        cell_boxes
+                                    )
+                                    for row_bbox in row_boxes:
+                                        rx0, ry0, rx1, ry1 = row_bbox
+                                        detections.append(
+                                            {
+                                                "bbox": (rx0, ry0, rx1, ry1),
+                                                "class": "table_row",
+                                                "confidence": confidence_score,
+                                                "normalized_class": self._normalize_class_name(
+                                                    "table_row"
+                                                ),
+                                                "source": "layout",
+                                                "model": "paddle_v3",
+                                                "parent_bbox": (x_min, y_min, x_max, y_max),
+                                            }
+                                        )
                                         row_count += 1
-                                        self.logger.debug(f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
+                                        self.logger.debug(
+                                            f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
+                                        )
                                     for col_bbox in col_boxes:
                                         cx0, cy0, cx1, cy1 = col_bbox
-                                        detections.append({
-                                            "bbox": (cx0, cy0, cx1, cy1),
-                                            "class": "table_column",
-                                            "confidence": confidence_score,
-                                            "normalized_class": self._normalize_class_name("table_column"),
-                                            "source": "layout",
-                                            "model": "paddle_v3",
-                                            "parent_bbox": (x_min, y_min, x_max, y_max),
-                                        })
+                                        detections.append(
+                                            {
+                                                "bbox": (cx0, cy0, cx1, cy1),
+                                                "class": "table_column",
+                                                "confidence": confidence_score,
+                                                "normalized_class": self._normalize_class_name(
+                                                    "table_column"
+                                                ),
+                                                "source": "layout",
+                                                "model": "paddle_v3",
+                                                "parent_bbox": (x_min, y_min, x_max, y_max),
+                                            }
+                                        )
                                         col_count += 1
-                                        self.logger.debug(f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
+                                        self.logger.debug(
+                                            f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
+                                        )
                                 else:
                                     # Add row regions from Paddle output if present
                                     for row_bbox in table_struct.get("row_box_list", []):
                                         if row_bbox is None or len(row_bbox) != 4:
                                             continue
                                         rx0, ry0, rx1, ry1 = map(float, row_bbox)
-                                        detections.append({
-                                            "bbox": (rx0, ry0, rx1, ry1),
-                                            "class": "table_row",
-                                            "confidence": confidence_score,
-                                            "normalized_class": self._normalize_class_name("table_row"),
-                                            "source": "layout",
-                                            "model": "paddle_v3",
-                                            "parent_bbox": (x_min, y_min, x_max, y_max),
-                                        })
+                                        detections.append(
+                                            {
+                                                "bbox": (rx0, ry0, rx1, ry1),
+                                                "class": "table_row",
+                                                "confidence": confidence_score,
+                                                "normalized_class": self._normalize_class_name(
+                                                    "table_row"
+                                                ),
+                                                "source": "layout",
+                                                "model": "paddle_v3",
+                                                "parent_bbox": (x_min, y_min, x_max, y_max),
+                                            }
+                                        )
                                         row_count += 1
-                                        self.logger.debug(f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
+                                        self.logger.debug(
+                                            f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
+                                        )
                                     # Add column regions from Paddle output if present
                                     for col_bbox in table_struct.get("col_box_list", []):
                                         if col_bbox is None or len(col_bbox) != 4:
                                             continue
                                         cx0, cy0, cx1, cy1 = map(float, col_bbox)
-                                        detections.append({
-                                            "bbox": (cx0, cy0, cx1, cy1),
-                                            "class": "table_column",
-                                            "confidence": confidence_score,
-                                            "normalized_class": self._normalize_class_name("table_column"),
-                                            "source": "layout",
-                                            "model": "paddle_v3",
-                                            "parent_bbox": (x_min, y_min, x_max, y_max),
-                                        })
+                                        detections.append(
+                                            {
+                                                "bbox": (cx0, cy0, cx1, cy1),
+                                                "class": "table_column",
+                                                "confidence": confidence_score,
+                                                "normalized_class": self._normalize_class_name(
+                                                    "table_column"
+                                                ),
+                                                "source": "layout",
+                                                "model": "paddle_v3",
+                                                "parent_bbox": (x_min, y_min, x_max, y_max),
+                                            }
+                                        )
                                         col_count += 1
-                                        self.logger.debug(f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
+                                        self.logger.debug(
+                                            f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
+                                        )
                         detections.append(detection_data)
                     except (TypeError, KeyError, IndexError, ValueError) as e:
-                        self.logger.warning(
-                            f"Error processing Paddle region: {region}. Error: {e}"
-                        )
+                        self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
                         continue
         self.logger.info(

natural_pdf/analyzers/layout/table_structure_utils.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from typing import List, Tuple
 import numpy as np
 def group_cells_into_rows_and_columns(
     cell_boxes: List[Tuple[float, float, float, float]],
     row_tol: float = None,
@@ -75,4 +77,4 @@ def group_cells_into_rows_and_columns(
         y1 = float(np.max(boxes[group, 3]))
         col_boxes.append((x0, y0, x1, y1))
-    return row_boxes, col_boxes
+    return row_boxes, col_boxes

natural_pdf/analyzers/layout/yolo.py CHANGED Viewed

@@ -91,9 +91,7 @@ class YOLODocLayoutDetector(LayoutDetector):
     def _load_model_from_options(self, options: YOLOLayoutOptions) -> Any:
         """Load the YOLOv10 model based on options."""
         if not self.is_available():
-            raise RuntimeError(
-                "YOLO dependencies not installed. Please run: npdf install yolo"
-            )
+            raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
         self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
         try:
             model_path = hf_hub_download(repo_id=options.model_repo, filename=options.model_file)
@@ -107,9 +105,7 @@ class YOLODocLayoutDetector(LayoutDetector):
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
         """Detect layout elements in an image using YOLO."""
         if not self.is_available():
-            raise RuntimeError(
-                "YOLO dependencies not installed. Please run: npdf install yolo"
-            )
+            raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
         # Ensure options are the correct type, falling back to defaults if base type passed
         if not isinstance(options, YOLOLayoutOptions):

natural_pdf/analyzers/shape_detection_mixin.py CHANGED Viewed

@@ -3,10 +3,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image, ImageDraw
-from scipy.ndimage import binary_closing, binary_opening, gaussian_filter1d
+from scipy.ndimage import binary_closing, binary_opening, find_objects, gaussian_filter1d
+from scipy.ndimage import label as nd_label
 from scipy.signal import find_peaks
 from sklearn.cluster import MiniBatchKMeans
-from scipy.ndimage import label as nd_label, find_objects
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
@@ -1160,10 +1160,13 @@ class ShapeDetectionMixin:
                 masking so large painted areas are not cut by text boxes.
         """
         import numpy as np
-        from scipy.ndimage import label as nd_label, find_objects
+        from scipy.ndimage import find_objects
+        from scipy.ndimage import label as nd_label
         # Acquire raster image & scale info
-        cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(resolution)
+        cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(
+            resolution
+        )
         if cv_image is None or page_obj is None:
             return self  # nothing to do
         img_arr = cv_image.reshape(-1, 3).astype(np.float32) / 255.0  # normalised
@@ -1246,7 +1249,12 @@ class ShapeDetectionMixin:
         # ── optional purge ──
         if replace and hasattr(page_obj, "_element_mgr"):
-            old_blobs = [r for r in page_obj._element_mgr.regions if getattr(r, "region_type", None) == "blob" and getattr(r, "source", None) == source_label]
+            old_blobs = [
+                r
+                for r in page_obj._element_mgr.regions
+                if getattr(r, "region_type", None) == "blob"
+                and getattr(r, "source", None) == source_label
+            ]
             for r in old_blobs:
                 try:
                     page_obj._element_mgr.regions.remove(r)
@@ -1273,7 +1281,7 @@ class ShapeDetectionMixin:
                 x0, x1 = sl[1].start, sl[1].stop
                 # bbox area in pixels → in pts²
                 area_pixels = (y1 - y0) * (x1 - x0)
-                area_pts = area_pixels * (scale_factor ** 2)
+                area_pts = area_pixels * (scale_factor**2)
                 # Skip tiny regions
                 if area_pts < min_area_pts:
@@ -1331,6 +1339,7 @@ class ShapeDetectionMixin:
                 pdf_x0, pdf_top, pdf_x1, pdf_bottom = region_bbox_pdf
                 from natural_pdf.elements.region import Region
                 region = Region(page_obj, (pdf_x0, pdf_top, pdf_x1, pdf_bottom))
                 region.region_type = "blob"
                 region.normalized_type = "blob"

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl