PyPI - natural-pdf - Versions diffs - 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

natural-pdf 0.1.15py3-none-any.whl → 0.1.16py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +117 -75
natural_pdf/core/pdf.py +19 -22
natural_pdf/elements/base.py +9 -9
natural_pdf/elements/collections.py +105 -50
natural_pdf/elements/region.py +200 -126
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0

natural_pdf/analyzers/layout/paddle.py CHANGED Viewed

@@ -5,6 +5,7 @@ import os
 import tempfile
 from typing import Any, Dict, List, Optional
+import numpy as np
 from PIL import Image
 # Assuming base class and options are importable
@@ -40,12 +41,11 @@ logger = logging.getLogger(__name__)
 # Check for dependencies
 paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
 paddleocr_spec = importlib.util.find_spec("paddleocr")
-PPStructure = None
-PaddleOCR = None  # For optional text detection
+PPStructureV3 = None
 if paddle_spec and paddleocr_spec:
     try:
-        from paddleocr import PaddleOCR, PPStructure
+        from paddleocr import PPStructureV3
     except ImportError as e:
         logger.warning(f"Could not import Paddle dependencies: {e}")
 else:
@@ -53,85 +53,159 @@ else:
         "paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
     )
+from .table_structure_utils import group_cells_into_rows_and_columns
 class PaddleLayoutDetector(LayoutDetector):
-    """Document layout and table structure detector using PaddlePaddle's PP-Structure."""
+    """Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
     def __init__(self):
         super().__init__()
-        # Supported classes by PP-Structure (adjust based on model version/capabilities)
+        # Supported classes by PP-StructureV3 (based on docs and common usage)
         self.supported_classes = {
             "text",
             "title",
             "figure",
-            "figure_caption",
             "table",
-            "table_caption",
-            "table_cell",  # Added table_cell
             "header",
             "footer",
             "reference",
             "equation",
-            # PP-StructureV2 might add others like list, pub_number etc.
+            # New labels from V3
+            "image",
+            "paragraph_title",
+            "doc_title",
+            "figure_title",
+            "table_cell",
         }
         # Models are loaded via _get_model
     def is_available(self) -> bool:
         """Check if dependencies are installed."""
-        return PPStructure is not None and PaddleOCR is not None
+        return PPStructureV3 is not None
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:
-        """Generate cache key based on language and device."""
+        """Generate cache key based on model configuration."""
         if not isinstance(options, PaddleLayoutOptions):
-            options = PaddleLayoutOptions(device=options.device)  # Use base device
+            options = PaddleLayoutOptions(device=options.device)
         device_key = str(options.device).lower() if options.device else "default_device"
         lang_key = options.lang
-        # Key could also include enable_table, use_angle_cls if these affect model loading fundamentally
-        # For PPStructure, they are primarily runtime flags, so lang/device might suffice for caching the *instance*.
-        return f"{self.__class__.__name__}_{device_key}_{lang_key}"
+        table_key = str(options.use_table_recognition)
+        orientation_key = str(options.use_textline_orientation)
+        return f"{self.__class__.__name__}_{device_key}_{lang_key}_{table_key}_{orientation_key}"
     def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
-        """Load the PPStructure model based on options."""
+        """Load the PPStructureV3 model based on options."""
         if not self.is_available():
             raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
         if not isinstance(options, PaddleLayoutOptions):
             raise TypeError("Incorrect options type provided for Paddle model loading.")
-        self.logger.info(
-            f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})..."
-        )
+        self.logger.info(f"Loading PP-StructureV3 model with options: {options}")
+        # List of valid PPStructureV3 constructor arguments (from official docs)
+        valid_init_args = {
+            "layout_detection_model_name",
+            "layout_detection_model_dir",
+            "layout_threshold",
+            "layout_nms",
+            "layout_unclip_ratio",
+            "layout_merge_bboxes_mode",
+            "chart_recognition_model_name",
+            "chart_recognition_model_dir",
+            "chart_recognition_batch_size",
+            "region_detection_model_name",
+            "region_detection_model_dir",
+            "doc_orientation_classify_model_name",
+            "doc_orientation_classify_model_dir",
+            "doc_unwarping_model_name",
+            "doc_unwarping_model_dir",
+            "text_detection_model_name",
+            "text_detection_model_dir",
+            "text_det_limit_side_len",
+            "text_det_limit_type",
+            "text_det_thresh",
+            "text_det_box_thresh",
+            "text_det_unclip_ratio",
+            "textline_orientation_model_name",
+            "textline_orientation_model_dir",
+            "textline_orientation_batch_size",
+            "text_recognition_model_name",
+            "text_recognition_model_dir",
+            "text_recognition_batch_size",
+            "text_rec_score_thresh",
+            "table_classification_model_name",
+            "table_classification_model_dir",
+            "wired_table_structure_recognition_model_name",
+            "wired_table_structure_recognition_model_dir",
+            "wireless_table_structure_recognition_model_name",
+            "wireless_table_structure_recognition_model_dir",
+            "wired_table_cells_detection_model_name",
+            "wired_table_cells_detection_model_dir",
+            "wireless_table_cells_detection_model_name",
+            "wireless_table_cells_detection_model_dir",
+            "seal_text_detection_model_name",
+            "seal_text_detection_model_dir",
+            "seal_det_limit_side_len",
+            "seal_det_limit_type",
+            "seal_det_thresh",
+            "seal_det_box_thresh",
+            "seal_det_unclip_ratio",
+            "seal_text_recognition_model_name",
+            "seal_text_recognition_model_dir",
+            "seal_text_recognition_batch_size",
+            "seal_rec_score_thresh",
+            "formula_recognition_model_name",
+            "formula_recognition_model_dir",
+            "formula_recognition_batch_size",
+            "use_doc_orientation_classify",
+            "use_doc_unwarping",
+            "use_textline_orientation",
+            "use_seal_recognition",
+            "use_table_recognition",
+            "use_formula_recognition",
+            "use_chart_recognition",
+            "use_region_detection",
+            "device",
+            "enable_hpi",
+            "use_tensorrt",
+            "precision",
+            "enable_mkldnn",
+            "cpu_threads",
+            "paddlex_config",
+        }
+        # Build init_args from dataclass fields and filtered extra_args
+        init_args = {}
+        # Add all dataclass fields that are in the valid set and not None
+        for field_name in options.__dataclass_fields__:
+            if field_name in valid_init_args:
+                value = getattr(options, field_name)
+                if value is not None:
+                    init_args[field_name] = value
+        # Add filtered extra_args (not starting with '_' and in valid set)
+        filtered_extra_args = {
+            k: v for k, v in options.extra_args.items()
+            if not k.startswith('_') and k in valid_init_args
+        }
+        init_args.update(filtered_extra_args)
+        # Special handling for English model selection
+        if getattr(options, "lang", None) == "en":
+            init_args["text_recognition_model_name"] = "en_PP-OCRv4_mobile_rec"
         try:
-            # PPStructure init takes several arguments that control runtime behavior
-            # We cache the instance based on lang/device, assuming other flags don't require reloading.
-            # Note: show_log is a runtime arg, not needed for instance caching key.
-            # Note: `layout=False` disables layout analysis, which we definitely want here.
-            # Note: `ocr=False` might disable text detection needed for table structure? Check PPStructure docs.
-            # It seems best to initialize with core settings and pass others during the call if possible.
-            # However, PPStructure call signature is simple (__call__(self, img, ...))
-            # So, we likely need to initialize with most settings.
-            model_instance = PPStructure(
-                lang=options.lang,
-                use_gpu=(
-                    "cuda" in str(options.device).lower() or "gpu" in str(options.device).lower()
-                ),
-                use_angle_cls=options.use_angle_cls,
-                show_log=options.show_log,
-                layout=True,  # Ensure layout analysis is on
-                table=options.enable_table,  # Control table analysis
-                ocr=False,  # Usually disable internal OCR if only using for layout/table
-                # Add other PPStructure init args from options.extra_args if needed
-                # **options.extra_args
-            )
-            self.logger.info("PPStructure model loaded.")
+            model_instance = PPStructureV3(**init_args)
+            self.logger.info("PP-StructureV3 model loaded.")
             return model_instance
         except Exception as e:
-            self.logger.error(f"Failed to load PPStructure model: {e}", exc_info=True)
+            self.logger.error(f"Failed to load PP-StructureV3 model: {e}", exc_info=True)
             raise
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
-        """Detect layout elements in an image using PaddlePaddle."""
+        """Detect layout elements in an image using PP-StructureV3."""
         if not self.is_available():
             raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
@@ -145,52 +219,41 @@ class PaddleLayoutDetector(LayoutDetector):
                 exclude_classes=options.exclude_classes,
                 device=options.device,
                 extra_args=options.extra_args,
-                # Other Paddle options will use defaults
             )
+        # --- Backward compatibility for renamed options passed via extra_args ---
+        if "use_angle_cls" in options.extra_args:
+            self.logger.warning(
+                "Parameter 'use_angle_cls' is deprecated for Paddle. Use 'use_textline_orientation' instead."
+            )
+            options.use_textline_orientation = options.extra_args.pop("use_angle_cls")
+        if "enable_table" in options.extra_args:
+            self.logger.warning(
+                "Parameter 'enable_table' is deprecated for Paddle. Use 'use_table_recognition' instead."
+            )
+            options.use_table_recognition = options.extra_args.pop("enable_table")
         self.validate_classes(options.classes or [])
         if options.exclude_classes:
             self.validate_classes(options.exclude_classes)
-        # Get the cached/loaded PPStructure instance
+        # Get the cached/loaded PP-StructureV3 instance
         ppstructure_instance = self._get_model(options)
-        # PPStructure call requires an image path. Save temp file.
-        detections = []
-        with tempfile.TemporaryDirectory() as temp_dir:
-            temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
-            try:
-                self.logger.debug(
-                    f"Saving temporary image for Paddle detector to: {temp_image_path}"
-                )
-                image.convert("RGB").save(temp_image_path)  # Ensure RGB
-                # Process image with PP-Structure instance
-                # The instance was configured during _load_model_from_options
-                self.logger.debug("Running PPStructure analysis...")
-                result = ppstructure_instance(temp_image_path)
-                self.logger.debug(f"PPStructure returned {len(result)} regions.")
-            except Exception as e:
-                self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
-                # Clean up temp file before raising or returning
-                if os.path.exists(temp_image_path):
-                    try:
-                        os.remove(temp_image_path)
-                    except OSError as e_rm:
-                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
-                raise  # Re-raise error
-            finally:
-                # Ensure cleanup even if analysis worked
-                if os.path.exists(temp_image_path):
-                    try:
-                        os.remove(temp_image_path)
-                    except OSError as e_rm:
-                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
+        # Convert PIL image to numpy array for prediction
+        img_np = np.array(image.convert("RGB"))
+        self.logger.debug("Running PP-StructureV3 analysis...")
+        try:
+            results = ppstructure_instance.predict(img_np)
+        except Exception as e:
+            self.logger.error(f"Error during PP-StructureV3 analysis: {e}", exc_info=True)
+            raise
+        self.logger.debug(f"PP-StructureV3 returned {len(results)} result objects.")
         # --- Process Results ---
-        if not result:
+        detections = []
+        if not results:
             self.logger.warning("PaddleLayout returned empty results")
             return []
@@ -203,95 +266,185 @@ class PaddleLayoutDetector(LayoutDetector):
             if options.exclude_classes
             else set()
         )
-        for region in result:
-            try:
-                region_type_orig = region.get("type", "unknown")
-                # Handle potential list returns for type (seen in some versions)
-                if isinstance(region_type_orig, list):
-                    region_type_orig = region_type_orig[0] if region_type_orig else "unknown"
-                region_type = region_type_orig.lower()
-                normalized_class = self._normalize_class_name(region_type)
-                # Apply class filtering
-                if normalized_classes_req and normalized_class not in normalized_classes_req:
-                    continue
-                if normalized_class in normalized_classes_excl:
-                    continue
-                # PP-Structure results don't always have confidence, use threshold or default
-                confidence_score = region.get("score", 1.0)  # Default to 1.0 if missing
-                if confidence_score < options.confidence:
-                    continue
-                bbox = region.get("bbox")
-                if not bbox or len(bbox) != 4:
-                    self.logger.warning(f"Skipping region with invalid bbox: {region}")
-                    continue
-                x_min, y_min, x_max, y_max = map(float, bbox)
-                # Add detection
-                detection_data = {
-                    "bbox": (x_min, y_min, x_max, y_max),
-                    "class": region_type_orig,  # Keep original case if needed
-                    "confidence": confidence_score,
-                    "normalized_class": normalized_class,
-                    "source": "layout",
-                    "model": "paddle",
-                }
-                detections.append(detection_data)
-                # --- Process Table Cells (if enabled and present) ---
-                if region_type == "table" and options.enable_table and "res" in region:
-                    process_cells = (
-                        normalized_classes_req is None or "table-cell" in normalized_classes_req
-                    ) and ("table-cell" not in normalized_classes_excl)
-                    if process_cells and isinstance(region["res"], list):  # V2 structure
-                        for cell in region["res"]:
-                            if "box" not in cell or len(cell["box"]) != 4:
-                                continue
-                            cell_bbox = cell["box"]
-                            cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
-                            # Add cell detection (confidence often not available per cell)
-                            detections.append(
-                                {
-                                    "bbox": (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
-                                    "class": "table cell",  # Standardize name
-                                    "confidence": confidence_score
-                                    * 0.95,  # Inherit table confidence (slightly reduced)
-                                    "normalized_class": "table-cell",
-                                    "text": cell.get("text", ""),  # Include text if available
-                                    "source": "layout",
-                                    "model": "paddle",
-                                }
-                            )
-                    elif (
-                        process_cells
-                        and isinstance(region["res"], dict)
-                        and "cells" in region["res"]
-                    ):  # Older structure
-                        # Handle older 'cells' list if needed (logic from original file)
-                        pass  # Add logic based on original paddle.txt if supporting older PP-Structure
-            except (TypeError, KeyError, IndexError, ValueError) as e:
-                self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
+        # Debug counters
+        table_count = 0
+        cell_count = 0
+        row_count = 0
+        col_count = 0
+        matched_table_structures = 0
+        # A single image input returns a list with one result object
+        for res in results:
+            # Handle both possible result structures (with or without 'res' key)
+            if isinstance(res, dict) and "res" in res:
+                result_data = res["res"]
+            elif isinstance(res, dict):
+                result_data = res
+            else:
+                self.logger.warning(f"Skipping result with unexpected structure: {res}")
                 continue
-        # --- Optional: Add Text Boxes from separate OCR run ---
-        if options.detect_text:
-            # This requires another model instance (PaddleOCR) and adds complexity.
-            # Consider if this is truly needed or if layout regions are sufficient.
-            # If needed, implement similar to original paddle.txt:
-            # - Instantiate PaddleOCR (potentially cache separately)
-            # - Run ocr(img_path, det=True, rec=False)
-            # - Process results, adding 'text' class detections
-            self.logger.info("Paddle detect_text=True: Running separate OCR text detection...")
-            # (Implementation omitted for brevity - requires PaddleOCR instance)
-            pass
+            # --- Process Layout Regions ---
+            layout_res = result_data.get("layout_det_res", {})
+            table_res_list = result_data.get("table_res_list", [])
+            # Build a map of table_region_id to structure info for fast lookup
+            table_structures_by_id = {}
+            for t in table_res_list:
+                if "table_region_id" in t:
+                    table_structures_by_id[t["table_region_id"]] = t
+            table_structures = table_res_list or []
+            table_idx = 0  # fallback index if no region_id
+            if table_res_list:
+                self.logger.debug(f"Found {len(table_res_list)} table structure(s) in table_res_list.")
+            if not layout_res or "boxes" not in layout_res:
+                self.logger.debug("No layout detection boxes found in result.")
+            else:
+                for region in layout_res["boxes"]:
+                    try:
+                        region_type_orig = region.get("label", "unknown")
+                        region_type = region_type_orig.lower()
+                        normalized_class = self._normalize_class_name(region_type)
+                        # Apply class filtering
+                        if (
+                            normalized_classes_req
+                            and normalized_class not in normalized_classes_req
+                        ):
+                            continue
+                        if normalized_class in normalized_classes_excl:
+                            continue
+                        confidence_score = region.get("score", 1.0)
+                        if confidence_score < options.confidence:
+                            continue
+                        bbox = region.get("coordinate")
+                        if not bbox or len(bbox) != 4:
+                            self.logger.warning(
+                                f"Skipping region with invalid bbox: {region}"
+                            )
+                            continue
+                        x_min, y_min, x_max, y_max = map(float, bbox)
+                        detection_data = {
+                            "bbox": (x_min, y_min, x_max, y_max),
+                            "class": region_type_orig,
+                            "confidence": confidence_score,
+                            "normalized_class": normalized_class,
+                            "source": "layout",
+                            "model": "paddle_v3",
+                        }
+                        # --- Table structure parsing ---
+                        if normalized_class == "table" and options.create_cells:
+                            table_count += 1
+                            # Try to match by region_id, else by order
+                            table_struct = None
+                            region_id = region.get("table_region_id")
+                            if region_id is not None and region_id in table_structures_by_id:
+                                table_struct = table_structures_by_id[region_id]
+                            elif table_idx < len(table_structures):
+                                table_struct = table_structures[table_idx]
+                                table_idx += 1
+                            if table_struct:
+                                matched_table_structures += 1
+                                self.logger.debug(f"Matched table structure for table_region_id {region_id} or index {table_idx-1}.")
+                                # Attach structure info as metadata
+                                detection_data["metadata"] = {
+                                    k: v for k, v in table_struct.items() if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
+                                }
+                                detection_data["html"] = table_struct.get("pred_html")
+                                # Add cell regions
+                                cell_boxes = []
+                                for cell_bbox in table_struct.get("cell_box_list", []):
+                                    if cell_bbox is None or len(cell_bbox) != 4:
+                                        continue
+                                    sx0, sy0, sx1, sy1 = map(float, cell_bbox)
+                                    cell_boxes.append((sx0, sy0, sx1, sy1))
+                                    detections.append({
+                                        "bbox": (sx0, sy0, sx1, sy1),
+                                        "class": "table_cell",
+                                        "confidence": confidence_score,
+                                        "normalized_class": self._normalize_class_name("table_cell"),
+                                        "source": "layout",
+                                        "model": "paddle_v3",
+                                        "parent_bbox": (x_min, y_min, x_max, y_max),
+                                    })
+                                    cell_count += 1
+                                    self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
+                                # Add row/col regions if not present in Paddle output
+                                if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
+                                    row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
+                                    for row_bbox in row_boxes:
+                                        rx0, ry0, rx1, ry1 = row_bbox
+                                        detections.append({
+                                            "bbox": (rx0, ry0, rx1, ry1),
+                                            "class": "table_row",
+                                            "confidence": confidence_score,
+                                            "normalized_class": self._normalize_class_name("table_row"),
+                                            "source": "layout",
+                                            "model": "paddle_v3",
+                                            "parent_bbox": (x_min, y_min, x_max, y_max),
+                                        })
+                                        row_count += 1
+                                        self.logger.debug(f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
+                                    for col_bbox in col_boxes:
+                                        cx0, cy0, cx1, cy1 = col_bbox
+                                        detections.append({
+                                            "bbox": (cx0, cy0, cx1, cy1),
+                                            "class": "table_column",
+                                            "confidence": confidence_score,
+                                            "normalized_class": self._normalize_class_name("table_column"),
+                                            "source": "layout",
+                                            "model": "paddle_v3",
+                                            "parent_bbox": (x_min, y_min, x_max, y_max),
+                                        })
+                                        col_count += 1
+                                        self.logger.debug(f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
+                                else:
+                                    # Add row regions from Paddle output if present
+                                    for row_bbox in table_struct.get("row_box_list", []):
+                                        if row_bbox is None or len(row_bbox) != 4:
+                                            continue
+                                        rx0, ry0, rx1, ry1 = map(float, row_bbox)
+                                        detections.append({
+                                            "bbox": (rx0, ry0, rx1, ry1),
+                                            "class": "table_row",
+                                            "confidence": confidence_score,
+                                            "normalized_class": self._normalize_class_name("table_row"),
+                                            "source": "layout",
+                                            "model": "paddle_v3",
+                                            "parent_bbox": (x_min, y_min, x_max, y_max),
+                                        })
+                                        row_count += 1
+                                        self.logger.debug(f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
+                                    # Add column regions from Paddle output if present
+                                    for col_bbox in table_struct.get("col_box_list", []):
+                                        if col_bbox is None or len(col_bbox) != 4:
+                                            continue
+                                        cx0, cy0, cx1, cy1 = map(float, col_bbox)
+                                        detections.append({
+                                            "bbox": (cx0, cy0, cx1, cy1),
+                                            "class": "table_column",
+                                            "confidence": confidence_score,
+                                            "normalized_class": self._normalize_class_name("table_column"),
+                                            "source": "layout",
+                                            "model": "paddle_v3",
+                                            "parent_bbox": (x_min, y_min, x_max, y_max),
+                                        })
+                                        col_count += 1
+                                        self.logger.debug(f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
+                        detections.append(detection_data)
+                    except (TypeError, KeyError, IndexError, ValueError) as e:
+                        self.logger.warning(
+                            f"Error processing Paddle region: {region}. Error: {e}"
+                        )
+                        continue
         self.logger.info(
-            f"PaddleLayout detected {len(detections)} layout elements matching criteria."
+            f"PaddleLayout detected {len(detections)} layout elements matching criteria. Tables: {table_count}, matched structures: {matched_table_structures}, cells: {cell_count}, rows: {row_count}, columns: {col_count}."
         )
         return detections

natural_pdf/analyzers/layout/table_structure_utils.py ADDED Viewed

@@ -0,0 +1,78 @@
+from typing import List, Tuple
+import numpy as np
+def group_cells_into_rows_and_columns(
+    cell_boxes: List[Tuple[float, float, float, float]],
+    row_tol: float = None,
+    col_tol: float = None,
+) -> Tuple[List[Tuple[float, float, float, float]], List[Tuple[float, float, float, float]]]:
+    """
+    Groups cell bounding boxes into rows and columns using spatial proximity.
+    Args:
+        cell_boxes: List of (x0, y0, x1, y1) for each cell.
+        row_tol: Vertical tolerance for grouping rows (default: 10% of median cell height).
+        col_tol: Horizontal tolerance for grouping columns (default: 10% of median cell width).
+    Returns:
+        (row_boxes, col_boxes): Lists of bounding boxes for rows and columns.
+    """
+    if not cell_boxes:
+        return [], []
+    # Convert to numpy for easier manipulation
+    boxes = np.array(cell_boxes)
+    y_centers = (boxes[:, 1] + boxes[:, 3]) / 2
+    x_centers = (boxes[:, 0] + boxes[:, 2]) / 2
+    heights = boxes[:, 3] - boxes[:, 1]
+    widths = boxes[:, 2] - boxes[:, 0]
+    # Set default tolerances if not provided
+    median_height = float(np.median(heights))
+    median_width = float(np.median(widths))
+    row_tol = row_tol if row_tol is not None else max(2.0, 0.1 * median_height)
+    col_tol = col_tol if col_tol is not None else max(2.0, 0.1 * median_width)
+    # --- Group into rows ---
+    row_groups = []
+    for i, yc in enumerate(y_centers):
+        placed = False
+        for group in row_groups:
+            # If this cell's center is close to the group's mean center, add it
+            if abs(yc - np.mean([y_centers[j] for j in group])) <= row_tol:
+                group.append(i)
+                placed = True
+                break
+        if not placed:
+            row_groups.append([i])
+    # --- Group into columns ---
+    col_groups = []
+    for i, xc in enumerate(x_centers):
+        placed = False
+        for group in col_groups:
+            if abs(xc - np.mean([x_centers[j] for j in group])) <= col_tol:
+                group.append(i)
+                placed = True
+                break
+        if not placed:
+            col_groups.append([i])
+    # --- Compute bounding boxes for each group ---
+    row_boxes = []
+    for group in row_groups:
+        x0 = float(np.min(boxes[group, 0]))
+        y0 = float(np.min(boxes[group, 1]))
+        x1 = float(np.max(boxes[group, 2]))
+        y1 = float(np.max(boxes[group, 3]))
+        row_boxes.append((x0, y0, x1, y1))
+    col_boxes = []
+    for group in col_groups:
+        x0 = float(np.min(boxes[group, 0]))
+        y0 = float(np.min(boxes[group, 1]))
+        x1 = float(np.max(boxes[group, 2]))
+        y1 = float(np.max(boxes[group, 3]))
+        col_boxes.append((x0, y0, x1, y1))
+    return row_boxes, col_boxes

natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

natural-pdf 0.1.15py3-none-any.whl → 0.1.16py3-none-any.whl