natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,142 @@
|
|
1
|
from typing import Any, Dict, List, Optional, Tuple, Union

# Attempt to import pdfplumber modules.
# NOTE: this import is unconditional — if pdfplumber is missing, the module
# itself fails to import (ImportError at import time, not at call time).
import pdfplumber.table as pdfplumber_table

# Type Definitions
T_num = Union[int, float]
# Bounding box as (x0, top, x1, bottom) in PDF coordinate space.
T_bbox = Tuple[T_num, T_num, T_num, T_num]
T_obj = Dict[str, Any]
T_obj_list = List[T_obj]
# Maps an (x, y) intersection point to the edges that form it, keyed by orientation.
T_intersections = Dict[Tuple[T_num, T_num], Dict[str, T_obj_list]]
# Cell dict with 'left'/'top'/'right'/'bottom' keys, ready for page.region().
T_cell_dict = Dict[str, T_num]

# Re-export pdfplumber's default tolerances so callers can use them as
# keyword defaults without importing pdfplumber.table themselves.
DEFAULT_SNAP_TOLERANCE = pdfplumber_table.DEFAULT_SNAP_TOLERANCE
DEFAULT_JOIN_TOLERANCE = pdfplumber_table.DEFAULT_JOIN_TOLERANCE
DEFAULT_MIN_WORDS_VERTICAL = pdfplumber_table.DEFAULT_MIN_WORDS_VERTICAL
DEFAULT_MIN_WORDS_HORIZONTAL = pdfplumber_table.DEFAULT_MIN_WORDS_HORIZONTAL

# --- Main Function ---
+
def find_text_based_tables(
    bboxes: List[T_bbox],
    snap_tolerance: T_num = DEFAULT_SNAP_TOLERANCE,
    join_tolerance: T_num = DEFAULT_JOIN_TOLERANCE,
    min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: T_num = 3,
    snap_x_tolerance: Optional[T_num] = None,
    snap_y_tolerance: Optional[T_num] = None,
    join_x_tolerance: Optional[T_num] = None,
    join_y_tolerance: Optional[T_num] = None,
    intersection_x_tolerance: Optional[T_num] = None,
    intersection_y_tolerance: Optional[T_num] = None,
) -> Dict[str, Union[T_obj_list, List[T_cell_dict], T_intersections]]:
    """
    Find table structures based on text element alignment using imported
    pdfplumber functions. Accepts a list of bounding box tuples.

    Args:
        bboxes: A list of bounding box tuples (x0, top, x1, bottom).
        snap_tolerance: General tolerance for snapping edges.
        join_tolerance: General tolerance for joining nearby edges.
        min_words_vertical: Minimum words to form a vertical edge.
        min_words_horizontal: Minimum words to form a horizontal edge.
        intersection_tolerance: General tolerance for intersections.
        snap_x_tolerance: Specific horizontal snap tolerance (overrides general).
        snap_y_tolerance: Specific vertical snap tolerance (overrides general).
        join_x_tolerance: Specific horizontal join tolerance (overrides general).
        join_y_tolerance: Specific vertical join tolerance (overrides general).
        intersection_x_tolerance: Specific horizontal intersection tolerance.
        intersection_y_tolerance: Specific vertical intersection tolerance.

    Returns:
        A dictionary containing:
        - 'horizontal_edges': List of merged horizontal edge dictionaries.
        - 'vertical_edges': List of merged vertical edge dictionaries.
        - 'cells': List of dictionaries [{'left': x0, 'top': top, 'right': x1, 'bottom': bottom}, ...]
          representing detected cells, ready for page.region().
        - 'intersections': Dictionary of intersection points and the edges forming them.

    Note:
        Requires the 'pdfplumber' library. If it is not installed, the
        *module* import fails with ImportError; this function itself never
        raises ImportError (the original docstring claimed otherwise).
    """
    # Shared empty payload for the early-exit paths; built fresh per call, so
    # callers may safely mutate the returned containers.
    empty_result: Dict[str, Union[T_obj_list, List[T_cell_dict], T_intersections]] = {
        "horizontal_edges": [],
        "vertical_edges": [],
        "cells": [],
        "intersections": {},
    }
    if not bboxes:
        return empty_result

    # Convert bare bbox tuples into the minimal word-like dicts that
    # pdfplumber's words_to_edges_h/v helpers expect.
    text_elements: T_obj_list = []
    for i, (x0, top, x1, bottom) in enumerate(bboxes):
        text_elements.append(
            {
                "x0": x0,
                "top": top,
                "x1": x1,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
                "text": f"elem_{i}",  # Placeholder text
                "object_type": "char",  # Mimic word/char structure loosely
            }
        )

    # Resolve axis-specific tolerances, falling back to the general values.
    sx = snap_x_tolerance if snap_x_tolerance is not None else snap_tolerance
    sy = snap_y_tolerance if snap_y_tolerance is not None else snap_tolerance
    jx = join_x_tolerance if join_x_tolerance is not None else join_tolerance
    jy = join_y_tolerance if join_y_tolerance is not None else join_tolerance
    ix = (
        intersection_x_tolerance if intersection_x_tolerance is not None else intersection_tolerance
    )
    iy = (
        intersection_y_tolerance if intersection_y_tolerance is not None else intersection_tolerance
    )

    # --- pdfplumber Pipeline ---
    h_edges = pdfplumber_table.words_to_edges_h(text_elements, word_threshold=min_words_horizontal)
    v_edges = pdfplumber_table.words_to_edges_v(text_elements, word_threshold=min_words_vertical)
    initial_edges = h_edges + v_edges

    if not initial_edges:
        return empty_result

    merged_edges = pdfplumber_table.merge_edges(initial_edges, sx, sy, jx, jy)
    # Guard hoisted above the orientation split: if merging produced nothing,
    # both orientation lists would be empty anyway — return immediately.
    if not merged_edges:
        return empty_result

    merged_h = [e for e in merged_edges if e["orientation"] == "h"]
    merged_v = [e for e in merged_edges if e["orientation"] == "v"]

    intersections = pdfplumber_table.edges_to_intersections(merged_edges, ix, iy)
    if not intersections:
        # Edges were found but never cross, so no cells can exist.
        return {
            "horizontal_edges": merged_h,
            "vertical_edges": merged_v,
            "cells": [],
            "intersections": intersections,
        }

    cell_tuples = pdfplumber_table.intersections_to_cells(intersections)

    # Convert cell tuples to dictionaries for page.region()
    cell_dicts: List[T_cell_dict] = [
        {"left": x0, "top": top, "right": x1, "bottom": bottom}
        for x0, top, x1, bottom in cell_tuples
    ]

    return {
        "horizontal_edges": merged_h,
        "vertical_edges": merged_v,
        "cells": cell_dicts,
        "intersections": intersections,
    }
|
@@ -20,6 +20,7 @@ TableRecPredictor = None
|
|
20
20
|
|
21
21
|
if surya_spec:
|
22
22
|
try:
|
23
|
+
from surya.common.util import expand_bbox, rescale_bbox
|
23
24
|
from surya.layout import LayoutPredictor
|
24
25
|
from surya.table_rec import TableRecPredictor
|
25
26
|
except ImportError as e:
|
@@ -74,25 +75,10 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
74
75
|
raise TypeError("Incorrect options type provided for Surya model loading.")
|
75
76
|
self.logger.info(f"Loading Surya models (device={options.device})...")
|
76
77
|
models = {}
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
return models
|
82
|
-
except Exception as e:
|
83
|
-
self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
|
84
|
-
raise
|
85
|
-
|
86
|
-
def _expand_bbox(
|
87
|
-
self, bbox: Tuple[float, float, float, float], padding: int, max_width: int, max_height: int
|
88
|
-
) -> Tuple[int, int, int, int]:
|
89
|
-
"""Expand bbox by padding, clamping to max dimensions."""
|
90
|
-
x0, y0, x1, y1 = bbox
|
91
|
-
x0 = max(0, int(x0 - padding))
|
92
|
-
y0 = max(0, int(y0 - padding))
|
93
|
-
x1 = min(max_width, int(x1 + padding))
|
94
|
-
y1 = min(max_height, int(y1 + padding))
|
95
|
-
return x0, y0, x1, y1
|
78
|
+
models["layout"] = LayoutPredictor()
|
79
|
+
models["table_rec"] = TableRecPredictor()
|
80
|
+
self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
|
81
|
+
return models
|
96
82
|
|
97
83
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
98
84
|
"""Detect layout elements and optionally table structure in an image using Surya."""
|
@@ -114,19 +100,12 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
114
100
|
|
115
101
|
# Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
|
116
102
|
self._page_ref = options.extra_args.get("_page_ref")
|
117
|
-
img_scale_x = options.extra_args.get("_img_scale_x")
|
118
|
-
img_scale_y = options.extra_args.get("_img_scale_y")
|
119
103
|
|
120
104
|
# We still need this check, otherwise later steps that need these vars will fail
|
121
|
-
can_do_table_rec =
|
122
|
-
options.recognize_table_structure
|
123
|
-
and self._page_ref
|
124
|
-
and img_scale_x is not None
|
125
|
-
and img_scale_y is not None
|
126
|
-
)
|
105
|
+
can_do_table_rec = options.recognize_table_structure
|
127
106
|
if options.recognize_table_structure and not can_do_table_rec:
|
128
107
|
logger.warning(
|
129
|
-
"Surya table recognition cannot proceed without page reference
|
108
|
+
"Surya table recognition cannot proceed without page reference. Disabling."
|
130
109
|
)
|
131
110
|
options.recognize_table_structure = False
|
132
111
|
|
@@ -141,14 +120,12 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
141
120
|
table_rec_predictor = models["table_rec"]
|
142
121
|
|
143
122
|
input_image = image.convert("RGB")
|
144
|
-
input_image_list = [input_image]
|
145
123
|
|
146
|
-
initial_layout_detections = []
|
124
|
+
initial_layout_detections = []
|
147
125
|
tables_to_process = []
|
148
126
|
|
149
|
-
# --- Initial Layout Detection ---
|
150
127
|
self.logger.debug("Running Surya layout prediction...")
|
151
|
-
layout_predictions = layout_predictor(
|
128
|
+
layout_predictions = layout_predictor([input_image])
|
152
129
|
self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
|
153
130
|
if not layout_predictions:
|
154
131
|
return []
|
@@ -164,6 +141,7 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
164
141
|
)
|
165
142
|
|
166
143
|
for layout_box in prediction.bboxes:
|
144
|
+
|
167
145
|
class_name_orig = layout_box.label
|
168
146
|
normalized_class = self._normalize_class_name(class_name_orig)
|
169
147
|
score = float(layout_box.confidence)
|
@@ -196,7 +174,6 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
196
174
|
f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria."
|
197
175
|
)
|
198
176
|
|
199
|
-
# --- Table Structure Recognition (Optional) ---
|
200
177
|
if not options.recognize_table_structure or not tables_to_process:
|
201
178
|
self.logger.debug(
|
202
179
|
"Skipping Surya table structure recognition (disabled or no tables found)."
|
@@ -207,59 +184,29 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
207
184
|
f"Attempting Surya table structure recognition for {len(tables_to_process)} tables..."
|
208
185
|
)
|
209
186
|
high_res_crops = []
|
210
|
-
pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
|
211
187
|
|
212
188
|
high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
|
213
189
|
"surya_table_rec_dpi", 192
|
214
190
|
)
|
215
|
-
|
216
|
-
|
191
|
+
high_res_page_image = self._page_ref.to_image(
|
192
|
+
resolution=high_res_dpi, include_highlights=False, scale=1.0
|
217
193
|
)
|
218
|
-
pdf_to_highres_scale = high_res_dpi / 72.0
|
219
194
|
|
220
195
|
# Render high-res page ONCE
|
221
196
|
self.logger.debug(
|
222
|
-
f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition
|
223
|
-
)
|
224
|
-
high_res_page_image = self._page_ref.to_image(
|
225
|
-
resolution=high_res_dpi, include_highlights=False
|
226
|
-
)
|
227
|
-
if not high_res_page_image:
|
228
|
-
raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
|
229
|
-
self.logger.debug(
|
230
|
-
f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}"
|
197
|
+
f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition, size {high_res_page_image.width}x{high_res_page_image.height}."
|
231
198
|
)
|
232
199
|
|
200
|
+
source_tables = []
|
233
201
|
for i, table_detection in enumerate(tables_to_process):
|
234
|
-
|
235
|
-
|
236
|
-
# PDF coords
|
237
|
-
pdf_x0 = img_x0 * img_scale_x
|
238
|
-
pdf_y0 = img_y0 * img_scale_y
|
239
|
-
pdf_x1 = img_x1 * img_scale_x
|
240
|
-
pdf_y1 = img_y1 * img_scale_y
|
241
|
-
pdf_x0 = max(0, pdf_x0)
|
242
|
-
pdf_y0 = max(0, pdf_y0)
|
243
|
-
pdf_x1 = min(self._page_ref.width, pdf_x1)
|
244
|
-
pdf_y1 = min(self._page_ref.height, pdf_y1)
|
245
|
-
|
246
|
-
# High-res image coords
|
247
|
-
hr_x0 = pdf_x0 * pdf_to_highres_scale
|
248
|
-
hr_y0 = pdf_y0 * pdf_to_highres_scale
|
249
|
-
hr_x1 = pdf_x1 * pdf_to_highres_scale
|
250
|
-
hr_y1 = pdf_y1 * pdf_to_highres_scale
|
251
|
-
|
252
|
-
# Expand high-res bbox
|
253
|
-
hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
|
254
|
-
(hr_x0, hr_y0, hr_x1, hr_y1),
|
255
|
-
padding=bbox_padding,
|
256
|
-
max_width=high_res_page_image.width,
|
257
|
-
max_height=high_res_page_image.height,
|
202
|
+
highres_bbox = rescale_bbox(
|
203
|
+
list(table_detection["bbox"]), image.size, high_res_page_image.size
|
258
204
|
)
|
205
|
+
highres_bbox = expand_bbox(highres_bbox)
|
259
206
|
|
260
|
-
crop = high_res_page_image.crop(
|
207
|
+
crop = high_res_page_image.crop(highres_bbox)
|
261
208
|
high_res_crops.append(crop)
|
262
|
-
|
209
|
+
source_tables.append(highres_bbox)
|
263
210
|
|
264
211
|
if not high_res_crops:
|
265
212
|
self.logger.info("No valid high-resolution table crops generated.")
|
@@ -267,64 +214,40 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
267
214
|
|
268
215
|
structure_detections = [] # Detections relative to std_res input_image
|
269
216
|
|
270
|
-
# --- Run Table Recognition (will raise error on failure) ---
|
271
217
|
self.logger.debug(
|
272
218
|
f"Running Surya table recognition on {len(high_res_crops)} high-res images..."
|
273
219
|
)
|
274
220
|
table_predictions = table_rec_predictor(high_res_crops)
|
275
221
|
self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
|
276
222
|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
223
|
+
def build_row_item(element, source_table_bbox, label):
|
224
|
+
adjusted_bbox = [
|
225
|
+
float(element.bbox[0] + source_table_bbox[0]),
|
226
|
+
float(element.bbox[1] + source_table_bbox[1]),
|
227
|
+
float(element.bbox[2] + source_table_bbox[0]),
|
228
|
+
float(element.bbox[3] + source_table_bbox[1]),
|
229
|
+
]
|
230
|
+
|
231
|
+
adjusted_bbox = rescale_bbox(adjusted_bbox, high_res_page_image.size, image.size)
|
232
|
+
|
233
|
+
return {
|
234
|
+
"bbox": adjusted_bbox,
|
235
|
+
"class": label,
|
236
|
+
"confidence": 1.0,
|
237
|
+
"normalized_class": label,
|
238
|
+
"source": "layout",
|
239
|
+
"model": "surya",
|
240
|
+
}
|
241
|
+
|
242
|
+
for table_pred, source_table_bbox in zip(table_predictions, source_tables):
|
243
|
+
for box in table_pred.rows:
|
244
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-row"))
|
245
|
+
|
246
|
+
for box in table_pred.cols:
|
247
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-column"))
|
283
248
|
|
284
|
-
|
285
|
-
|
286
|
-
for row_box in table_pred.rows:
|
287
|
-
crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
|
288
|
-
pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
|
289
|
-
pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
|
290
|
-
pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
|
291
|
-
pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
|
292
|
-
img_row_x0 = pdf_row_x0 / img_scale_x
|
293
|
-
img_row_y0 = pdf_row_y0 / img_scale_y
|
294
|
-
img_row_x1 = pdf_row_x1 / img_scale_x
|
295
|
-
img_row_y1 = pdf_row_y1 / img_scale_y
|
296
|
-
structure_detections.append(
|
297
|
-
{
|
298
|
-
"bbox": (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
|
299
|
-
"class": "table-row",
|
300
|
-
"confidence": 1.0,
|
301
|
-
"normalized_class": "table-row",
|
302
|
-
"source": "layout",
|
303
|
-
"model": "surya",
|
304
|
-
}
|
305
|
-
)
|
306
|
-
|
307
|
-
# Process Columns
|
308
|
-
for col_box in table_pred.cols:
|
309
|
-
crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
|
310
|
-
pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
|
311
|
-
pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
|
312
|
-
pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
|
313
|
-
pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
|
314
|
-
img_col_x0 = pdf_col_x0 / img_scale_x
|
315
|
-
img_col_y0 = pdf_col_y0 / img_scale_y
|
316
|
-
img_col_x1 = pdf_col_x1 / img_scale_x
|
317
|
-
img_col_y1 = pdf_col_y1 / img_scale_y
|
318
|
-
structure_detections.append(
|
319
|
-
{
|
320
|
-
"bbox": (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
|
321
|
-
"class": "table-column",
|
322
|
-
"confidence": 1.0,
|
323
|
-
"normalized_class": "table-column",
|
324
|
-
"source": "layout",
|
325
|
-
"model": "surya",
|
326
|
-
}
|
327
|
-
)
|
249
|
+
for box in table_pred.cells:
|
250
|
+
structure_detections.append(build_row_item(box, source_table_bbox, "table-cell"))
|
328
251
|
|
329
252
|
self.logger.info(f"Added {len(structure_detections)} table structure elements.")
|
330
253
|
|
@@ -5,7 +5,7 @@ import os
|
|
5
5
|
import tempfile
|
6
6
|
from typing import Any, Dict, List, Optional, Tuple
|
7
7
|
|
8
|
-
from PIL import Image
|
8
|
+
from PIL import Image, ImageEnhance
|
9
9
|
|
10
10
|
# Assuming base class and options are importable
|
11
11
|
from .base import LayoutDetector
|
@@ -150,6 +150,26 @@ class TableTransformerDetector(LayoutDetector):
|
|
150
150
|
)
|
151
151
|
return objects
|
152
152
|
|
153
|
+
def preprocess_image(self, image: Image.Image, enhance_contrast: float = 1.5) -> Image.Image:
|
154
|
+
"""Enhance the image to improve table structure detection.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
image: The input PIL image
|
158
|
+
enhance_contrast: Contrast enhancement factor (1.0 = no change)
|
159
|
+
|
160
|
+
Returns:
|
161
|
+
Enhanced PIL image
|
162
|
+
"""
|
163
|
+
# Convert to grayscale and back to RGB for better structure detection
|
164
|
+
if image.mode != "L": # If not already grayscale
|
165
|
+
grayscale = image.convert("L")
|
166
|
+
enhanced = ImageEnhance.Contrast(grayscale).enhance(enhance_contrast)
|
167
|
+
return enhanced.convert("RGB") # Convert back to RGB for model input
|
168
|
+
else:
|
169
|
+
# Just enhance contrast if already grayscale
|
170
|
+
enhanced = ImageEnhance.Contrast(image).enhance(enhance_contrast)
|
171
|
+
return enhanced.convert("RGB")
|
172
|
+
|
153
173
|
# --- End Helper Methods ---
|
154
174
|
|
155
175
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
@@ -196,9 +216,17 @@ class TableTransformerDetector(LayoutDetector):
|
|
196
216
|
]
|
197
217
|
)
|
198
218
|
|
219
|
+
# Use image preprocessing for better structure detection
|
220
|
+
enhance_contrast = (
|
221
|
+
options.enhance_contrast
|
222
|
+
if hasattr(options, "enhance_contrast")
|
223
|
+
else options.extra_args.get("enhance_contrast", 1.5)
|
224
|
+
)
|
225
|
+
processed_image = self.preprocess_image(image, enhance_contrast)
|
226
|
+
|
199
227
|
# --- Detect Tables ---
|
200
228
|
self.logger.debug("Running TATR table detection...")
|
201
|
-
pixel_values = detection_transform(
|
229
|
+
pixel_values = detection_transform(processed_image).unsqueeze(0).to(device)
|
202
230
|
with torch.no_grad():
|
203
231
|
outputs = detection_model(pixel_values)
|
204
232
|
|
@@ -271,19 +299,38 @@ class TableTransformerDetector(LayoutDetector):
|
|
271
299
|
if x_max <= x_min or y_max <= y_min:
|
272
300
|
continue # Skip invalid crop
|
273
301
|
|
302
|
+
# Process the cropped table for better structure detection
|
274
303
|
cropped_table = image.crop((x_min, y_min, x_max, y_max))
|
275
304
|
if cropped_table.width == 0 or cropped_table.height == 0:
|
276
305
|
continue # Skip empty crop
|
277
306
|
|
278
|
-
|
307
|
+
processed_crop = self.preprocess_image(cropped_table, enhance_contrast)
|
308
|
+
pixel_values_struct = structure_transform(processed_crop).unsqueeze(0).to(device)
|
309
|
+
|
279
310
|
with torch.no_grad():
|
280
311
|
outputs_struct = structure_model(pixel_values_struct)
|
281
312
|
|
282
313
|
structure_elements = self.outputs_to_objects(
|
283
314
|
outputs_struct, cropped_table.size, id2label_struct
|
284
315
|
)
|
316
|
+
|
317
|
+
# Reduce confidence threshold specifically for columns to catch more
|
318
|
+
column_threshold = None
|
319
|
+
if hasattr(options, "column_threshold") and options.column_threshold is not None:
|
320
|
+
column_threshold = options.column_threshold
|
321
|
+
else:
|
322
|
+
column_threshold = options.extra_args.get(
|
323
|
+
"column_threshold", options.confidence * 0.8
|
324
|
+
)
|
325
|
+
|
285
326
|
structure_elements = [
|
286
|
-
e
|
327
|
+
e
|
328
|
+
for e in structure_elements
|
329
|
+
if (
|
330
|
+
e["score"] >= column_threshold
|
331
|
+
if "column" in e["label"]
|
332
|
+
else e["score"] >= options.confidence
|
333
|
+
)
|
287
334
|
]
|
288
335
|
|
289
336
|
for element in structure_elements:
|
@@ -9,14 +9,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
|
9
9
|
|
10
10
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
11
11
|
|
12
|
-
# Import ElementCollection and TextStyleOptions
|
13
|
-
from natural_pdf.elements.collections import ElementCollection
|
14
|
-
|
15
12
|
if TYPE_CHECKING:
|
16
13
|
from natural_pdf.core.page import Page
|
17
14
|
from natural_pdf.elements.base import Element
|
18
|
-
|
19
|
-
# Remove ElementCollection from here if imported above
|
15
|
+
from natural_pdf.elements.collections import ElementCollection
|
20
16
|
|
21
17
|
logger = logging.getLogger(__name__)
|
22
18
|
|
@@ -68,6 +64,8 @@ class TextStyleAnalyzer:
|
|
68
64
|
ElementCollection containing all processed text elements (typically words)
|
69
65
|
with added 'style_label', 'style_key', and 'style_properties' attributes.
|
70
66
|
"""
|
67
|
+
from natural_pdf.elements.collections import ElementCollection
|
68
|
+
|
71
69
|
current_options = options or self.options
|
72
70
|
logger.info(
|
73
71
|
f"Starting text style analysis for page {page.number} with options: {current_options}"
|
natural_pdf/analyzers/utils.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Any, Dict, List
|
3
3
|
|
4
|
-
from ..elements.region import Region
|
5
|
-
|
6
4
|
|
7
5
|
def convert_to_regions(
|
8
6
|
page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
|
9
|
-
) -> List[Region]:
|
7
|
+
) -> List["Region"]:
|
10
8
|
"""
|
11
9
|
Convert layout detections to Region objects.
|
12
10
|
|
@@ -18,6 +16,8 @@ def convert_to_regions(
|
|
18
16
|
Returns:
|
19
17
|
List of Region objects with layout metadata
|
20
18
|
"""
|
19
|
+
from natural_pdf.elements.region import Region
|
20
|
+
|
21
21
|
conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
|
22
22
|
conversion_logger.debug(
|
23
23
|
f"Converting {len(detections)} detections to regions with scale {scale_factor}"
|