sparrow-parse 0.3.11__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +168 -48
- sparrow_parse/processors/table_structure_processor.py +68 -411
- {sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/METADATA +6 -4
- {sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/RECORD +8 -8
- {sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.3.11'
+__version__ = '0.4.0'
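As a quick sanity check after upgrading, the bumped version string can be read back at runtime; a minimal sketch:

```python
# Minimal sketch: confirm the installed sparrow-parse version after the upgrade.
import sparrow_parse

print(sparrow_parse.__version__)  # expected: 0.4.0
```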
sparrow_parse/extractors/vllm_extractor.py
CHANGED
@@ -1,87 +1,207 @@
+import json
+
 from sparrow_parse.vllm.inference_factory import InferenceFactory
 from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
+from sparrow_parse.processors.table_structure_processor import TableDetector
 from rich import print
 import os
+import tempfile
 import shutil
+from typing import Any, Dict, List, Union


 class VLLMExtractor(object):
     def __init__(self):
         pass

-    def run_inference(self, model_inference_instance, input_data,
+    def run_inference(self, model_inference_instance, input_data, tables_only=False,
                       generic_query=False, debug_dir=None, debug=False, mode=None):
-
+        """
+        Main entry point for processing input data using a model inference instance.
+        Handles generic queries, PDFs, and table extraction.
+        """
         if generic_query:
             input_data[0]["text_input"] = "retrieve document data. return response in JSON format"

         if debug:
-            print("Input …
+            print("Input data:", input_data)

-        # Check if the input file is a PDF
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
-            return self._process_pdf(model_inference_instance, input_data, debug_dir, mode)
+            return self._process_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir, mode)

-
-        input_data[0]["file_path"] = [file_path]
-        results_array = model_inference_instance.inference(input_data)
-        return results_array, 1
+        return self._process_non_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir)


-    def _process_pdf(self, model_inference_instance, input_data, debug_dir, mode):
-        """
+    def _process_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir, mode):
+        """
+        Handles processing and inference for PDF files, including page splitting and optional table extraction.
+        """
         pdf_optimizer = PDFOptimizer()
         num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
-                                                                             debug_dir,
-                                                                             True)
-        # Update file paths for PDF pages
-        input_data[0]["file_path"] = output_files
+                                                                             debug_dir, convert_to_images=True)

-
-        results_array = model_inference_instance.inference(input_data, mode)
+        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, debug, debug_dir)

         # Clean up temporary directory
         shutil.rmtree(temp_dir, ignore_errors=True)
-        return …
+        return results, num_pages
+
+
+    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir):
+        """
+        Handles processing and inference for non-PDF files, with optional table extraction.
+        """
+        file_path = input_data[0]["file_path"]
+        if tables_only:
+            return [self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir)], 1
+        else:
+            input_data[0]["file_path"] = [file_path]
+            results = model_inference_instance.inference(input_data)
+            return results, 1
+
+    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, debug, debug_dir):
+        """
+        Processes individual pages (PDF split) and handles table extraction or inference.
+
+        Args:
+            model_inference_instance: The model inference object.
+            output_files: List of file paths for the split PDF pages.
+            input_data: Input data for inference.
+            tables_only: Whether to only process tables.
+            debug: Debug flag for logging.
+            debug_dir: Directory for saving debug information.
+
+        Returns:
+            List of results from the processing or inference.
+        """
+        results_array = []
+
+        if tables_only:
+            if debug:
+                print(f"Processing {len(output_files)} pages for table extraction.")
+            # Process each page individually for table extraction
+            for i, file_path in enumerate(output_files):
+                tables_result = self._extract_tables(
+                    model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
+                )
+                results_array.append(tables_result)
+        else:
+            if debug:
+                print(f"Processing {len(output_files)} pages for inference at once.")
+            # Pass all output files to the inference method for processing at once
+            input_data[0]["file_path"] = output_files
+            results = model_inference_instance.inference(input_data)
+            results_array.extend(results)
+
+        return results_array
+
+
+    def _extract_tables(self, model_inference_instance, file_path, input_data, debug, debug_dir, page_index=None):
+        """
+        Detects and processes tables from an input file.
+        """
+        table_detector = TableDetector()
+        cropped_tables = table_detector.detect_tables(file_path, local=False, debug_dir=debug_dir, debug=debug)
+        results_array = []
+        temp_dir = tempfile.mkdtemp()
+
+        for i, table in enumerate(cropped_tables):
+            table_index = f"page_{page_index + 1}_table_{i + 1}" if page_index is not None else f"table_{i + 1}"
+            print(f"Processing {table_index} for document {file_path}")
+
+            output_filename = os.path.join(temp_dir, f"{table_index}.jpg")
+            table.save(output_filename, "JPEG")
+
+            input_data[0]["file_path"] = [output_filename]
+            result = self._run_model_inference(model_inference_instance, input_data)
+            result = self.add_table_info_to_data(result, "table_nr", i + 1)
+            results_array.append(result)
+
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        return json.dumps(results_array, indent=4)
+
+
+    @staticmethod
+    def _run_model_inference(model_inference_instance, input_data):
+        """
+        Runs model inference and handles JSON decoding.
+        """
+        result = model_inference_instance.inference(input_data)[0]
+        try:
+            return json.loads(result) if isinstance(result, str) else result
+        except json.JSONDecodeError:
+            return {"message": "Invalid JSON format in LLM output", "valid": "false"}
+

     @staticmethod
     def is_pdf(file_path):
         """Checks if a file is a PDF based on its extension."""
         return file_path.lower().endswith('.pdf')

+
+    @staticmethod
+    def add_table_info_to_data(data: Union[Dict, List], key: str, message: Any) -> Dict:
+        """
+        Add a key-value pair to a dictionary or wrap a list in a dictionary.
+        If a 'table' key exists, add or update the key-value pair inside it.
+
+        Args:
+            data (Union[Dict, List]): The input data (either a dictionary or list).
+            key (str): The key to add.
+            message (Any): The value to associate with the key.
+
+        Returns:
+            Dict: The modified data.
+        """
+        if isinstance(data, dict):
+            if "table" in data and isinstance(data["table"], list):
+                # Add or update the key-value pair in the existing structure
+                data[key] = message
+            else:
+                # Wrap the dictionary inside a `table` key and include the additional key-value pair
+                data = {"table": [data], key: message}
+        elif isinstance(data, list):
+            # Wrap the list in a dictionary with the additional key-value pair
+            data = {"table": data, key: message}
+        else:
+            raise TypeError("Data must be a dictionary or a list.")
+        return data
+
+
 if __name__ == "__main__":
     # run locally: python -m sparrow_parse.extractors.vllm_extractor

     extractor = VLLMExtractor()

-    # export HF_TOKEN="hf_"
-    config = {
-        …
-    }
-
-    # Use the factory to get the correct instance
-    factory = InferenceFactory(config)
-    model_inference_instance = factory.get_inference_instance()
-
-    input_data = [
-        …
-    ]
-
-    # Now you can run inference without knowing which implementation is used
-    results_array, num_pages = extractor.run_inference(model_inference_instance, input_data,
-        …
-    print(f"…
+    # # export HF_TOKEN="hf_"
+    # config = {
+    #     "method": "mlx",  # Could be 'huggingface', 'mlx' or 'local_gpu'
+    #     "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
+    #     # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    #     # "hf_token": os.getenv('HF_TOKEN'),
+    #     # Additional fields for local GPU inference
+    #     # "device": "cuda", "model_path": "model.pth"
+    # }
+    #
+    # # Use the factory to get the correct instance
+    # factory = InferenceFactory(config)
+    # model_inference_instance = factory.get_inference_instance()
+    #
+    # input_data = [
+    #     {
+    #         "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.jpg",
+    #         "text_input": "retrieve document data. return response in JSON format"
+    #     }
+    # ]
+    #
+    # # Now you can run inference without knowing which implementation is used
+    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
+    #                                                    generic_query=False,
+    #                                                    debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
+    #                                                    debug=True,
+    #                                                    mode=None)
+    #
+    # for i, result in enumerate(results_array):
+    #     print(f"Result for page {i + 1}:", result)
+    # print(f"Number of pages: {num_pages}")
sparrow_parse/processors/table_structure_processor.py
CHANGED
@@ -1,19 +1,18 @@
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich import print
 from transformers import AutoModelForObjectDetection
-from transformers import TableTransformerForObjectDetection
 import torch
 from PIL import Image
 from torchvision import transforms
-from PIL import ImageDraw
 import os
-import numpy as np
-import easyocr


 class TableDetector(object):
+    _model = None  # Static variable to hold the table detection model
+    _device = None  # Static variable to hold the device information
+
     def __init__(self):
-        …
+        pass

     class MaxResize(object):
         def __init__(self, max_size=800):
@@ -27,12 +26,27 @@ class TableDetector(object):

         return resized_image

-    …
+    @classmethod
+    def _initialize_model(cls, invoke_pipeline_step, local):
+        """
+        Static method to initialize the table detection model if not already initialized.
+        """
+        if cls._model is None:
+            # Use invoke_pipeline_step to load the model
+            cls._model, cls._device = invoke_pipeline_step(
+                lambda: cls.load_table_detection_model(),
+                "Loading table detection model...",
+                local
+            )
+            print("Table detection model initialized.")
+
+
+    def detect_tables(self, file_path, local=True, debug_dir=None, debug=False):
+        # Ensure the model is initialized using invoke_pipeline_step
+        self._initialize_model(self.invoke_pipeline_step, local)
+
+        # Use the static model and device
+        model, device = self._model, self._device

         outputs, image = self.invoke_pipeline_step(
             lambda: self.prepare_image(file_path, model, device),
@@ -46,38 +60,17 @@ class TableDetector(object):
             local
         )

-        …
-            lambda: self.…
+        cropped_tables = self.invoke_pipeline_step(
+            lambda: self.crop_tables(file_path, image, objects, debug, debug_dir),
             "Cropping tables from the image...",
             local
         )

-        …
-            lambda: self.load_table_structure_model(device),
-            "Loading table structure recognition model...",
-            local
-        )
-
-        structure_outputs = self.invoke_pipeline_step(
-            lambda: self.get_table_structure(cropped_table, structure_model, device),
-            "Getting table structure from cropped table...",
-            local
-        )
-
-        table_data = self.invoke_pipeline_step(
-            lambda: self.get_structure_cells(structure_model, cropped_table, structure_outputs),
-            "Getting structure cells from cropped table...",
-            local
-        )
-
-        self.invoke_pipeline_step(
-            lambda: self.process_table_structure(table_data, cropped_table, file_path),
-            "Processing structure cells...",
-            local
-        )
+        return cropped_tables


-    …
+    @staticmethod
+    def load_table_detection_model():
         model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm")

         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -85,11 +78,6 @@ class TableDetector(object):

         return model, device

-    def load_table_structure_model(self, device):
-        structure_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
-        structure_model.to(device)
-
-        return structure_model

     def prepare_image(self, file_path, model, device):
         image = Image.open(file_path).convert("RGB")
@@ -115,38 +103,52 @@ class TableDetector(object):
         objects = self.outputs_to_objects(outputs, image.size, id2label)
         return objects

-    …
+
+    def crop_tables(self, file_path, image, objects, debug, debug_dir):
         tokens = []
         detection_class_thresholds = {
             "table": 0.5,
             "table rotated": 0.5,
             "no object": 10
         }
-        crop_padding = …
+        crop_padding = 30

         tables_crops = self.objects_to_crops(image, tokens, objects, detection_class_thresholds, padding=crop_padding)

-        …
+        cropped_tables = []

         if len(tables_crops) == 0:
-            …
+            if debug:
+                print("No tables detected in: ", file_path)
+
+            return None
         elif len(tables_crops) > 1:
             for i, table_crop in enumerate(tables_crops):
+                if debug:
+                    print("Table detected in:", file_path, "-", i + 1)
+
                 cropped_table = table_crop['image'].convert("RGB")
-                …
+                cropped_tables.append(cropped_table)
+
+                if debug_dir:
+                    file_name_table = self.append_filename(file_path, debug_dir, f"cropped_{i + 1}")
+                    cropped_table.save(file_name_table)
         else:
+            if debug:
+                print("Table detected in: ", file_path)
+
             cropped_table = tables_crops[0]['image'].convert("RGB")
+            cropped_tables.append(cropped_table)

-            …
+            if debug_dir:
+                file_name_table = self.append_filename(file_path, debug_dir, "cropped")
+                cropped_table.save(file_name_table)

-        return …
+        return cropped_tables

     # for output bounding box post-processing
-    …
+    @staticmethod
+    def box_cxcywh_to_xyxy(x):
         x_c, y_c, w, h = x.unbind(-1)
         b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
         return torch.stack(b, dim=1)
@@ -216,214 +218,15 @@ class TableDetector(object):

         return table_crops

-    def get_table_structure(self, cropped_table, structure_model, device):
-        structure_transform = transforms.Compose([
-            self.MaxResize(1000),
-            transforms.ToTensor(),
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-        ])
-
-        pixel_values = structure_transform(cropped_table).unsqueeze(0)
-        pixel_values = pixel_values.to(device)
-
-        with torch.no_grad():
-            outputs = structure_model(pixel_values)
-
-        return outputs
-
-    def get_structure_cells(self, structure_model, cropped_table, outputs):
-        structure_id2label = structure_model.config.id2label
-        structure_id2label[len(structure_id2label)] = "no object"
-
-        cells = self.outputs_to_objects(outputs, cropped_table.size, structure_id2label)
-
-        return cells
-
-    def process_table_structure(self, table_data, cropped_table, file_path):
-        cropped_table_raw_visualized = cropped_table.copy()
-        draw_raw = ImageDraw.Draw(cropped_table_raw_visualized)
-        cropped_table_header_visualized = cropped_table.copy()
-        draw_header = ImageDraw.Draw(cropped_table_header_visualized)
-        cropped_table_visualized = cropped_table.copy()
-        draw = ImageDraw.Draw(cropped_table_visualized)
-
-        table_data = [cell for cell in table_data if cell['label'] != 'table spanning cell']
-        table_data = [cell for cell in table_data if cell['label'] != 'table']
-        table_data = [cell for cell in table_data if cell['score'] >= 0.8]
-
-        table_data = self.merge_overlapping_columns(cropped_table, table_data)
-        table_data = self.adjust_overlapping_rows(cropped_table, table_data)
-
-        table_data_filtered = [item for item in table_data if item['label'] == 'table row']
-        # table_data_filtered = table_data
-        for cell in table_data_filtered:
-            draw_raw.rectangle(cell["bbox"], outline="red")
-        file_name_table_grid_raw = self.append_filename(file_path, "table_raw")
-        cropped_table_raw_visualized.save(file_name_table_grid_raw)
-        print("Table raw data:")
-        print(table_data_filtered)
-
-        # table, table column header, table row, table column
-        table_data_header = [cell for cell in table_data if cell['label'] == 'table column header'
-                             or cell['label'] == 'table' or cell['label'] == 'table column']
-        print("Table header data:")
-        print(table_data_header)
-
-        table_data_rows = [cell for cell in table_data if cell['label'] == 'table column'
-                           or cell['label'] == 'table row']
-        table_data_rows = self.remove_overlapping_table_header_rows(table_data_header, table_data_rows)
-        print("Table row data:")
-        print(table_data_rows)
-
-        header_cells = self.get_header_cell_coordinates(table_data_header)
-        if header_cells is not None:
-            print("Header cell coordinates:")
-            print(header_cells)
-
-            header_data = self.do_ocr_with_coordinates(header_cells, cropped_table)
-            print("Header data:")
-            print(header_data)
-
-            for cell_data in header_cells['row0']:
-                draw_header.rectangle(cell_data["cell"], outline="red")
-
-            file_name_table_grid_header = self.append_filename(file_path, "table_grid_header")
-            cropped_table_header_visualized.save(file_name_table_grid_header)
-
-        table_cells = self.get_table_cell_coordinates(table_data_rows)
-        if table_cells is not None:
-            print("Table cell coordinates:")
-            print(table_cells)
-
-            table_data = self.do_ocr_with_coordinates(table_cells, cropped_table)
-            print("Table data:")
-            print(table_data)
-
-            for row_key, row_cells in table_cells.items():
-                for cell_data in row_cells:
-                    draw.rectangle(cell_data["cell"], outline="red")
-
-            file_name_table_grid = self.append_filename(file_path, "table_grid_cells")
-            cropped_table_visualized.save(file_name_table_grid)
-
-    def get_header_cell_coordinates(self, table_data):
-        header_column = None
-        columns = []
-
-        # Separate header and columns
-        for item in table_data:
-            if item['label'] == 'table column header':
-                header_column = item['bbox']
-            elif item['label'] == 'table column':
-                columns.append(item['bbox'])
-
-        if not header_column:
-            return None

-        …
-        cells = []
-
-        # Calculate cell coordinates based on header and column intersections
-        for column in columns:
-            cell_left = column[0]
-            cell_right = column[2]
-            cell_top = header_top
-            cell_bottom = header_bottom
-
-            cells.append({
-                'cell': (cell_left, cell_top, cell_right, cell_bottom)
-            })
-
-        # Sort cells by the left coordinate (cell_left) to order them from left to right
-        cells.sort(key=lambda x: x['cell'][0])
-
-        header_row = {"row0": cells}
-
-        return header_row
-
-    def get_table_cell_coordinates(self, table_data):
-        rows = []
-        columns = []
-
-        # Separate rows and columns
-        for item in table_data:
-            if item['label'] == 'table row':
-                rows.append(item['bbox'])
-            elif item['label'] == 'table column':
-                columns.append(item['bbox'])
-
-        if not rows or not columns:
-            return None
-
-        # Sort rows by the top coordinate to ensure they are processed from top to bottom
-        rows.sort(key=lambda x: x[1])
-
-        row_cells = {}
-
-        # Calculate cell coordinates based on row and column intersections
-        for row_idx, row in enumerate(rows):
-            row_top = row[1]
-            row_bottom = row[3]
-            cells = []
-            for column in columns:
-                cell_left = column[0]
-                cell_right = column[2]
-                cell_top = row_top
-                cell_bottom = row_bottom
-
-                cells.append({
-                    'cell': (cell_left, cell_top, cell_right, cell_bottom)
-                })
-
-            # Sort cells within the row by the left coordinate to ensure they are ordered from left to right
-            cells.sort(key=lambda x: x['cell'][0])
-            row_cells[f'row{row_idx}'] = cells
-
-        return row_cells
-
-    def do_ocr_with_coordinates(self, cell_coordinates, cropped_table):
-        data = {}
-        max_num_columns = 0
-
-        # Iterate over each row in cell_coordinates
-        for row_key in cell_coordinates:
-            row_text = []
-            for cell in cell_coordinates[row_key]:
-                # Crop cell out of image
-                cell_image = cropped_table.crop(cell['cell'])
-                cell_image_np = np.array(cell_image)
-
-                # Apply OCR
-                result = self.reader.readtext(cell_image_np)
-                if result:
-                    text = " ".join([x[1] for x in result])
-                    row_text.append(text)
-                else:
-                    row_text.append("")  # If no text is detected, append an empty string
-
-            if len(row_text) > max_num_columns:
-                max_num_columns = len(row_text)
-
-            data[row_key] = row_text
-
-        print("Max number of columns:", max_num_columns)
-
-        # Pad rows which don't have max_num_columns elements
-        for row_key, row_data in data.items():
-            if len(row_data) < max_num_columns:
-                row_data += [""] * (max_num_columns - len(row_data))
-                data[row_key] = row_data
-
-        return data
-
-    def append_filename(self, file_path, word):
+    @staticmethod
+    def append_filename(file_path, debug_dir, word):
         directory, filename = os.path.split(file_path)
         name, ext = os.path.splitext(filename)
         new_filename = f"{name}_{word}{ext}"
-        return os.path.join(…
+        return os.path.join(debug_dir, new_filename)

+    @staticmethod
     def iob(boxA, boxB):
         # Determine the coordinates of the intersection rectangle
         xA = max(boxA[0], boxB[0])
@@ -443,159 +246,9 @@ class TableDetector(object):

         return iob

-    def remove_overlapping_table_header_rows(self, header_data, row_data, tolerance=1.0):
-        # Function to calculate the Intersection over Union (IoU) of two bounding boxes
-        def calculate_iou(bbox1, bbox2):
-            x1_min, y1_min, x1_max, y1_max = bbox1
-            x2_min, y2_min, x2_max, y2_max = bbox2
-
-            # Determine the coordinates of the intersection rectangle
-            inter_min_x = max(x1_min, x2_min)
-            inter_min_y = max(y1_min, y2_min)
-            inter_max_x = min(x1_max, x2_max)
-            inter_max_y = min(y1_max, y2_max)
-
-            # Compute the area of intersection
-            inter_area = max(0, inter_max_x - inter_min_x) * max(0, inter_max_y - inter_min_y)
-
-            # Compute the area of both bounding boxes
-            bbox1_area = (x1_max - x1_min) * (y1_max - y1_min)
-            bbox2_area = (x2_max - x2_min) * (y2_max - y2_min)
-
-            # Compute the Intersection over Union (IoU)
-            iou = inter_area / float(bbox1_area + bbox2_area - inter_area)
-            return iou
-
-        # Extract the bounding box of the table column header
-        header_bbox = None
-        for item in header_data:
-            if item['label'] == 'table column header':
-                header_bbox = item['bbox']
-                break
-
-        if header_bbox is None:
-            print("No 'table column header' found in header data.")
-            return row_data
-
-        # Initialize a counter for removed rows
-        removed_count = 0
-
-        # Iterate over the table row data and remove rows with overlapping bbox
-        updated_row_data = []
-        for row in row_data:
-            if row['label'] == 'table row':
-                row_bbox = row['bbox']
-                # Check for overlap (IoU > 0) or very similar bounding box
-                iou = calculate_iou(header_bbox, row_bbox)
-                if iou > 0 or np.allclose(row_bbox, header_bbox, atol=tolerance):
-                    removed_count += 1  # Increment the removed counter
-                    continue  # Skip this row as it overlaps or matches the header bbox
-
-            # Add row to the updated list if it doesn't overlap
-            updated_row_data.append(row)
-
-        # Print the number of removed rows
-        print(f"Number of removed rows: {removed_count}")
-
-        return updated_row_data
-
-    def filter_table_columns(self, data):
-        return [item for item in data if item['label'] == 'table column']
-
-    def filter_table_rows(self, data):
-        return [item for item in data if item['label'] == 'table row']
-
-    def extract_text_boundaries(self, image, box):
-        """
-        Extract the start and end coordinates of the text within a bounding box,
-        and translate them back to the original image coordinates.
-
-        Args:
-        - image: The image in which the box is located.
-        - box: The bounding box (x_min, y_min, x_max, y_max).
-        - reader: The EasyOCR reader object.

-        …
-        - text_end: The x-coordinate of the end of the text in the original image.
-        """
-        x_min, y_min, x_max, y_max = box
-        cropped_image = image.crop((x_min, y_min, x_max, y_max))
-        result = self.reader.readtext(np.array(cropped_image))
-
-        if result:
-            text_coordinates = result[0][0]  # Extract the coordinates of the text within the cropped image
-
-            # Translate the coordinates back to the original image coordinates
-            text_start = min(point[0] + x_min for point in text_coordinates)
-            text_end = max(point[0] + x_min for point in text_coordinates)
-
-            return text_start, text_end
-
-        return None, None
-
-    def merge_overlapping_columns(self, image, data, proximity_threshold=20):
-        """
-        Merge only those bounding boxes where the text is split directly by the box line,
-        while keeping other labels intact.
-
-        Args:
-        - image: The image in which the boxes are located.
-        - data: List of dictionary items with bounding boxes and labels.
-        - reader: The EasyOCR reader object.
-        - proximity_threshold: The maximum distance between text boundaries to consider merging.
-
-        Returns:
-        - Updated list of dictionary items with merged bounding boxes and other entries preserved.
-        """
-        table_columns = self.filter_table_columns(data)
-        other_entries = [item for item in data if item['label'] != 'table column']
-        merged_boxes = []
-        table_columns = sorted(table_columns, key=lambda x: x['bbox'][0])  # Sort by x_min
-
-        while table_columns:
-            box_data = table_columns.pop(0)
-            x_min, y_min, x_max, y_max = box_data['bbox']
-
-            to_merge = []
-            for i, other_box_data in enumerate(table_columns):
-                ox_min, oy_min, ox_max, oy_max = other_box_data['bbox']
-
-                # Only consider merging if the boxes are adjacent horizontally
-                if x_min < ox_max and x_max > ox_min:
-                    # Extract text boundaries from both boxes
-                    text_start_1, text_end_1 = self.extract_text_boundaries(image, box_data['bbox'])
-                    text_start_2, text_end_2 = self.extract_text_boundaries(image, other_box_data['bbox'])
-
-                    # Check if the text from one box ends very close to where the text in the next box starts
-                    if text_end_1 is not None and text_start_2 is not None and text_start_2 - text_end_1 <= proximity_threshold:
-                        x_max = max(x_max, ox_max)
-                        y_max = max(y_max, oy_max)
-                        y_min = min(y_min, oy_min)
-                        to_merge.append(i)
-
-            # Merge the boxes
-            for index in sorted(to_merge, reverse=True):
-                table_columns.pop(index)
-
-            merged_boxes.append({
-                'label': box_data['label'],
-                'score': box_data['score'],
-                'bbox': [x_min, y_min, x_max, y_max]
-            })
-
-        # Combine the merged boxes with other entries
-        final_output = merged_boxes + other_entries
-
-        # Sort final output by the y-coordinate to maintain the original order
-        final_output = sorted(final_output, key=lambda x: x['bbox'][1])
-
-        return final_output
-
-    def adjust_overlapping_rows(self, image, data, proximity_threshold=10):
-        return data
-
-    def invoke_pipeline_step(self, task_call, task_description, local):
+    @staticmethod
+    def invoke_pipeline_step(task_call, task_description, local):
         if local:
             with Progress(
                 SpinnerColumn(),
@@ -614,5 +267,9 @@ class TableDetector(object):
 if __name__ == "__main__":
     table_detector = TableDetector()

-    …
-    # table_detector.…
+    # file_path = "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png"
+    # cropped_tables = table_detector.detect_tables(file_path, local=True, debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/", debug=True)
+
+    # for i, cropped_table in enumerate(cropped_tables):
+    #     file_name_table = table_detector.append_filename(file_path, "cropped_" + str(i))
+    #     cropped_table.save(file_name_table)
{sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.11
+Version: 0.4.0
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -21,10 +21,9 @@ Requires-Dist: transformers==4.46.3
 Requires-Dist: sentence-transformers==3.3.1
 Requires-Dist: numpy==2.1.3
 Requires-Dist: pypdf==4.3.0
-Requires-Dist: easyocr==1.7.1
 Requires-Dist: gradio-client
 Requires-Dist: pdf2image
-Requires-Dist: mlx-vlm==0.1.…
+Requires-Dist: mlx-vlm==0.1.4; sys_platform == "darwin" and platform_machine == "arm64"

 # Sparrow Parse

@@ -67,7 +66,8 @@ input_data = [
 ]

 # Now you can run inference without knowing which implementation is used
-results_array, num_pages = extractor.run_inference(model_inference_instance, input_data,
+results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
+                                                   generic_query=False,
                                                    debug_dir=None,
                                                    debug=True,
                                                    mode=None)
@@ -77,6 +77,8 @@ for i, result in enumerate(results_array):
 print(f"Number of pages: {num_pages}")
 ```

+Use `tables_only=True` if you want to extract only tables.
+
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

 Method `run_inference` will return results and number of pages processed.
{sparrow_parse-0.3.11.dist-info → sparrow_parse-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,19 +1,19 @@
-sparrow_parse/__init__.py,sha256=…
+sparrow_parse/__init__.py,sha256=DObMj8zITWgJRRICOQXNFEgLDtZ9uQZUVwbNAU-P3oc,21
 sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
 sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=…
+sparrow_parse/extractors/vllm_extractor.py,sha256=SCqxdr8V_cm0COfs0TelTcBXapVcz2WffhESJ1fry0g,8716
 sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
 sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/processors/table_structure_processor.py,sha256=…
+sparrow_parse/processors/table_structure_processor.py,sha256=PQHHFdQUuTin3Mm2USuUga2n4fGWMLwiBJYq4CVD67o,9775
 sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAgFu0XjCbaLCNVyM,1980
 sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
 sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
 sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
 sparrow_parse/vllm/mlx_inference.py,sha256=xR40qwjIR0HvrN8x58oOq6F4r1hEANRB-9kcokUQHHU,4748
-sparrow_parse-0.3.11.dist-info/METADATA,sha256=…
-sparrow_parse-0.3.11.dist-info/WHEEL,sha256=…
-sparrow_parse-0.3.11.dist-info/entry_points.txt,sha256=…
-sparrow_parse-0.3.11.dist-info/top_level.txt,sha256=…
-sparrow_parse-0.3.11.dist-info/RECORD,,
+sparrow_parse-0.4.0.dist-info/METADATA,sha256=IQqfUUKnpA0ystjBmrrpSWw4b1hDYnLO4sqKdoNYEHk,6432
+sparrow_parse-0.4.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+sparrow_parse-0.4.0.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+sparrow_parse-0.4.0.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-0.4.0.dist-info/RECORD,,
File without changes
|
File without changes
|
File without changes
|