sparrow-parse 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/{extractor → extractors}/html_extractor.py +5 -6
- sparrow_parse/extractors/vllm_extractor.py +46 -0
- sparrow_parse/helpers/__init__.py +0 -0
- sparrow_parse/processors/__init__.py +0 -0
- sparrow_parse/processors/table_structure_processor.py +618 -0
- sparrow_parse/vllm/__init__.py +0 -0
- sparrow_parse/vllm/huggingface_inference.py +36 -0
- sparrow_parse/vllm/inference_base.py +7 -0
- sparrow_parse/vllm/inference_factory.py +22 -0
- sparrow_parse/vllm/local_gpu_inference.py +16 -0
- {sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/METADATA +35 -1
- sparrow_parse-0.3.4.dist-info/RECORD +23 -0
- sparrow_parse-0.3.2.dist-info/RECORD +0 -14
- /sparrow_parse/{extractor → extractors}/__init__.py +0 -0
- /sparrow_parse/{extractor/extractor_helper.py → helpers/html_extractor_helper.py} +0 -0
- /sparrow_parse/{extractor → helpers}/pdf_optimizer.py +0 -0
- /sparrow_parse/{extractor → processors}/markdown_processor.py +0 -0
- /sparrow_parse/{extractor → processors}/unstructured_processor.py +0 -0
- {sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/top_level.txt +0 -0
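The most visible change in 0.3.4 is the package reorganisation: the old sparrow_parse.extractor package is split into extractors, helpers and processors. A minimal import sketch of the new layout, based only on the renamed paths listed above and the import lines in the diffs below (not an official migration guide):

```
# 0.3.2 layout (old package, shown in the previous README example)
# from sparrow_parse.extractor.html_extractor import HTMLExtractor

# 0.3.4 layout, per the renames above
from sparrow_parse.extractors.html_extractor import HTMLExtractor
from sparrow_parse.helpers.html_extractor_helper import merge_html_table_headers
```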
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = '0.3.2'
+__version__ = '0.3.4'

sparrow_parse/{extractor → extractors}/html_extractor.py RENAMED
@@ -3,9 +3,8 @@ from sentence_transformers import SentenceTransformer, util
 from bs4 import BeautifulSoup
 import json
 from rich.progress import Progress, SpinnerColumn, TextColumn
-from .
-from .
-import re
+from sparrow_parse.helpers.html_extractor_helper import merge_html_table_headers
+from sparrow_parse.helpers.html_extractor_helper import clean_html_table_header_names


 class HTMLExtractor(object):
@@ -221,8 +220,8 @@ class HTMLExtractor(object):


 if __name__ == "__main__":
-    # to run for debugging, navigate
-    # python -m
+    # to run for debugging, navigate above sparrow_parse and run the following command:
+    # python -m sparrow_parse.extractors.html_extractor

     # with open('data/invoice_1_table.txt', 'r') as file:
     # file_content = file.read()
@@ -233,7 +232,7 @@ if __name__ == "__main__":

     extractor = HTMLExtractor()

-    # answer, targets_unprocessed =
+    # answer, targets_unprocessed = extractors.read_data(
     # # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
     # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
     # 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],

sparrow_parse/extractors/vllm_extractor.py ADDED
@@ -0,0 +1,46 @@
+from sparrow_parse.vllm.inference_factory import InferenceFactory
+from rich import print
+import os
+
+
+class VLLMExtractor(object):
+    def __init__(self):
+        pass
+
+    def run_inference(self, model_inference_instance, input_data, generic_query=False, debug=False):
+        if generic_query:
+            input_data[0]["text_input"] = "retrieve document data. return response in JSON format"
+
+        if debug:
+            print("Input Data:", input_data)
+
+        result = model_inference_instance.inference(input_data)
+
+        return result
+
+if __name__ == "__main__":
+    extractor = VLLMExtractor()
+
+    # export HF_TOKEN="hf_"
+    config = {
+        "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+        "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+        "hf_token": os.getenv('HF_TOKEN'),
+        # Additional fields for local GPU inference
+        # "device": "cuda", "model_path": "model.pth"
+    }
+
+    # Use the factory to get the correct instance
+    factory = InferenceFactory(config)
+    model_inference_instance = factory.get_inference_instance()
+
+    input_data = [
+        {
+            "image": "/Users/andrejb/Documents/work/epik/bankstatement/bonds_table.png",
+            "text_input": "retrieve financial instruments data. return response in JSON format"
+        }
+    ]
+
+    # Now you can run inference without knowing which implementation is used
+    result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
+    print("Inference Result:", result)

sparrow_parse/helpers/__init__.py ADDED
File without changes

sparrow_parse/processors/__init__.py ADDED
File without changes

sparrow_parse/processors/table_structure_processor.py ADDED
@@ -0,0 +1,618 @@
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich import print
+from transformers import AutoModelForObjectDetection
+from transformers import TableTransformerForObjectDetection
+import torch
+from PIL import Image
+from torchvision import transforms
+from PIL import ImageDraw
+import os
+import numpy as np
+import easyocr
+
+
+class TableDetector(object):
+    def __init__(self):
+        self.reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
+
+    class MaxResize(object):
+        def __init__(self, max_size=800):
+            self.max_size = max_size
+
+        def __call__(self, image):
+            width, height = image.size
+            current_max_size = max(width, height)
+            scale = self.max_size / current_max_size
+            resized_image = image.resize((int(round(scale * width)), int(round(scale * height))))
+
+            return resized_image
+
+    def detect_table(self, file_path, options, local=True, debug=False):
+        model, device = self.invoke_pipeline_step(
+            lambda: self.load_table_detection_model(),
+            "Loading table detection model...",
+            local
+        )
+
+        outputs, image = self.invoke_pipeline_step(
+            lambda: self.prepare_image(file_path, model, device),
+            "Preparing image for table detection...",
+            local
+        )
+
+        objects = self.invoke_pipeline_step(
+            lambda: self.identify_tables(model, outputs, image),
+            "Identifying tables in the image...",
+            local
+        )
+
+        cropped_table = self.invoke_pipeline_step(
+            lambda: self.crop_table(file_path, image, objects),
+            "Cropping tables from the image...",
+            local
+        )
+
+        structure_model = self.invoke_pipeline_step(
+            lambda: self.load_table_structure_model(device),
+            "Loading table structure recognition model...",
+            local
+        )
+
+        structure_outputs = self.invoke_pipeline_step(
+            lambda: self.get_table_structure(cropped_table, structure_model, device),
+            "Getting table structure from cropped table...",
+            local
+        )
+
+        table_data = self.invoke_pipeline_step(
+            lambda: self.get_structure_cells(structure_model, cropped_table, structure_outputs),
+            "Getting structure cells from cropped table...",
+            local
+        )
+
+        self.invoke_pipeline_step(
+            lambda: self.process_table_structure(table_data, cropped_table, file_path),
+            "Processing structure cells...",
+            local
+        )
+
+
+    def load_table_detection_model(self):
+        model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm")
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+
+        return model, device
+
+    def load_table_structure_model(self, device):
+        structure_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
+        structure_model.to(device)
+
+        return structure_model
+
+    def prepare_image(self, file_path, model, device):
+        image = Image.open(file_path).convert("RGB")
+
+        detection_transform = transforms.Compose([
+            self.MaxResize(800),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+
+        pixel_values = detection_transform(image).unsqueeze(0)
+        pixel_values = pixel_values.to(device)
+
+        with torch.no_grad():
+            outputs = model(pixel_values)
+
+        return outputs, image
+
+    def identify_tables(self, model, outputs, image):
+        id2label = model.config.id2label
+        id2label[len(model.config.id2label)] = "no object"
+
+        objects = self.outputs_to_objects(outputs, image.size, id2label)
+        return objects
+
+    def crop_table(self, file_path, image, objects):
+        tokens = []
+        detection_class_thresholds = {
+            "table": 0.5,
+            "table rotated": 0.5,
+            "no object": 10
+        }
+        crop_padding = 10
+
+        tables_crops = self.objects_to_crops(image, tokens, objects, detection_class_thresholds, padding=crop_padding)
+
+        cropped_table = None
+
+        if len(tables_crops) == 0:
+            print("No tables detected.")
+            return
+        elif len(tables_crops) > 1:
+            for i, table_crop in enumerate(tables_crops):
+                cropped_table = table_crop['image'].convert("RGB")
+                file_name_table = self.append_filename(file_path, f"table_{i}")
+                cropped_table.save(file_name_table)
+                break
+        else:
+            cropped_table = tables_crops[0]['image'].convert("RGB")
+
+            file_name_table = self.append_filename(file_path, "table")
+            cropped_table.save(file_name_table)
+
+        return cropped_table
+
+    # for output bounding box post-processing
+    def box_cxcywh_to_xyxy(self, x):
+        x_c, y_c, w, h = x.unbind(-1)
+        b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+        return torch.stack(b, dim=1)
+
+    def rescale_bboxes(self, out_bbox, size):
+        img_w, img_h = size
+        b = self.box_cxcywh_to_xyxy(out_bbox)
+        b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
+        return b
+
+    def outputs_to_objects(self, outputs, img_size, id2label):
+        m = outputs.logits.softmax(-1).max(-1)
+        pred_labels = list(m.indices.detach().cpu().numpy())[0]
+        pred_scores = list(m.values.detach().cpu().numpy())[0]
+        pred_bboxes = outputs['pred_boxes'].detach().cpu()[0]
+        pred_bboxes = [elem.tolist() for elem in self.rescale_bboxes(pred_bboxes, img_size)]
+
+        objects = []
+        for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
+            class_label = id2label[int(label)]
+            if not class_label == 'no object':
+                objects.append({'label': class_label, 'score': float(score),
+                                'bbox': [float(elem) for elem in bbox]})
+
+        return objects
+
+    def objects_to_crops(self, img, tokens, objects, class_thresholds, padding=10):
+        """
+        Process the bounding boxes produced by the table detection model into
+        cropped table images and cropped tokens.
+        """
+
+        table_crops = []
+        for obj in objects:
+            if obj['score'] < class_thresholds[obj['label']]:
+                continue
+
+            cropped_table = {}
+
+            bbox = obj['bbox']
+            bbox = [bbox[0] - padding, bbox[1] - padding, bbox[2] + padding, bbox[3] + padding]
+
+            cropped_img = img.crop(bbox)
+
+            table_tokens = [token for token in tokens if self.iob(token['bbox'], bbox) >= 0.5]
+            for token in table_tokens:
+                token['bbox'] = [token['bbox'][0] - bbox[0],
+                                 token['bbox'][1] - bbox[1],
+                                 token['bbox'][2] - bbox[0],
+                                 token['bbox'][3] - bbox[1]]
+
+            # If table is predicted to be rotated, rotate cropped image and tokens/words:
+            if obj['label'] == 'table rotated':
+                cropped_img = cropped_img.rotate(270, expand=True)
+                for token in table_tokens:
+                    bbox = token['bbox']
+                    bbox = [cropped_img.size[0] - bbox[3] - 1,
+                            bbox[0],
+                            cropped_img.size[0] - bbox[1] - 1,
+                            bbox[2]]
+                    token['bbox'] = bbox
+
+            cropped_table['image'] = cropped_img
+            cropped_table['tokens'] = table_tokens
+
+            table_crops.append(cropped_table)
+
+        return table_crops
+
+    def get_table_structure(self, cropped_table, structure_model, device):
+        structure_transform = transforms.Compose([
+            self.MaxResize(1000),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        ])
+
+        pixel_values = structure_transform(cropped_table).unsqueeze(0)
+        pixel_values = pixel_values.to(device)
+
+        with torch.no_grad():
+            outputs = structure_model(pixel_values)
+
+        return outputs
+
+    def get_structure_cells(self, structure_model, cropped_table, outputs):
+        structure_id2label = structure_model.config.id2label
+        structure_id2label[len(structure_id2label)] = "no object"
+
+        cells = self.outputs_to_objects(outputs, cropped_table.size, structure_id2label)
+
+        return cells
+
+    def process_table_structure(self, table_data, cropped_table, file_path):
+        cropped_table_raw_visualized = cropped_table.copy()
+        draw_raw = ImageDraw.Draw(cropped_table_raw_visualized)
+        cropped_table_header_visualized = cropped_table.copy()
+        draw_header = ImageDraw.Draw(cropped_table_header_visualized)
+        cropped_table_visualized = cropped_table.copy()
+        draw = ImageDraw.Draw(cropped_table_visualized)
+
+        table_data = [cell for cell in table_data if cell['label'] != 'table spanning cell']
+        table_data = [cell for cell in table_data if cell['label'] != 'table']
+        table_data = [cell for cell in table_data if cell['score'] >= 0.8]
+
+        table_data = self.merge_overlapping_columns(cropped_table, table_data)
+        table_data = self.adjust_overlapping_rows(cropped_table, table_data)
+
+        table_data_filtered = [item for item in table_data if item['label'] == 'table row']
+        # table_data_filtered = table_data
+        for cell in table_data_filtered:
+            draw_raw.rectangle(cell["bbox"], outline="red")
+        file_name_table_grid_raw = self.append_filename(file_path, "table_raw")
+        cropped_table_raw_visualized.save(file_name_table_grid_raw)
+        print("Table raw data:")
+        print(table_data_filtered)
+
+        # table, table column header, table row, table column
+        table_data_header = [cell for cell in table_data if cell['label'] == 'table column header'
+                             or cell['label'] == 'table' or cell['label'] == 'table column']
+        print("Table header data:")
+        print(table_data_header)
+
+        table_data_rows = [cell for cell in table_data if cell['label'] == 'table column'
+                           or cell['label'] == 'table row']
+        table_data_rows = self.remove_overlapping_table_header_rows(table_data_header, table_data_rows)
+        print("Table row data:")
+        print(table_data_rows)
+
+        header_cells = self.get_header_cell_coordinates(table_data_header)
+        if header_cells is not None:
+            print("Header cell coordinates:")
+            print(header_cells)
+
+            header_data = self.do_ocr_with_coordinates(header_cells, cropped_table)
+            print("Header data:")
+            print(header_data)
+
+            for cell_data in header_cells['row0']:
+                draw_header.rectangle(cell_data["cell"], outline="red")
+
+            file_name_table_grid_header = self.append_filename(file_path, "table_grid_header")
+            cropped_table_header_visualized.save(file_name_table_grid_header)
+
+        table_cells = self.get_table_cell_coordinates(table_data_rows)
+        if table_cells is not None:
+            print("Table cell coordinates:")
+            print(table_cells)
+
+            table_data = self.do_ocr_with_coordinates(table_cells, cropped_table)
+            print("Table data:")
+            print(table_data)
+
+            for row_key, row_cells in table_cells.items():
+                for cell_data in row_cells:
+                    draw.rectangle(cell_data["cell"], outline="red")
+
+            file_name_table_grid = self.append_filename(file_path, "table_grid_cells")
+            cropped_table_visualized.save(file_name_table_grid)
+
+    def get_header_cell_coordinates(self, table_data):
+        header_column = None
+        columns = []
+
+        # Separate header and columns
+        for item in table_data:
+            if item['label'] == 'table column header':
+                header_column = item['bbox']
+            elif item['label'] == 'table column':
+                columns.append(item['bbox'])
+
+        if not header_column:
+            return None
+
+        header_top = header_column[1]
+        header_bottom = header_column[3]
+
+        cells = []
+
+        # Calculate cell coordinates based on header and column intersections
+        for column in columns:
+            cell_left = column[0]
+            cell_right = column[2]
+            cell_top = header_top
+            cell_bottom = header_bottom
+
+            cells.append({
+                'cell': (cell_left, cell_top, cell_right, cell_bottom)
+            })
+
+        # Sort cells by the left coordinate (cell_left) to order them from left to right
+        cells.sort(key=lambda x: x['cell'][0])
+
+        header_row = {"row0": cells}
+
+        return header_row
+
+    def get_table_cell_coordinates(self, table_data):
+        rows = []
+        columns = []
+
+        # Separate rows and columns
+        for item in table_data:
+            if item['label'] == 'table row':
+                rows.append(item['bbox'])
+            elif item['label'] == 'table column':
+                columns.append(item['bbox'])
+
+        if not rows or not columns:
+            return None
+
+        # Sort rows by the top coordinate to ensure they are processed from top to bottom
+        rows.sort(key=lambda x: x[1])
+
+        row_cells = {}
+
+        # Calculate cell coordinates based on row and column intersections
+        for row_idx, row in enumerate(rows):
+            row_top = row[1]
+            row_bottom = row[3]
+            cells = []
+            for column in columns:
+                cell_left = column[0]
+                cell_right = column[2]
+                cell_top = row_top
+                cell_bottom = row_bottom
+
+                cells.append({
+                    'cell': (cell_left, cell_top, cell_right, cell_bottom)
+                })
+
+            # Sort cells within the row by the left coordinate to ensure they are ordered from left to right
+            cells.sort(key=lambda x: x['cell'][0])
+            row_cells[f'row{row_idx}'] = cells
+
+        return row_cells
+
+    def do_ocr_with_coordinates(self, cell_coordinates, cropped_table):
+        data = {}
+        max_num_columns = 0
+
+        # Iterate over each row in cell_coordinates
+        for row_key in cell_coordinates:
+            row_text = []
+            for cell in cell_coordinates[row_key]:
+                # Crop cell out of image
+                cell_image = cropped_table.crop(cell['cell'])
+                cell_image_np = np.array(cell_image)
+
+                # Apply OCR
+                result = self.reader.readtext(cell_image_np)
+                if result:
+                    text = " ".join([x[1] for x in result])
+                    row_text.append(text)
+                else:
+                    row_text.append("") # If no text is detected, append an empty string
+
+            if len(row_text) > max_num_columns:
+                max_num_columns = len(row_text)
+
+            data[row_key] = row_text
+
+        print("Max number of columns:", max_num_columns)
+
+        # Pad rows which don't have max_num_columns elements
+        for row_key, row_data in data.items():
+            if len(row_data) < max_num_columns:
+                row_data += [""] * (max_num_columns - len(row_data))
+            data[row_key] = row_data
+
+        return data
+
+    def append_filename(self, file_path, word):
+        directory, filename = os.path.split(file_path)
+        name, ext = os.path.splitext(filename)
+        new_filename = f"{name}_{word}{ext}"
+        return os.path.join(directory, new_filename)
+
+    def iob(boxA, boxB):
+        # Determine the coordinates of the intersection rectangle
+        xA = max(boxA[0], boxB[0])
+        yA = max(boxA[1], boxB[1])
+        xB = min(boxA[2], boxB[2])
+        yB = min(boxA[3], boxB[3])
+
+        # Compute the area of intersection rectangle
+        interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
+
+        # Compute the area of both the prediction and ground-truth rectangles
+        boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
+        boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
+
+        # Compute the intersection over box (IoB)
+        iob = interArea / float(boxAArea)
+
+        return iob
+
+    def remove_overlapping_table_header_rows(self, header_data, row_data, tolerance=1.0):
+        # Function to calculate the Intersection over Union (IoU) of two bounding boxes
+        def calculate_iou(bbox1, bbox2):
+            x1_min, y1_min, x1_max, y1_max = bbox1
+            x2_min, y2_min, x2_max, y2_max = bbox2
+
+            # Determine the coordinates of the intersection rectangle
+            inter_min_x = max(x1_min, x2_min)
+            inter_min_y = max(y1_min, y2_min)
+            inter_max_x = min(x1_max, x2_max)
+            inter_max_y = min(y1_max, y2_max)
+
+            # Compute the area of intersection
+            inter_area = max(0, inter_max_x - inter_min_x) * max(0, inter_max_y - inter_min_y)
+
+            # Compute the area of both bounding boxes
+            bbox1_area = (x1_max - x1_min) * (y1_max - y1_min)
+            bbox2_area = (x2_max - x2_min) * (y2_max - y2_min)
+
+            # Compute the Intersection over Union (IoU)
+            iou = inter_area / float(bbox1_area + bbox2_area - inter_area)
+            return iou
+
+        # Extract the bounding box of the table column header
+        header_bbox = None
+        for item in header_data:
+            if item['label'] == 'table column header':
+                header_bbox = item['bbox']
+                break
+
+        if header_bbox is None:
+            print("No 'table column header' found in header data.")
+            return row_data
+
+        # Initialize a counter for removed rows
+        removed_count = 0
+
+        # Iterate over the table row data and remove rows with overlapping bbox
+        updated_row_data = []
+        for row in row_data:
+            if row['label'] == 'table row':
+                row_bbox = row['bbox']
+                # Check for overlap (IoU > 0) or very similar bounding box
+                iou = calculate_iou(header_bbox, row_bbox)
+                if iou > 0 or np.allclose(row_bbox, header_bbox, atol=tolerance):
+                    removed_count += 1 # Increment the removed counter
+                    continue # Skip this row as it overlaps or matches the header bbox
+
+            # Add row to the updated list if it doesn't overlap
+            updated_row_data.append(row)
+
+        # Print the number of removed rows
+        print(f"Number of removed rows: {removed_count}")
+
+        return updated_row_data
+
+    def filter_table_columns(self, data):
+        return [item for item in data if item['label'] == 'table column']
+
+    def filter_table_rows(self, data):
+        return [item for item in data if item['label'] == 'table row']
+
+    def extract_text_boundaries(self, image, box):
+        """
+        Extract the start and end coordinates of the text within a bounding box,
+        and translate them back to the original image coordinates.
+
+        Args:
+        - image: The image in which the box is located.
+        - box: The bounding box (x_min, y_min, x_max, y_max).
+        - reader: The EasyOCR reader object.
+
+        Returns:
+        - text_start: The x-coordinate of the start of the text in the original image.
+        - text_end: The x-coordinate of the end of the text in the original image.
+        """
+        x_min, y_min, x_max, y_max = box
+        cropped_image = image.crop((x_min, y_min, x_max, y_max))
+        result = self.reader.readtext(np.array(cropped_image))
+
+        if result:
+            text_coordinates = result[0][0] # Extract the coordinates of the text within the cropped image
+
+            # Translate the coordinates back to the original image coordinates
+            text_start = min(point[0] + x_min for point in text_coordinates)
+            text_end = max(point[0] + x_min for point in text_coordinates)
+
+            return text_start, text_end
+
+        return None, None
+
+    def merge_overlapping_columns(self, image, data, proximity_threshold=20):
+        """
+        Merge only those bounding boxes where the text is split directly by the box line,
+        while keeping other labels intact.
+
+        Args:
+        - image: The image in which the boxes are located.
+        - data: List of dictionary items with bounding boxes and labels.
+        - reader: The EasyOCR reader object.
+        - proximity_threshold: The maximum distance between text boundaries to consider merging.
+
+        Returns:
+        - Updated list of dictionary items with merged bounding boxes and other entries preserved.
+        """
+        table_columns = self.filter_table_columns(data)
+        other_entries = [item for item in data if item['label'] != 'table column']
+        merged_boxes = []
+        table_columns = sorted(table_columns, key=lambda x: x['bbox'][0]) # Sort by x_min
+
+        while table_columns:
+            box_data = table_columns.pop(0)
+            x_min, y_min, x_max, y_max = box_data['bbox']
+
+            to_merge = []
+            for i, other_box_data in enumerate(table_columns):
+                ox_min, oy_min, ox_max, oy_max = other_box_data['bbox']
+
+                # Only consider merging if the boxes are adjacent horizontally
+                if x_min < ox_max and x_max > ox_min:
+                    # Extract text boundaries from both boxes
+                    text_start_1, text_end_1 = self.extract_text_boundaries(image, box_data['bbox'])
+                    text_start_2, text_end_2 = self.extract_text_boundaries(image, other_box_data['bbox'])
+
+                    # Check if the text from one box ends very close to where the text in the next box starts
+                    if text_end_1 is not None and text_start_2 is not None and text_start_2 - text_end_1 <= proximity_threshold:
+                        x_max = max(x_max, ox_max)
+                        y_max = max(y_max, oy_max)
+                        y_min = min(y_min, oy_min)
+                        to_merge.append(i)
+
+            # Merge the boxes
+            for index in sorted(to_merge, reverse=True):
+                table_columns.pop(index)
+
+            merged_boxes.append({
+                'label': box_data['label'],
+                'score': box_data['score'],
+                'bbox': [x_min, y_min, x_max, y_max]
+            })
+
+        # Combine the merged boxes with other entries
+        final_output = merged_boxes + other_entries
+
+        # Sort final output by the y-coordinate to maintain the original order
+        final_output = sorted(final_output, key=lambda x: x['bbox'][1])
+
+        return final_output
+
+    def adjust_overlapping_rows(self, image, data, proximity_threshold=10):
+        return data
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+
+if __name__ == "__main__":
+    table_detector = TableDetector()
+
+    table_detector.detect_table("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.jpg", None, local=True, debug=False)
+    # table_detector.detect_table("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.jpg", None, local=True, debug=False)

sparrow_parse/vllm/__init__.py ADDED
File without changes

sparrow_parse/vllm/huggingface_inference.py ADDED
@@ -0,0 +1,36 @@
+from gradio_client import Client, handle_file
+from sparrow_parse.vllm.inference_base import ModelInference
+import json
+
+
+class HuggingFaceInference(ModelInference):
+    def __init__(self, hf_space, hf_token):
+        self.hf_space = hf_space
+        self.hf_token = hf_token
+
+
+    def process_response(self, output_text):
+        json_string = output_text
+
+        json_string = json_string.strip("[]'")
+        json_string = json_string.replace("```json\n", "").replace("\n```", "")
+        json_string = json_string.replace("'", "")
+
+        try:
+            formatted_json = json.loads(json_string)
+            return json.dumps(formatted_json, indent=2)
+        except json.JSONDecodeError as e:
+            print("Failed to parse JSON:", e)
+            return output_text
+
+
+    def inference(self, input_data):
+        client = Client(self.hf_space, hf_token=self.hf_token)
+
+        result = client.predict(
+            image=handle_file(input_data[0]["image"]),
+            text_input=input_data[0]["text_input"],
+            api_name="/run_inference"
+        )
+
+        return self.process_response(result)

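The hunk for sparrow_parse/vllm/inference_base.py (+7 lines, per the file list above) is not reproduced in this rendering. Judging from HuggingFaceInference and LocalGPUInference, which both subclass ModelInference and implement inference(), it presumably defines the shared base class; a minimal sketch under that assumption, not the verbatim file contents:

```
# assumed shape of sparrow_parse/vllm/inference_base.py
from abc import ABC, abstractmethod


class ModelInference(ABC):
    @abstractmethod
    def inference(self, input_data):
        pass  # subclasses implement the actual model call
```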
sparrow_parse/vllm/inference_factory.py ADDED
@@ -0,0 +1,22 @@
+from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference
+from sparrow_parse.vllm.local_gpu_inference import LocalGPUInference
+
+
+class InferenceFactory:
+    def __init__(self, config):
+        self.config = config
+
+    def get_inference_instance(self):
+        if self.config["method"] == "huggingface":
+            return HuggingFaceInference(hf_space=self.config["hf_space"], hf_token=self.config["hf_token"])
+        elif self.config["method"] == "local_gpu":
+            model = self._load_local_model() # Replace with actual model loading logic
+            return LocalGPUInference(model=model, device=self.config.get("device", "cuda"))
+        else:
+            raise ValueError(f"Unknown method: {self.config['method']}")
+
+    def _load_local_model(self):
+        # Example: Load a PyTorch model (replace with actual loading code)
+        # model = torch.load('model.pth')
+        # return model
+        raise NotImplementedError("Model loading logic not implemented")

sparrow_parse/vllm/local_gpu_inference.py ADDED
@@ -0,0 +1,16 @@
+import torch
+from sparrow_parse.vllm.inference_base import ModelInference
+
+
+class LocalGPUInference(ModelInference):
+    def __init__(self, model, device='cuda'):
+        self.model = model
+        self.device = device
+        self.model.to(self.device)
+
+    def inference(self, input_data):
+        self.model.eval() # Set the model to evaluation mode
+        with torch.no_grad(): # No need to calculate gradients
+            input_tensor = torch.tensor(input_data).to(self.device)
+            output = self.model(input_tensor)
+        return output.cpu().numpy() # Convert the output back to NumPy if necessary

{sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.2
+Version: 0.3.4
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -25,6 +25,8 @@ Requires-Dist: transformers ==4.41.2
 Requires-Dist: sentence-transformers ==3.0.1
 Requires-Dist: numpy ==1.26.4
 Requires-Dist: pypdf ==4.3.0
+Requires-Dist: easyocr ==1.7.1
+Requires-Dist: gradio-client

 # Sparrow Parse

@@ -96,6 +98,8 @@ Example:

 ## Parsing and extraction

+### HTML extractor
+
 ```
 from sparrow_parse.extractor.html_extractor import HTMLExtractor

@@ -128,6 +132,36 @@ Example:

 *debug* - `True`

+### Sparrow Parse VL (vision-language) extractor
+
+```
+extractor = VLLMExtractor()
+
+# export HF_TOKEN="hf_"
+config = {
+    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    "hf_token": os.getenv('HF_TOKEN'),
+    # Additional fields for local GPU inference
+    # "device": "cuda", "model_path": "model.pth"
+}
+
+# Use the factory to get the correct instance
+factory = InferenceFactory(config)
+model_inference_instance = factory.get_inference_instance()
+
+input_data = [
+    {
+        "image": "/Users/andrejb/Documents/work/epik/bankstatement/bonds_table.png",
+        "text_input": "retrieve financial instruments data. return response in JSON format"
+    }
+]
+
+# Now you can run inference without knowing which implementation is used
+result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
+print("Inference Result:", result)
+```
+
 ## PDF optimization

 ```

sparrow_parse-0.3.4.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
+sparrow_parse/__init__.py,sha256=SH0xuWVUkyLHZJwWBZ8GJoeliTeYFcqA6TWJgrkLv-U,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
+sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractors/html_extractor.py,sha256=qe9Oz7J-GiIE8G1kIDMOeh96xe6P59Gyh5SjgV3v2c8,9977
+sparrow_parse/extractors/vllm_extractor.py,sha256=Qwmf-SW4z_UstiiynX5TkyovlkokVhLuzcbUVZ16TXM,1540
+sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/helpers/html_extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
+sparrow_parse/helpers/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
+sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/processors/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
+sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
+sparrow_parse/processors/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
+sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/vllm/huggingface_inference.py,sha256=Q2Ju65LDzbO-8RWW7cXzrR-pbZ1zKuPVODlKOTWKg_E,1114
+sparrow_parse/vllm/inference_base.py,sha256=W0N2khehGdF1XHzZACG3I1UZaydHMk6BZgWNvaJD4Ck,197
+sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
+sparrow_parse/vllm/local_gpu_inference.py,sha256=I_uWYiFAQhRrykOKbVz69NzftDxuemDKtAye4kWhtnU,617
+sparrow_parse-0.3.4.dist-info/METADATA,sha256=L7qXKxktk42gUQlBlZAdzHQqfORoC6vBwRCd-VSwv3Y,7444
+sparrow_parse-0.3.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sparrow_parse-0.3.4.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+sparrow_parse-0.3.4.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-0.3.4.dist-info/RECORD,,

sparrow_parse-0.3.2.dist-info/RECORD DELETED
@@ -1,14 +0,0 @@
-sparrow_parse/__init__.py,sha256=64UBVh2KX7E-WVG4ZyY1dUiW9jGXZloWZk1N9nEUC2k,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
-sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
-sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
-sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
-sparrow_parse/extractor/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
-sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
-sparrow_parse-0.3.2.dist-info/METADATA,sha256=BA_M_vHGpbJuXvivXHJLCIejtdGHFatOrUVJve1USXY,6422
-sparrow_parse-0.3.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-sparrow_parse-0.3.2.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
-sparrow_parse-0.3.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
-sparrow_parse-0.3.2.dist-info/RECORD,,

sparrow_parse/{extractor → extractors}/__init__.py RENAMED
File without changes

sparrow_parse/{extractor/extractor_helper.py → helpers/html_extractor_helper.py} RENAMED
File without changes

sparrow_parse/{extractor → helpers}/pdf_optimizer.py RENAMED
File without changes

sparrow_parse/{extractor → processors}/markdown_processor.py RENAMED
File without changes

sparrow_parse/{extractor → processors}/unstructured_processor.py RENAMED
File without changes

{sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/WHEEL RENAMED
File without changes

{sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/entry_points.txt RENAMED
File without changes

{sparrow_parse-0.3.2.dist-info → sparrow_parse-0.3.4.dist-info}/top_level.txt RENAMED
File without changes