sparrow-parse 1.0.4a0__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +19 -18
- sparrow_parse/text_extraction.py +194 -13
- sparrow_parse/vllm/huggingface_inference.py +1 -1
- sparrow_parse/vllm/inference_base.py +1 -1
- sparrow_parse/vllm/inference_factory.py +2 -3
- sparrow_parse/vllm/local_gpu_inference.py +1 -1
- sparrow_parse/vllm/mlx_inference.py +302 -217
- {sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/METADATA +4 -2
- sparrow_parse-1.0.6.dist-info/RECORD +21 -0
- sparrow_parse-1.0.4a0.dist-info/RECORD +0 -21
- {sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/WHEEL +0 -0
- {sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.0.
+__version__ = '1.0.6'
sparrow_parse/extractors/vllm_extractor.py
CHANGED
@@ -14,13 +14,14 @@ class VLLMExtractor(object):
         pass
 
     def run_inference(self, model_inference_instance, input_data, tables_only=False,
-                      generic_query=False, crop_size=None, debug_dir=None, debug=False, mode=None):
+                      generic_query=False, crop_size=None, apply_annotation=False, debug_dir=None, debug=False, mode=None):
         """
         Main entry point for processing input data using a model inference instance.
         Handles generic queries, PDFs, and table extraction.
         """
         if generic_query:
             input_data[0]["text_input"] = "retrieve document data. return response in JSON format"
+            apply_annotation = False
 
         if debug:
             print("Input data:", input_data)
@@ -37,12 +38,12 @@ class VLLMExtractor(object):
         # Document data extraction inference (file_path exists and is not None)
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
-            return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
+            return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir, mode)
         else:
-            return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
+            return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir)
 
 
-    def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
+    def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir, mode):
         """
         Handles processing and inference for PDF files, including page splitting and optional table extraction.
         """
@@ -50,21 +51,21 @@ class VLLMExtractor(object):
         num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
                                                                              debug_dir, convert_to_images=True)
 
-        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir)
+        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir)
 
         # Clean up temporary directory
         shutil.rmtree(temp_dir, ignore_errors=True)
         return results, num_pages
 
 
-    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir):
+    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir):
         """
         Handles processing and inference for non-PDF files, with optional table extraction.
         """
         file_path = input_data[0]["file_path"]
 
         if tables_only:
-            return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
+            return self._extract_tables(model_inference_instance, file_path, input_data, apply_annotation, debug, debug_dir), 1
         else:
             temp_dir = tempfile.mkdtemp()
 
@@ -77,13 +78,13 @@ class VLLMExtractor(object):
 
         file_path = input_data[0]["file_path"]
         input_data[0]["file_path"] = [file_path]
-        results = model_inference_instance.inference(input_data)
+        results = model_inference_instance.inference(input_data, apply_annotation)
 
         shutil.rmtree(temp_dir, ignore_errors=True)
 
         return results, 1
 
-    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir):
+    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, crop_size, apply_annotation, debug, debug_dir):
         """
         Processes individual pages (PDF split) and handles table extraction or inference.
 
@@ -93,6 +94,7 @@ class VLLMExtractor(object):
             input_data: Input data for inference.
             tables_only: Whether to only process tables.
             crop_size: Size for cropping image borders.
+            apply_annotation: Flag to apply annotations to the output.
             debug: Debug flag for logging.
             debug_dir: Directory for saving debug information.
 
@@ -106,9 +108,7 @@ class VLLMExtractor(object):
             print(f"Processing {len(output_files)} pages for table extraction.")
             # Process each page individually for table extraction
             for i, file_path in enumerate(output_files):
-                tables_result = self._extract_tables(
-                    model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
-                )
+                tables_result = self._extract_tables( model_inference_instance, file_path, input_data, apply_annotation, debug, debug_dir, page_index=i)
                 # Since _extract_tables returns a list with one JSON string, unpack it
                 results_array.extend(tables_result)  # Unpack the single JSON string
         else:
@@ -141,7 +141,7 @@ class VLLMExtractor(object):
             input_data[0]["file_path"] = output_files
 
             # Process all files at once
-            results = model_inference_instance.inference(input_data)
+            results = model_inference_instance.inference(input_data, apply_annotation)
             results_array.extend(results)
 
             # Clean up temporary directory
@@ -150,7 +150,7 @@ class VLLMExtractor(object):
         return results_array
 
 
-    def _extract_tables(self, model_inference_instance, file_path, input_data, debug, debug_dir, page_index=None):
+    def _extract_tables(self, model_inference_instance, file_path, input_data, apply_annotation, debug, debug_dir, page_index=None):
        """
        Detects and processes tables from an input file.
        """
@@ -175,7 +175,7 @@ class VLLMExtractor(object):
             table.save(output_filename, "JPEG")
 
             input_data[0]["file_path"] = [output_filename]
-            result = self._run_model_inference(model_inference_instance, input_data)
+            result = self._run_model_inference(model_inference_instance, input_data, apply_annotation)
             results_array.append(result)
 
         shutil.rmtree(temp_dir, ignore_errors=True)
@@ -191,11 +191,11 @@ class VLLMExtractor(object):
 
 
     @staticmethod
-    def _run_model_inference(model_inference_instance, input_data):
+    def _run_model_inference(model_inference_instance, input_data, apply_annotation):
         """
         Runs model inference and handles JSON decoding.
         """
-        result = model_inference_instance.inference(input_data)[0]
+        result = model_inference_instance.inference(input_data, apply_annotation)[0]
         try:
             return json.loads(result) if isinstance(result, str) else result
         except json.JSONDecodeError:
@@ -230,7 +230,7 @@ if __name__ == "__main__":
     # input_data = [
     #     {
     #         "file_path": "sparrow_parse/images/bonds_table.png",
-    #         "text_input": "retrieve
+    #         "text_input": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
     #     }
     # ]
     #
@@ -245,6 +245,7 @@ if __name__ == "__main__":
     # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
     #                                                    generic_query=False,
     #                                                    crop_size=0,
+    #                                                    apply_annotation=False,
     #                                                    debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
     #                                                    debug=True,
     #                                                    mode=None)
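For context, here is a minimal sketch of how the new apply_annotation flag is threaded through a call, pieced together from the commented-out __main__ example above. The no-argument VLLMExtractor() constructor, the image path, and the model name are assumptions for illustration, not values shipped with the package.

from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
from sparrow_parse.vllm.mlx_inference import MLXInference

extractor = VLLMExtractor()  # assumed no-arg constructor, as in the commented example
model_inference_instance = MLXInference(model_name="mlx-community/Qwen2.5-VL-72B-Instruct-4bit")

input_data = [
    {
        "file_path": "sparrow_parse/images/bonds_table.png",  # placeholder path
        "text_input": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
    }
]

# apply_annotation is the new keyword in 1.0.6; run_inference forces it back to False
# whenever generic_query=True.
results_array, num_pages = extractor.run_inference(
    model_inference_instance,
    input_data,
    tables_only=False,
    generic_query=False,
    crop_size=0,
    apply_annotation=True,
    debug=True,
    mode=None,
)
print(results_array, num_pages)

Because every backend's inference() now accepts the flag as its second argument, the extractor simply passes it straight through.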
sparrow_parse/text_extraction.py
CHANGED
@@ -1,35 +1,216 @@
 from mlx_vlm import load, apply_chat_template, generate
 from mlx_vlm.utils import load_image
+from PIL import ImageDraw, ImageFont
+import json
 
 
 # Load model and processor
-
-vl_model, vl_processor = load("mlx-community/Qwen2.5-VL-
+vl_model, vl_processor = load("mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit")
+# vl_model, vl_processor = load("mlx-community/Qwen2.5-VL-72B-Instruct-4bit")
 vl_config = vl_model.config
 
 image = load_image("images/bonds_table.png")
 
-
-
-
-]
+# Qwen
+# messages = [
+#     {"role": "system", "content": "You are an expert at extracting text from images. Format your response in JSON."},
+#     {"role": "user", "content": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"}
+# ]
+# Qwen with bbox
+# messages = [
+#     {"role": "system", "content": "You are an expert at extracting text from images. For each item in the table, provide separate bounding boxes for each field. All coordinates should be in pixels relative to the original image. Format your response in JSON."},
+#     {"role": "user", "content": "retrieve [{\"instrument_name\":{\"value\":\"str\", \"bbox\":[\"float\", \"float\", \"float\", \"float\"], \"confidence\":\"float\"}, \"valuation\":{\"value\":\"int\", \"bbox\":[\"float\", \"float\", \"float\", \"float\"], \"confidence\":\"float\"}}]. return response in JSON format"}
+# ]
+# Qwen with bbox, get all data
+# messages = [
+#     {"role": "system", "content": "You are an expert at extracting text from images. For each item in the table, provide separate bounding boxes for each field. All coordinates should be in pixels relative to the original image. Format your response in JSON."},
+#     {"role": "user", "content": "retrieve all data. return response in JSON format. For each identified field or data element, include: 1) a descriptive field name as the object key, 2) a nested object with 'value' containing the extracted content, 'bbox' array with [x_min, y_min, x_max, y_max] coordinates in pixels, and 'confidence' score between 0-1. Example structure: [{\"field_name\":{\"value\":\"extracted value\", \"bbox\":[100, 200, 300, 250], \"confidence\":0.95}}]"}
+# ]
 
+# Mistral
 # message = "retrieve all data. return response in JSON format"
-
+message = "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
 
-#
-prompt = apply_chat_template(vl_processor, vl_config, messages)
-#
+# Qwen
+# prompt = apply_chat_template(vl_processor, vl_config, messages)
+# Mistral
+prompt = apply_chat_template(vl_processor, vl_config, message)
 
 # Generate text
-vl_output = generate(
+vl_output, _ = generate(
     vl_model,
     vl_processor,
     prompt,
     image,
-    max_tokens=
+    max_tokens=4000,
     temperature=0,
     verbose=False
 )
 
-print(vl_output)
+print(vl_output)
+
+
+# Comment out below code if non Qwen model is used
+
+# # Convert to a format we can draw on
+# img_draw = image.copy()
+# draw = ImageDraw.Draw(img_draw)
+#
+# # Parse the JSON result
+# results = json.loads(vl_output.strip('```json\n').strip('```'))
+#
+# # Predefined solid colors that are highly visible
+# solid_colors = [
+#     (180, 30, 40),   # Dark red
+#     (0, 100, 140),   # Dark blue
+#     (30, 120, 40),   # Dark green
+#     (140, 60, 160),  # Purple
+#     (200, 100, 0),   # Orange
+#     (100, 80, 0),    # Brown
+#     (0, 100, 100),   # Teal
+#     (120, 40, 100)   # Magenta
+# ]
+#
+# # Determine unique field keys across all items to assign consistent colors
+# unique_fields = set()
+# for item in results:
+#     unique_fields.update(item.keys())
+#
+# # Map each unique field to a color
+# field_color_map = {}
+# for i, field in enumerate(sorted(unique_fields)):
+#     field_color_map[field] = solid_colors[i % len(solid_colors)]
+#
+# # Load font with larger size
+# font_size = 20
+# try:
+#     font = ImageFont.truetype("arial.ttf", font_size)
+# except IOError:
+#     try:
+#         font = ImageFont.truetype("DejaVuSans.ttf", font_size)
+#     except IOError:
+#         try:
+#             font = ImageFont.truetype("Helvetica.ttf", font_size)
+#         except IOError:
+#             font = ImageFont.load_default()
+#
+#
+# # Helper function to measure text width
+# def get_text_dimensions(text, font):
+#     try:
+#         # Method for newer Pillow versions
+#         left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
+#         return right - left, bottom - top
+#     except AttributeError:
+#         try:
+#             # Alternative method
+#             left, top, right, bottom = font.getbbox(text)
+#             return right - left, bottom - top
+#         except AttributeError:
+#             # Fallback approximation
+#             return len(text) * (font_size // 2), font_size + 2
+#
+#
+# # Draw bounding boxes for each item
+# for item in results:
+#     # Process each field
+#     for field_name, field_data in item.items():
+#         # Check if this field has the expected structure
+#         if isinstance(field_data, dict) and "bbox" in field_data and "value" in field_data:
+#             bbox = field_data["bbox"]
+#             value = field_data["value"]
+#             confidence = field_data.get("confidence", "N/A")
+#
+#             # Check if coordinates need to be scaled (normalized 0-1 values)
+#             if all(isinstance(coord, (int, float)) for coord in bbox):
+#                 if max(bbox) <= 1.0:  # Normalized coordinates
+#                     width, height = image.size
+#                     bbox = [
+#                         bbox[0] * width,
+#                         bbox[1] * height,
+#                         bbox[2] * width,
+#                         bbox[3] * height
+#                     ]
+#
+#             # Get color from the mapping we created
+#             color = field_color_map[field_name]
+#
+#             # Make sure bbox coordinates are integers
+#             bbox = [int(coord) for coord in bbox]
+#
+#             # Calculate the bbox width
+#             bbox_width = bbox[2] - bbox[0]
+#
+#             # Draw rectangle with appropriate thickness
+#             border_thickness = 3
+#             draw.rectangle(
+#                 [(bbox[0], bbox[1]), (bbox[2], bbox[3])],
+#                 outline=color,
+#                 width=border_thickness
+#             )
+#
+#             # Format the value and confidence
+#             value_str = str(value)
+#             confidence_str = f" [{confidence:.2f}]" if isinstance(confidence, (int, float)) else ""
+#             prefix = f"{field_name}: "
+#
+#             # First, try with full text without truncation
+#             full_label = prefix + value_str + confidence_str
+#             full_width, text_height = get_text_dimensions(full_label, font)
+#
+#             # Compare with a reasonable maximum display width
+#             min_display_width = 300  # Reasonable minimum width to display text
+#             max_display_width = max(bbox_width * 1.5, min_display_width)
+#
+#             # Only truncate if the full text exceeds our maximum display width
+#             if full_width > max_display_width:
+#                 # Calculate the space available for the value
+#                 prefix_width, _ = get_text_dimensions(prefix, font)
+#                 confidence_width, _ = get_text_dimensions(confidence_str, font)
+#                 available_value_width = max_display_width - prefix_width - confidence_width
+#
+#                 # Truncate the value to fit
+#                 truncated_value = value_str
+#                 for i in range(len(value_str) - 1, 3, -1):
+#                     truncated_value = value_str[:i] + "..."
+#                     temp_width, _ = get_text_dimensions(truncated_value, font)
+#                     if temp_width <= available_value_width:
+#                         break
+#
+#                 label = prefix + truncated_value + confidence_str
+#                 text_width, _ = get_text_dimensions(label, font)
+#             else:
+#                 # No truncation needed
+#                 label = full_label
+#                 text_width = full_width
+#
+#             # Position for text (above the bounding box)
+#             padding = 6
+#             text_position = (bbox[0], bbox[1] - text_height - (padding * 2))
+#
+#             # Ensure text doesn't go off the top of the image
+#             if text_position[1] < padding:
+#                 # If too close to top, position below the box instead
+#                 text_position = (bbox[0], bbox[3] + padding)
+#
+#             # Add a background rectangle with better contrast
+#             draw.rectangle(
+#                 [(text_position[0] - padding, text_position[1] - padding),
+#                  (text_position[0] + text_width + padding, text_position[1] + text_height + padding)],
+#                 fill=(255, 255, 255, 240),
+#                 outline=color,
+#                 width=2
+#             )
+#
+#             # Draw the text
+#             draw.text(
+#                 text_position,
+#                 label,
+#                 fill=color,
+#                 font=font
+#             )
+#
+# # Save the annotated image
+# output_path = "images/bonds_table_annotated.png"
+# img_draw.save(output_path)
+# print(f"Annotated image saved to {output_path}")
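The commented-out annotation block above scales any bounding box whose coordinates look normalized (all values at most 1.0) into pixel space before drawing and then rounds to integers. A small standalone restatement of that check, with a function name chosen here for illustration:

def to_pixel_bbox(bbox, image_width, image_height):
    """Scale [x_min, y_min, x_max, y_max] to pixels when the values look normalized (all <= 1.0)."""
    if all(isinstance(c, (int, float)) for c in bbox) and max(bbox) <= 1.0:
        bbox = [bbox[0] * image_width, bbox[1] * image_height,
                bbox[2] * image_width, bbox[3] * image_height]
    return [int(c) for c in bbox]

print(to_pixel_bbox([0.1, 0.2, 0.5, 0.4], 1000, 800))  # -> [100, 160, 500, 320]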
sparrow_parse/vllm/huggingface_inference.py
CHANGED
@@ -26,7 +26,7 @@ class HuggingFaceInference(ModelInference):
         return output_text
 
 
-    def inference(self, input_data, mode=None):
+    def inference(self, input_data, apply_annotation=False, mode=None):
         if mode == "static":
             simple_json = self.get_simple_json()
             return [simple_json]
sparrow_parse/vllm/inference_factory.py
CHANGED
@@ -1,6 +1,6 @@
 from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference
 from sparrow_parse.vllm.local_gpu_inference import LocalGPUInference
-
+from sparrow_parse.vllm.mlx_inference import MLXInference
 
 
 class InferenceFactory:
@@ -14,8 +14,7 @@ class InferenceFactory:
             model = self._load_local_model()  # Replace with actual model loading logic
             return LocalGPUInference(model=model, device=self.config.get("device", "cuda"))
         elif self.config["method"] == "mlx":
-
-            return None
+            return MLXInference(model_name=self.config["model_name"])
         else:
             raise ValueError(f"Unknown method: {self.config['method']}")
 
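The hunk above is the key wiring change of this release: the "mlx" branch now returns a working backend instead of None. A standalone restatement of that branch, assuming only that the factory receives a config dict with "method" and "model_name" keys (the name of the enclosing factory method is not visible in this hunk, so a generic function is used here):

from sparrow_parse.vllm.mlx_inference import MLXInference

def create_inference(config):
    # mirrors the branch shown above; the "huggingface" and "local_gpu" branches are omitted
    if config["method"] == "mlx":
        return MLXInference(model_name=config["model_name"])
    raise ValueError(f"Unknown method: {config['method']}")

backend = create_inference({"method": "mlx", "model_name": "mlx-community/Qwen2.5-VL-72B-Instruct-4bit"})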
sparrow_parse/vllm/local_gpu_inference.py
CHANGED
@@ -8,7 +8,7 @@ class LocalGPUInference(ModelInference):
         self.device = device
         self.model.to(self.device)
 
-    def inference(self, input_data, mode=None):
+    def inference(self, input_data, apply_annotation=False, mode=None):
         self.model.eval()  # Set the model to evaluation mode
         with torch.no_grad():  # No need to calculate gradients
             input_tensor = torch.tensor(input_data).to(self.device)
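Taken together with the HuggingFaceInference change above, every concrete backend now exposes the same call shape, inference(input_data, apply_annotation=False, mode=None), which is what lets VLLMExtractor pass the flag positionally. The inference_base.py hunk (+1 -1) is not reproduced in this diff view; the sketch below only mirrors the three concrete signatures and is not the package's actual base class.

from abc import ABC, abstractmethod

class ModelInferenceSketch(ABC):
    """Illustrative only: the call shape shared by the 1.0.6 backends."""

    @abstractmethod
    def inference(self, input_data, apply_annotation=False, mode=None):
        ...

# VLLMExtractor calls it positionally, e.g. backend.inference(input_data, apply_annotation)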
sparrow_parse/vllm/mlx_inference.py
CHANGED
@@ -1,217 +1,302 @@
+from mlx_vlm import load, generate
+from mlx_vlm.prompt_utils import apply_chat_template
+from mlx_vlm.utils import load_image
+from sparrow_parse.vllm.inference_base import ModelInference
+import os
+import json, re
+from rich import print
+
+
+class MLXInference(ModelInference):
+    """
+    A class for performing inference using the MLX model.
+    Handles image preprocessing, response formatting, and model interaction.
+    """
+
+    def __init__(self, model_name):
+        """
+        Initialize the inference class with the given model name.
+
+        :param model_name: Name of the model to load.
+        """
+        self.model_name = model_name
+        print(f"MLXInference initialized for model: {model_name}")
+
+
+    @staticmethod
+    def _load_model_and_processor(model_name):
+        """
+        Load the model and processor for inference.
+
+        :param model_name: Name of the model to load.
+        :return: Tuple containing the loaded model and processor.
+        """
+        model, processor = load(model_name)
+        print(f"Loaded model: {model_name}")
+        return model, processor
+
+
+    def process_response(self, output_text):
+        """
+        Process and clean the model's raw output to format as JSON.
+        """
+        try:
+            # Check if we have markdown code block markers
+            if "```" in output_text:
+                # Handle markdown-formatted output
+                json_start = output_text.find("```json")
+                if json_start != -1:
+                    # Extract content between ```json and ```
+                    content = output_text[json_start + 7:]
+                    json_end = content.rfind("```")
+                    if json_end != -1:
+                        content = content[:json_end].strip()
+                        formatted_json = json.loads(content)
+                        return json.dumps(formatted_json, indent=2)
+
+            # Handle raw JSON (no markdown formatting)
+            # First try to find JSON array or object patterns
+            for pattern in [r'\[\s*\{.*\}\s*\]', r'\{.*\}']:
+                import re
+                matches = re.search(pattern, output_text, re.DOTALL)
+                if matches:
+                    potential_json = matches.group(0)
+                    try:
+                        formatted_json = json.loads(potential_json)
+                        return json.dumps(formatted_json, indent=2)
+                    except:
+                        pass
+
+            # Last resort: try to parse the whole text as JSON
+            formatted_json = json.loads(output_text.strip())
+            return json.dumps(formatted_json, indent=2)
+
+        except Exception as e:
+            print(f"Failed to parse JSON: {e}")
+            return output_text
+
+
+    def load_image_data(self, image_filepath, max_width=1250, max_height=1750):
+        """
+        Load and resize image while maintaining its aspect ratio.
+
+        :param image_filepath: Path to the image file.
+        :param max_width: Maximum allowed width of the image.
+        :param max_height: Maximum allowed height of the image.
+        :return: Tuple containing the image object and its new dimensions.
+        """
+        image = load_image(image_filepath)  # Assuming load_image is defined elsewhere
+        width, height = image.size
+
+        # Calculate new dimensions while maintaining the aspect ratio
+        if width > max_width or height > max_height:
+            aspect_ratio = width / height
+            new_width = min(max_width, int(max_height * aspect_ratio))
+            new_height = min(max_height, int(max_width / aspect_ratio))
+            return image, new_width, new_height
+
+        return image, width, height
+
+
+    def inference(self, input_data, apply_annotation=False, mode=None):
+        """
+        Perform inference on input data using the specified model.
+
+        :param input_data: A list of dictionaries containing image file paths and text inputs.
+        :param apply_annotation: Optional flag to apply annotations to the output.
+        :param mode: Optional mode for inference ("static" for simple JSON output).
+        :return: List of processed model responses.
+        """
+        # Handle static mode
+        if mode == "static":
+            return [self.get_simple_json()]
+
+        # Load the model and processor
+        model, processor = self._load_model_and_processor(self.model_name)
+        config = model.config
+
+        # Determine if we're doing text-only or image-based inference
+        is_text_only = input_data[0].get("file_path") is None
+
+        if is_text_only:
+            # Text-only inference
+            messages = input_data[0]["text_input"]
+            response = self._generate_text_response(model, processor, config, messages)
+            results = [response]
+        else:
+            # Image-based inference
+            file_paths = self._extract_file_paths(input_data)
+            results = self._process_images(model, processor, config, file_paths, input_data, apply_annotation)
+
+        return results
+
+    def _generate_text_response(self, model, processor, config, messages):
+        """
+        Generate a text response for text-only inputs.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param messages: Input messages
+        :return: Generated response
+        """
+        prompt = apply_chat_template(processor, config, messages)
+        response = generate(
+            model,
+            processor,
+            prompt,
+            max_tokens=4000,
+            temperature=0.0,
+            verbose=False
+        )
+        print("Inference completed successfully")
+        return response
+
+    def _process_images(self, model, processor, config, file_paths, input_data, apply_annotation):
+        """
+        Process images and generate responses for each.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param file_paths: List of image file paths
+        :param input_data: Original input data
+        :param apply_annotation: Flag to apply annotations
+        :return: List of processed responses
+        """
+        results = []
+        for file_path in file_paths:
+            image, width, height = self.load_image_data(file_path)
+
+            # Prepare messages based on model type
+            messages = self._prepare_messages(input_data, apply_annotation)
+
+            # Generate and process response
+            prompt = apply_chat_template(processor, config, messages)
+            response, _ = generate(
+                model,
+                processor,
+                prompt,
+                image,
+                resize_shape=(width, height),
+                max_tokens=4000,
+                temperature=0.0,
+                verbose=False
+            )
+            results.append(self.process_response(response))
+            print(f"Inference completed successfully for: {file_path}")
+
+        return results
+
+
+    def transform_query_with_bbox(self, text_input):
+        """
+        Transform JSON schema in text_input to include value, bbox, and confidence.
+        Works with both array and object JSON structures.
+
+        Args:
+            text_input (str): The input text containing a JSON schema
+
+        Returns:
+            str: Text with transformed JSON including value, bbox, and confidence
+        """
+        # Split text into parts - find the JSON portion between "retrieve" and "return response"
+        retrieve_pattern = r'retrieve\s+'
+        return_pattern = r'\.\s+return\s+response'
+
+        retrieve_match = re.search(retrieve_pattern, text_input)
+        return_match = re.search(return_pattern, text_input)
+
+        if not retrieve_match or not return_match:
+            return text_input  # Return original if pattern not found
+
+        json_start = retrieve_match.end()
+        json_end = return_match.start()
+
+        prefix = text_input[:json_start]
+        json_str = text_input[json_start:json_end].strip()
+        suffix = text_input[json_end:]
+
+        # Parse and transform the JSON
+        try:
+            # Handle single quotes if needed
+            json_str = json_str.replace("'", '"')
+
+            json_obj = json.loads(json_str)
+            transformed_json = self.transform_query_structure(json_obj)
+            transformed_json_str = json.dumps(transformed_json)
+
+            # Rebuild the text
+            result = prefix + transformed_json_str + suffix
+
+            return result
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON: {e}")
+            return text_input  # Return original if parsing fails
+
+
+    def transform_query_structure(self, json_obj):
+        """
+        Transform each field in the JSON structure to include value, bbox, and confidence.
+        Handles both array and object formats recursively.
+        """
+        if isinstance(json_obj, list):
+            # Handle array format
+            return [self.transform_query_structure(item) for item in json_obj]
+        elif isinstance(json_obj, dict):
+            # Handle object format
+            result = {}
+            for key, value in json_obj.items():
+                if isinstance(value, (dict, list)):
+                    # Recursively transform nested objects or arrays
+                    result[key] = self.transform_query_structure(value)
+                else:
+                    # Transform simple value to object with value, bbox, and confidence
+                    result[key] = {
+                        "value": value,
+                        "bbox": ["float", "float", "float", "float"],
+                        "confidence": "float"
+                    }
+            return result
+        else:
+            # For primitive values, no transformation needed
+            return json_obj
+
+
+    def _prepare_messages(self, input_data, apply_annotation):
+        """
+        Prepare the appropriate messages based on the model type.
+
+        :param input_data: Original input data
+        :param apply_annotation: Flag to apply annotations
+        :return: Properly formatted messages
+        """
+        if "mistral" in self.model_name.lower():
+            return input_data[0]["text_input"]
+        elif "qwen" in self.model_name.lower():
+            if apply_annotation:
+                system_prompt = {"role": "system", "content": "You are an expert at extracting text from images. "
+                                                              "For each item in the table, provide separate bounding boxes for each field. "
+                                                              "All coordinates should be in pixels relative to the original image. Format your response in JSON."}
+                user_prompt = {"role": "user", "content": self.transform_query_with_bbox(input_data[0]["text_input"])}
+                return [system_prompt, user_prompt]
+            return [
+                {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                {"role": "user", "content": input_data[0]["text_input"]},
+            ]
+        else:
+            raise ValueError("Unsupported model type. Please use either Mistral or Qwen.")
+
+    @staticmethod
+    def _extract_file_paths(input_data):
+        """
+        Extract and resolve absolute file paths from input data.
+
+        :param input_data: List of dictionaries containing image file paths.
+        :return: List of absolute file paths.
+        """
+        return [
+            os.path.abspath(file_path)
+            for data in input_data
+            for file_path in data.get("file_path", [])
+        ]
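The most involved addition is transform_query_with_bbox / transform_query_structure, which rewrites the JSON schema embedded between "retrieve" and ". return response" so that every leaf field asks for a value, a bbox, and a confidence. A self-contained restatement of that transformation (the helper name below is chosen here for illustration; the regex anchors and the expanded leaf shape mirror the diff):

import json
import re

def expand_schema_with_bbox(text_input):
    retrieve_match = re.search(r'retrieve\s+', text_input)
    return_match = re.search(r'\.\s+return\s+response', text_input)
    if not retrieve_match or not return_match:
        return text_input  # leave untouched when the expected pattern is absent

    schema = json.loads(text_input[retrieve_match.end():return_match.start()].strip().replace("'", '"'))

    def expand(node):
        if isinstance(node, list):
            return [expand(item) for item in node]
        if isinstance(node, dict):
            return {key: expand(value) if isinstance(value, (dict, list))
                    else {"value": value, "bbox": ["float", "float", "float", "float"], "confidence": "float"}
                    for key, value in node.items()}
        return node

    return text_input[:retrieve_match.end()] + json.dumps(expand(schema)) + text_input[return_match.start():]

query = 'retrieve [{"instrument_name":"str", "valuation":"int"}]. return response in JSON format'
print(expand_schema_with_bbox(query))
# retrieve [{"instrument_name": {"value": "str", "bbox": [...], "confidence": "float"}, ...}]. return response in JSON format

In the class itself this expanded query is only used for Qwen models when apply_annotation is set; Mistral models receive the raw text_input.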
{sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 1.0.4a0
+Version: 1.0.6
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -20,9 +20,11 @@ Requires-Dist: torchvision >=0.22.0
 Requires-Dist: torch >=2.7.0
 Requires-Dist: sentence-transformers >=4.1.0
 Requires-Dist: numpy >=2.2.5
-Requires-Dist: pypdf >=5.
+Requires-Dist: pypdf >=5.5.0
 Requires-Dist: gradio-client >=1.7.2
 Requires-Dist: pdf2image >=1.17.0
+Requires-Dist: mlx >=0.25.2 ; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: mlx-vlm ==0.1.26 ; sys_platform == "darwin" and platform_machine == "arm64"
 
 # Sparrow Parse
 
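The two new Requires-Dist lines use PEP 508 environment markers, so mlx and mlx-vlm are only pulled in on Apple Silicon macOS. For illustration, a consuming project could declare the same conditional pins roughly like this (hypothetical setup.py, not part of sparrow-parse):

from setuptools import setup

setup(
    name="example-consumer",  # hypothetical project
    install_requires=[
        "sparrow-parse>=1.0.6",
        'mlx>=0.25.2; sys_platform == "darwin" and platform_machine == "arm64"',
        'mlx-vlm==0.1.26; sys_platform == "darwin" and platform_machine == "arm64"',
    ],
)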
sparrow_parse-1.0.6.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+sparrow_parse/__init__.py,sha256=zrUEHc9dmvLJ5ka5maZk9TTHoZ21dwKsENXeOSwXM3o,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/text_extraction.py,sha256=uhYVNK5Q2FZnw1Poa3JWjtN-aEL7cyKpvaltdn0m2II,8948
+sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractors/vllm_extractor.py,sha256=rgc3Ic25F89i7-LPAis-bNuq4os12iJPeAz6SCYSPj0,11029
+sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
+sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
+sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/processors/table_structure_processor.py,sha256=BCYnrsqngEu0WpBORcefdnCUgCCT12fFWdrFqvdXAwc,9787
+sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/vllm/huggingface_inference.py,sha256=RqYmP-wh_cm_BZ271HbejnZe30S5EHxQiAkzoNxZ6BY,2004
+sparrow_parse/vllm/inference_base.py,sha256=AmWF1OUjJLxSEK_WCbcRpXHX3cKk8nPJJHha_X-9Gs4,844
+sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
+sparrow_parse/vllm/local_gpu_inference.py,sha256=SIyprv12fYawwfxgQ7ZOTM5WmMfQqhO_9vbereRpZdk,652
+sparrow_parse/vllm/mlx_inference.py,sha256=wNysikBBU5tTg3u2902EkhJOoliccHydL4IXHOW6j3I,11824
+sparrow_parse-1.0.6.dist-info/METADATA,sha256=RKzmkA3uaUQ9g5kJivfYp7S20_gbqeiZ_uJA7_fbmQQ,7229
+sparrow_parse-1.0.6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+sparrow_parse-1.0.6.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
+sparrow_parse-1.0.6.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-1.0.6.dist-info/RECORD,,
sparrow_parse-1.0.4a0.dist-info/RECORD
REMOVED
@@ -1,21 +0,0 @@
-sparrow_parse/__init__.py,sha256=uaGkUYEjwal6HsB_xcaWl4f22MLGxgYjrQfnOg_f2FE,22
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
-sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=ZxYiSrdKWLcBXn4LUuvEcDH0q_Ua8xTzqmEF15puP08,10557
-sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
-sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
-sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/processors/table_structure_processor.py,sha256=BCYnrsqngEu0WpBORcefdnCUgCCT12fFWdrFqvdXAwc,9787
-sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAgFu0XjCbaLCNVyM,1980
-sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
-sparrow_parse/vllm/inference_factory.py,sha256=Qd8233Xj9321ZhPEBW0bPpk4pfkIOcYnqoyyNcRCByI,1194
-sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
-sparrow_parse/vllm/mlx_inference.py,sha256=sLSt0qN--RuJAApWX2HgYfX0ZDiZqZbgI7LRxioy73s,8315
-sparrow_parse-1.0.4a0.dist-info/METADATA,sha256=2Jtk-kXCCG_RjTxtq3CPGI-K-nHDZrHsuAxtcBA_nlc,7053
-sparrow_parse-1.0.4a0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-sparrow_parse-1.0.4a0.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
-sparrow_parse-1.0.4a0.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
-sparrow_parse-1.0.4a0.dist-info/RECORD,,
{sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/WHEEL: file without changes
{sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/entry_points.txt: file without changes
{sparrow_parse-1.0.4a0.dist-info → sparrow_parse-1.0.6.dist-info}/top_level.txt: file without changes