PyPI - sparrow-parse - Versions diffs - 0.5.0__tar.gz → 0.5.2__tar.gz - Mend

sparrow-parse 0.5.0 (tar.gz) → 0.5.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.5.0
+Version: 0.5.2
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -60,6 +60,7 @@ input_data = [
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
                                  generic_query=False,
+                                 crop_size=80,
                                  debug_dir=None,
                                  debug=True,
                                  mode=None)
@@ -71,6 +72,8 @@ print(f"Number of pages: {num_pages}")
 Use `tables_only=True` if you want to extract only tables.
+Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 Method `run_inference` will return results and number of pages processed.
@@ -95,7 +98,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
 pdf_optimizer = PDFOptimizer()
 num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
-                                                                     output_directory,
+                                                                     debug_dir,
                                                                      convert_to_images)
 ```
@@ -104,10 +107,30 @@ Example:
 *file_path* - `/data/invoice_1.pdf`
-*output_directory* - set to not `None`, for debug purposes only
+*debug_dir* - set to not `None`, for debug purposes only
 *convert_to_images* - default `False`, to split into PDF files
+## Image cropping
+```
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
+image_optimizer = ImageOptimizer()
+cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+```
+Example:
+*file_path* - `/data/invoice_1.jpg`
+*temp_dir* - directory to store cropped files
+*debug_dir* - set to not `None`, for debug purposes only
+*crop_size* - Number of pixels to crop from each border
 ## Library build
 Create Python virtual environment

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/README.md RENAMED Viewed

@@ -41,6 +41,7 @@ input_data = [
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
                                  generic_query=False,
+                                 crop_size=80,
                                  debug_dir=None,
                                  debug=True,
                                  mode=None)
@@ -52,6 +53,8 @@ print(f"Number of pages: {num_pages}")
 Use `tables_only=True` if you want to extract only tables.
+Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 Method `run_inference` will return results and number of pages processed.
@@ -76,7 +79,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
 pdf_optimizer = PDFOptimizer()
 num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
-                                                                     output_directory,
+                                                                     debug_dir,
                                                                      convert_to_images)
 ```
@@ -85,10 +88,30 @@ Example:
 *file_path* - `/data/invoice_1.pdf`
-*output_directory* - set to not `None`, for debug purposes only
+*debug_dir* - set to not `None`, for debug purposes only
 *convert_to_images* - default `False`, to split into PDF files
+## Image cropping
+```
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
+image_optimizer = ImageOptimizer()
+cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+```
+Example:
+*file_path* - `/data/invoice_1.jpg`
+*temp_dir* - directory to store cropped files
+*debug_dir* - set to not `None`, for debug purposes only
+*crop_size* - Number of pixels to crop from each border
 ## Library build
 Create Python virtual environment

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/setup.py RENAMED Viewed

@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
 setup(
     name="sparrow-parse",
-    version="0.5.0",
+    version="0.5.2",
     author="Andrej Baranovskij",
     author_email="andrejus.baranovskis@gmail.com",
     description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",

sparrow-parse-0.5.2/sparrow_parse/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = '0.5.2'

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/extractors/vllm_extractor.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import json
 from sparrow_parse.vllm.inference_factory import InferenceFactory
 from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
 from sparrow_parse.processors.table_structure_processor import TableDetector
 from rich import print
 import os
@@ -14,7 +14,7 @@ class VLLMExtractor(object):
         pass
     def run_inference(self, model_inference_instance, input_data, tables_only=False,
-                      generic_query=False, debug_dir=None, debug=False, mode=None):
+                      generic_query=False, crop_size=None, debug_dir=None, debug=False, mode=None):
         """
         Main entry point for processing input data using a model inference instance.
         Handles generic queries, PDFs, and table extraction.
@@ -27,12 +27,12 @@ class VLLMExtractor(object):
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
-            return self._process_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir, mode)
+            return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
-        return self._process_non_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir)
+        return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
-    def _process_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir, mode):
+    def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
         """
         Handles processing and inference for PDF files, including page splitting and optional table extraction.
         """
@@ -40,26 +40,40 @@ class VLLMExtractor(object):
         num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
                                                                              debug_dir, convert_to_images=True)
-        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, debug, debug_dir)
+        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir)
         # Clean up temporary directory
         shutil.rmtree(temp_dir, ignore_errors=True)
         return results, num_pages
-    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir):
+    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir):
         """
         Handles processing and inference for non-PDF files, with optional table extraction.
         """
         file_path = input_data[0]["file_path"]
         if tables_only:
             return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
         else:
+            temp_dir = tempfile.mkdtemp()
+            if crop_size:
+                if debug:
+                    print(f"Cropping image borders by {crop_size} pixels.")
+                image_optimizer = ImageOptimizer()
+                cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+                input_data[0]["file_path"] = cropped_file_path
+            file_path = input_data[0]["file_path"]
             input_data[0]["file_path"] = [file_path]
             results = model_inference_instance.inference(input_data)
+            shutil.rmtree(temp_dir, ignore_errors=True)
             return results, 1
-    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, debug, debug_dir):
+    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir):
         """
         Processes individual pages (PDF split) and handles table extraction or inference.
@@ -68,6 +82,7 @@ class VLLMExtractor(object):
             output_files: List of file paths for the split PDF pages.
             input_data: Input data for inference.
             tables_only: Whether to only process tables.
+            crop_size: Size for cropping image borders.
             debug: Debug flag for logging.
             debug_dir: Directory for saving debug information.
@@ -89,11 +104,39 @@ class VLLMExtractor(object):
         else:
             if debug:
                 print(f"Processing {len(output_files)} pages for inference at once.")
-            # Pass all output files to the inference method for processing at once
-            input_data[0]["file_path"] = output_files
+            temp_dir = tempfile.mkdtemp()
+            cropped_files = []
+            if crop_size:
+                if debug:
+                    print(f"Cropping image borders by {crop_size} pixels from {len(output_files)} images.")
+                image_optimizer = ImageOptimizer()
+                # Process each file in the output_files array
+                for file_path in output_files:
+                    cropped_file_path = image_optimizer.crop_image_borders(
+                        file_path,
+                        temp_dir,
+                        debug_dir,
+                        crop_size
+                    )
+                    cropped_files.append(cropped_file_path)
+                # Use the cropped files for inference
+                input_data[0]["file_path"] = cropped_files
+            else:
+                # If no cropping needed, use original files directly
+                input_data[0]["file_path"] = output_files
+            # Process all files at once
             results = model_inference_instance.inference(input_data)
             results_array.extend(results)
+            # Clean up temporary directory
+            shutil.rmtree(temp_dir, ignore_errors=True)
         return results_array
@@ -174,8 +217,9 @@ if __name__ == "__main__":
     # ]
     #
     # # Now you can run inference without knowing which implementation is used
-    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
+    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
     #                                                    generic_query=False,
+    #                                                    crop_size=80,
     #                                                    debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
     #                                                    debug=True,
     #                                                    mode=None)

sparrow-parse-0.5.2/sparrow_parse/helpers/image_optimizer.py ADDED Viewed

@@ -0,0 +1,59 @@
+from PIL import Image
+import os
+class ImageOptimizer(object):
+    def __init__(self):
+        pass
+    def crop_image_borders(self, file_path, temp_dir, debug_dir=None, crop_size=60):
+        """
+        Crops all four borders of an image by the specified size.
+        Args:
+            file_path (str): Path to the input image
+            temp_dir (str): Temporary directory to store the cropped image
+            debug_dir (str, optional): Directory to save a debug copy of the cropped image
+            crop_size (int): Number of pixels to crop from each border
+        Returns:
+            str: Path to the cropped image in temp_dir
+        """
+        try:
+            # Open the image
+            with Image.open(file_path) as img:
+                # Get image dimensions
+                width, height = img.size
+                # Calculate the crop box
+                left = crop_size
+                top = crop_size
+                right = width - crop_size
+                bottom = height - crop_size
+                # Ensure we're not trying to crop more than the image size
+                if right <= left or bottom <= top:
+                    raise ValueError("Crop size is too large for the image dimensions")
+                # Perform the crop
+                cropped_img = img.crop((left, top, right, bottom))
+                # Get original filename without path
+                filename = os.path.basename(file_path)
+                name, ext = os.path.splitext(filename)
+                # Save cropped image in temp_dir
+                output_path = os.path.join(temp_dir, f"{name}_cropped{ext}")
+                cropped_img.save(output_path)
+                # If debug_dir is provided, save a debug copy
+                if debug_dir:
+                    os.makedirs(debug_dir, exist_ok=True)
+                    debug_path = os.path.join(debug_dir, f"{name}_cropped_debug{ext}")
+                    cropped_img.save(debug_path)
+                    print(f"Debug cropped image saved to: {debug_path}")
+                return output_path
+        except Exception as e:
+            raise Exception(f"Error processing image: {str(e)}")

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/helpers/pdf_optimizer.py RENAMED Viewed

@@ -9,7 +9,7 @@ class PDFOptimizer(object):
     def __init__(self):
         pass
-    def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
+    def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False):
         # Create a temporary directory
         temp_dir = tempfile.mkdtemp()
         output_files = []
@@ -30,9 +30,9 @@ class PDFOptimizer(object):
                         writer.write(output_file)
                         output_files.append(output_filename)
-                    if output_dir:
+                    if debug_dir:
                         # Save each page to the debug folder
-                        debug_output_filename = os.path.join(output_dir, f'page_{page_num + 1}.pdf')
+                        debug_output_filename = os.path.join(debug_dir, f'page_{page_num + 1}.pdf')
                         with open(debug_output_filename, 'wb') as output_file:
                             writer.write(output_file)
@@ -49,10 +49,12 @@ class PDFOptimizer(object):
                 image.save(output_filename, 'JPEG')
                 output_files.append(output_filename)
-                if output_dir:
+                if debug_dir:
                     # Save each image to the debug folder
-                    debug_output_filename = os.path.join(output_dir, f'{base_name}_page_{i + 1}.jpg')
+                    os.makedirs(debug_dir, exist_ok=True)
+                    debug_output_filename = os.path.join(debug_dir, f'{base_name}_page_{i + 1}_debug.jpg')
                     image.save(debug_output_filename, 'JPEG')
+                    print(f"Debug image saved to: {debug_output_filename}")
             # Return the number of pages, the list of file paths, and the temporary directory
             return len(images), output_files, temp_dir
@@ -61,13 +63,13 @@ class PDFOptimizer(object):
 if __name__ == "__main__":
     pdf_optimizer = PDFOptimizer()
-    # output_directory = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
+    # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
     # # Ensure the output directory exists
     # os.makedirs(output_directory, exist_ok=True)
     #
     # # Split the optimized PDF into separate pages
     # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
-    #                                                                      output_directory,
+    #                                                                      debug_dir,
     #                                                                      True)
     #
     # print(f"Number of pages: {num_pages}")

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/processors/table_structure_processor.py RENAMED Viewed

@@ -131,7 +131,7 @@ class TableDetector(object):
                 cropped_tables.append(cropped_table)
                 if debug_dir:
-                    file_name_table = self.append_filename(file_path, debug_dir, f"cropped_{i + 1}")
+                    file_name_table = self.append_filename(file_path, debug_dir, f"table_cropped_{i + 1}")
                     cropped_table.save(file_name_table)
         else:
             if debug:
@@ -141,7 +141,7 @@ class TableDetector(object):
             cropped_tables.append(cropped_table)
             if debug_dir:
-                file_name_table = self.append_filename(file_path, debug_dir, "cropped")
+                file_name_table = self.append_filename(file_path, debug_dir, "table_cropped")
                 cropped_table.save(file_name_table)
         return cropped_tables

sparrow-parse-0.5.2/sparrow_parse/text_extraction.py ADDED Viewed

@@ -0,0 +1,30 @@
+from mlx_vlm import load, apply_chat_template, generate
+from mlx_vlm.utils import load_image
+# For test purposes, we will use a sample image
+# Load model and processor
+qwen_vl_model, qwen_vl_processor = load("mlx-community/Qwen2-VL-7B-Instruct-8bit")
+qwen_vl_config = qwen_vl_model.config
+image = load_image("images/graph.png")
+messages = [
+    {"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
+    {"role": "user", "content": "Extract the names, labels and y coordinates from the image."}
+]
+# Apply chat template
+prompt = apply_chat_template(qwen_vl_processor, qwen_vl_config, messages)
+# Generate text
+qwen_vl_output = generate(
+    qwen_vl_model,
+    qwen_vl_processor,
+    prompt,
+    image,
+    max_tokens=1000,
+    temperature=0.7,
+)
+print(qwen_vl_output)

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/mlx_inference.py RENAMED Viewed

@@ -112,8 +112,8 @@ class MLXInference(ModelInference):
             response = generate(
                 model,
                 processor,
-                image,
                 prompt,
+                image,
                 resize_shape=(width, height),
                 max_tokens=4000,
                 temperature=0.0,

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.5.0
+Version: 0.5.2
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -60,6 +60,7 @@ input_data = [
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
                                  generic_query=False,
+                                 crop_size=80,
                                  debug_dir=None,
                                  debug=True,
                                  mode=None)
@@ -71,6 +72,8 @@ print(f"Number of pages: {num_pages}")
 Use `tables_only=True` if you want to extract only tables.
+Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 Method `run_inference` will return results and number of pages processed.
@@ -95,7 +98,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
 pdf_optimizer = PDFOptimizer()
 num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
-                                                                     output_directory,
+                                                                     debug_dir,
                                                                      convert_to_images)
 ```
@@ -104,10 +107,30 @@ Example:
 *file_path* - `/data/invoice_1.pdf`
-*output_directory* - set to not `None`, for debug purposes only
+*debug_dir* - set to not `None`, for debug purposes only
 *convert_to_images* - default `False`, to split into PDF files
+## Image cropping
+```
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
+image_optimizer = ImageOptimizer()
+cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+```
+Example:
+*file_path* - `/data/invoice_1.jpg`
+*temp_dir* - directory to store cropped files
+*debug_dir* - set to not `None`, for debug purposes only
+*crop_size* - Number of pixels to crop from each border
 ## Library build
 Create Python virtual environment

{sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,7 @@ README.md
 setup.py
 sparrow_parse/__init__.py
 sparrow_parse/__main__.py
+sparrow_parse/text_extraction.py
 sparrow_parse.egg-info/PKG-INFO
 sparrow_parse.egg-info/SOURCES.txt
 sparrow_parse.egg-info/dependency_links.txt
@@ -11,6 +12,7 @@ sparrow_parse.egg-info/top_level.txt
 sparrow_parse/extractors/__init__.py
 sparrow_parse/extractors/vllm_extractor.py
 sparrow_parse/helpers/__init__.py
+sparrow_parse/helpers/image_optimizer.py
 sparrow_parse/helpers/pdf_optimizer.py
 sparrow_parse/processors/__init__.py
 sparrow_parse/processors/table_structure_processor.py