PyPI - sparrow-parse - Versions diffs - 0.5.5__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

sparrow-parse 0.5.5__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

sparrow_parse/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = '0.5.5'
1	+ __version__ = '1.0.2'

sparrow_parse/extractors/vllm_extractor.py CHANGED Viewed

@@ -147,6 +147,14 @@ class VLLMExtractor(object):
         table_detector = TableDetector()
         cropped_tables = table_detector.detect_tables(file_path, local=False, debug_dir=debug_dir, debug=debug)
         results_array = []
+        # Check if no tables were found
+        if cropped_tables is None:
+            if debug:
+                print(f"No tables detected in {file_path}")
+            # Return a structured no-tables-found response instead of failing
+            return [json.dumps({"message": "No tables detected in the document", "status": "empty"})]
         temp_dir = tempfile.mkdtemp()
         for i, table in enumerate(cropped_tables):
@@ -198,7 +206,7 @@ if __name__ == "__main__":
     # # export HF_TOKEN="hf_"
     # config = {
     #     "method": "mlx",  # Could be 'huggingface', 'mlx' or 'local_gpu'
-    #     "model_name": "mlx-community/Qwen2.5-VL-7B-Instruct-8bit",
+    #     "model_name": "mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit",
     #     # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
     #     # "hf_token": os.getenv('HF_TOKEN'),
     #     # Additional fields for local GPU inference
@@ -211,8 +219,8 @@ if __name__ == "__main__":
     #
     # input_data = [
     #     {
-    #         "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png",
-    #         "text_input": "retrieve document data. return response in JSON format"
+    #         "file_path": "sparrow_parse/images/bonds_table.png",
+    #         "text_input": "retrieve all data. return response in JSON format"
     #     }
     # ]
     #
@@ -226,4 +234,5 @@ if __name__ == "__main__":
     #
     # for i, result in enumerate(results_array):
     #     print(f"Result for page {i + 1}:", result)
-    # print(f"Number of pages: {num_pages}")
+    # print(f"Number of pages: {num_pages}")

sparrow_parse/text_extraction.py CHANGED Viewed

@@ -1,30 +1,35 @@
 from mlx_vlm import load, apply_chat_template, generate
 from mlx_vlm.utils import load_image
-# For test purposes, we will use a sample image
 # Load model and processor
-qwen_vl_model, qwen_vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
-qwen_vl_config = qwen_vl_model.config
+# vl_model, vl_processor = load("mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit")
+vl_model, vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
+vl_config = vl_model.config
-image = load_image("images/graph.png")
+image = load_image("images/bonds_table.png")
 messages = [
     {"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
-    {"role": "user", "content": "Extract the names, labels and y coordinates from the image."}
+    {"role": "user", "content": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"}
 ]
+# message = "retrieve all data. return response in JSON format"
+# message = "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
 # Apply chat template
-prompt = apply_chat_template(qwen_vl_processor, qwen_vl_config, messages)
+prompt = apply_chat_template(vl_processor, vl_config, messages)
+# prompt = apply_chat_template(vl_processor, vl_config, message)
 # Generate text
-qwen_vl_output = generate(
-    qwen_vl_model,
-    qwen_vl_processor,
+vl_output = generate(
+    vl_model,
+    vl_processor,
     prompt,
     image,
     max_tokens=1000,
-    temperature=0.7,
+    temperature=0,
+    verbose=False
 )
-print(qwen_vl_output)
+print(vl_output)

sparrow_parse/vllm/mlx_inference.py CHANGED Viewed

@@ -39,21 +39,40 @@ class MLXInference(ModelInference):
     def process_response(self, output_text):
         """
         Process and clean the model's raw output to format as JSON.
-        :param output_text: Raw output text from the model.
-        :return: A formatted JSON string or the original text in case of errors.
         """
         try:
-            cleaned_text = (
-                output_text.strip("[]'")
-                .replace("```json\n", "")
-                .replace("\n```", "")
-                .replace("'", "")
-            )
-            formatted_json = json.loads(cleaned_text)
+            # Check if we have markdown code block markers
+            if "```" in output_text:
+                # Handle markdown-formatted output
+                json_start = output_text.find("```json")
+                if json_start != -1:
+                    # Extract content between ```json and ```
+                    content = output_text[json_start + 7:]
+                    json_end = content.rfind("```")
+                    if json_end != -1:
+                        content = content[:json_end].strip()
+                        formatted_json = json.loads(content)
+                        return json.dumps(formatted_json, indent=2)
+            # Handle raw JSON (no markdown formatting)
+            # First try to find JSON array or object patterns
+            for pattern in [r'\[\s*\{.*\}\s*\]', r'\{.*\}']:
+                import re
+                matches = re.search(pattern, output_text, re.DOTALL)
+                if matches:
+                    potential_json = matches.group(0)
+                    try:
+                        formatted_json = json.loads(potential_json)
+                        return json.dumps(formatted_json, indent=2)
+                    except:
+                        pass
+            # Last resort: try to parse the whole text as JSON
+            formatted_json = json.loads(output_text.strip())
             return json.dumps(formatted_json, indent=2)
-        except json.JSONDecodeError as e:
-            print(f"Failed to parse JSON in MLX inference backend: {e}")
+        except Exception as e:
+            print(f"Failed to parse JSON: {e}")
             return output_text
@@ -102,10 +121,13 @@ class MLXInference(ModelInference):
             image, width, height = self.load_image_data(file_path)
             # Prepare messages for the chat model
-            messages = [
-                {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
-                {"role": "user", "content": input_data[0]["text_input"]},
-            ]
+            if "mistral" in self.model_name.lower():
+                messages = input_data[0]["text_input"]
+            else:
+                messages = [
+                    {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                    {"role": "user", "content": input_data[0]["text_input"]},
+                ]
             # Generate and process response
             prompt = apply_chat_template(processor, config, messages)  # Assuming defined

{sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,15 +1,13 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.5.5
+Version: 1.0.2
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
 Author-email: andrejus.baranovskis@gmail.com
-License: UNKNOWN
 Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Project-URL: Repository, https://github.com/katanaml/sparrow
 Keywords: llm,vllm,ocr,vision
-Platform: UNKNOWN
 Classifier: Operating System :: OS Independent
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Topic :: Software Development
@@ -17,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: rich
-Requires-Dist: transformers>=4.49.0
-Requires-Dist: torchvision>=0.21.0
-Requires-Dist: torch>=2.6.0
-Requires-Dist: sentence-transformers>=3.3.1
-Requires-Dist: numpy>=2.1.3
-Requires-Dist: pypdf>=5.2.0
-Requires-Dist: gradio-client>=1.7.2
-Requires-Dist: pdf2image>=1.17.0
-Requires-Dist: mlx>=0.23.1; sys_platform == "darwin" and platform_machine == "arm64"
-Requires-Dist: mlx-vlm==0.1.14; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: transformers >=4.50.1
+Requires-Dist: torchvision >=0.21.0
+Requires-Dist: torch >=2.6.0
+Requires-Dist: sentence-transformers >=4.0.0
+Requires-Dist: numpy >=2.2.4
+Requires-Dist: pypdf >=5.4.0
+Requires-Dist: gradio-client >=1.7.2
+Requires-Dist: pdf2image >=1.17.0
+Requires-Dist: mlx >=0.24.1 ; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: mlx-vlm ==0.1.21 ; sys_platform == "darwin" and platform_machine == "arm64"
 # Sparrow Parse
@@ -187,5 +185,3 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
 ## License
 Licensed under the GPL 3.0. Copyright 2020-2025 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).

{sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
-sparrow_parse/__init__.py,sha256=nrKeY2xA6SXRPdgHDxi2HLkFNpXRuW6MkqwC0reZpy8,21
+sparrow_parse/__init__.py,sha256=C8nyPP5-54GgYCcP38Lbel_pRimOW-Ra4bw6Vzp2lmE,21
 sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/text_extraction.py,sha256=JtUU7swvV12xBai5S9ICxWWWrUlkpZTZqvUnbz1h5Mk,834
+sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
 sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=Cf2sVgxDExj2ud4G6z9JnirVclTgPIEe9YSoCfTkW4k,9563
+sparrow_parse/extractors/vllm_extractor.py,sha256=uRSXzCQzjXujg1n1ozDitSPQoCfO435Nog7yO1IxWiU,9874
 sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
 sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
 sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
 sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
 sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
-sparrow_parse/vllm/mlx_inference.py,sha256=MUuW56f-aKnVmeMAATxKLxsovEMmp1qlgtlmW8J2C7M,4899
-sparrow_parse-0.5.5.dist-info/METADATA,sha256=l9s3Vi-5KVtryw0a1Z7AXqQHIKuUMbPQu0XNeilRofU,7254
-sparrow_parse-0.5.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-sparrow_parse-0.5.5.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
-sparrow_parse-0.5.5.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
-sparrow_parse-0.5.5.dist-info/RECORD,,
+sparrow_parse/vllm/mlx_inference.py,sha256=KjAftUIAWxYfctE3n1BKXA8jETM4WT3ESyx97eMA_8U,5954
+sparrow_parse-1.0.2.dist-info/METADATA,sha256=K4XNgj-PpegO8aLAe32aOZ3D8kh6lnMX0po2wXTxn-w,7229
+sparrow_parse-1.0.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+sparrow_parse-1.0.2.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
+sparrow_parse-1.0.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-1.0.2.dist-info/RECORD,,

{sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.45.1)
+Generator: bdist_wheel (0.41.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,3 +1,2 @@
 [console_scripts]
 sparrow-parse = sparrow_parse:main

{sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sparrow-parse 0.5.5__py3-none-any.whl → 1.0.2__py3-none-any.whl

sparrow-parse 0.5.5__py3-none-any.whl → 1.0.2__py3-none-any.whl