sparrow-parse 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/PKG-INFO +6 -2
  2. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/README.md +5 -1
  3. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.py +1 -1
  4. sparrow-parse-0.3.8/sparrow_parse/__init__.py +1 -0
  5. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/vllm_extractor.py +6 -11
  6. sparrow-parse-0.3.8/sparrow_parse/vllm/huggingface_inference.py +60 -0
  7. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/PKG-INFO +6 -2
  8. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/requires.txt +4 -1
  9. sparrow-parse-0.3.6/sparrow_parse/__init__.py +0 -1
  10. sparrow-parse-0.3.6/sparrow_parse/vllm/huggingface_inference.py +0 -40
  11. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.cfg +0 -0
  12. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/__main__.py +0 -0
  13. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/__init__.py +0 -0
  14. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/__init__.py +0 -0
  15. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  16. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/__init__.py +0 -0
  17. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/table_structure_processor.py +0 -0
  18. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/__init__.py +0 -0
  19. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_base.py +0 -0
  20. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_factory.py +0 -0
  21. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  22. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/SOURCES.txt +0 -0
  23. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  24. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/entry_points.txt +0 -0
  25. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.6
+ Version: 0.3.8
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
@@ -46,7 +46,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -55,6 +55,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
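
Pieced together from the fragments visible in this diff, the updated README flow looks roughly like the sketch below. The Space name and token are placeholders, and constructing `HuggingFaceInference` directly is just one plausible way to obtain `model_inference_instance` (the package also ships an `InferenceFactory`, unchanged in this release):

```python
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference

# Placeholder Space and token: katanaml/sparrow-qwen2-vl-7b is private, so
# this must point at a backend you deployed yourself on Hugging Face Spaces.
model_inference_instance = HuggingFaceInference(hf_space="your-user/your-qwen2-vl-space",
                                                hf_token="hf_...")
extractor = VLLMExtractor()

# Hypothetical input document and query.
input_data = [
    {
        "file_path": "/data/invoice_1.pdf",
        "text_input": "retrieve invoice data. return response in JSON format"
    }
]

# mode=None calls the real backend; mode="static" returns canned JSON instead.
results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                   debug_dir="/data/",
                                                   debug=True,
                                                   mode=None)

for i, result in enumerate(results_array):
    print(f"Result for page {i + 1}:", result)
print(f"Number of pages: {num_pages}")
```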
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:

  setup(
      name="sparrow-parse",
-     version="0.3.6",
+     version="0.3.8",
      author="Andrej Baranovskij",
      author_email="andrejus.baranovskis@gmail.com",
      description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
@@ -0,0 +1 @@
+ __version__ = '0.3.8'
@@ -30,21 +30,16 @@ class VLLMExtractor(object):
                                                   debug_dir,
                                                   True)

-            # Run inference on each page
-            for page_num, output_file in enumerate(output_files):
-                input_data[0]["file_path"] = output_file
-                if debug:
-                    print(f"Running inference on page {page_num + 1}...")
+            input_data[0]["file_path"] = output_files

-                # Run inference on the page
-                result = model_inference_instance.inference(input_data, mode)
-                results_array.append(result)
+            # Run inference on all pages in a single call
+            results_array = model_inference_instance.inference(input_data, mode)

            shutil.rmtree(temp_dir, ignore_errors=True)
            return results_array, num_pages

-        result = model_inference_instance.inference(input_data)
-        results_array.append(result)
+        input_data[0]["file_path"] = [input_data[0]["file_path"]]
+        results_array = model_inference_instance.inference(input_data)

        return results_array, 1

@@ -80,7 +75,7 @@ if __name__ == "__main__":
  # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
  #                                                    debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
  #                                                    debug=True,
- #                                                    mode="static")
+ #                                                    mode=None)
  #
  # for i, result in enumerate(results_array):
  #     print(f"Result for page {i + 1}:", result)
@@ -0,0 +1,60 @@
+ from gradio_client import Client, handle_file
+ from sparrow_parse.vllm.inference_base import ModelInference
+ import json
+ import os
+ import ast
+
+
+ class HuggingFaceInference(ModelInference):
+     def __init__(self, hf_space, hf_token):
+         self.hf_space = hf_space
+         self.hf_token = hf_token
+
+
+     def process_response(self, output_text):
+         json_string = output_text
+
+         json_string = json_string.strip("[]'")
+         json_string = json_string.replace("```json\n", "").replace("\n```", "")
+         json_string = json_string.replace("'", "")
+
+         try:
+             formatted_json = json.loads(json_string)
+             return json.dumps(formatted_json, indent=2)
+         except json.JSONDecodeError as e:
+             print("Failed to parse JSON:", e)
+             return output_text
+
+
+     def inference(self, input_data, mode=None):
+         if mode == "static":
+             simple_json = self.get_simple_json()
+             return [simple_json]
+
+         client = Client(self.hf_space, hf_token=self.hf_token)
+
+         # Extract and prepare the absolute paths for all file paths in input_data
+         file_paths = [
+             os.path.abspath(file_path)
+             for data in input_data
+             for file_path in data["file_path"]
+         ]
+
+         # Validate file existence and prepare files for the Gradio client
+         image_files = [handle_file(path) for path in file_paths if os.path.exists(path)]
+
+         results = client.predict(
+             input_imgs=image_files,
+             text_input=input_data[0]["text_input"],  # Single shared text input for all images
+             api_name="/run_inference"  # Specify the Gradio API endpoint
+         )
+
+         # Convert the string into a Python list
+         parsed_results = ast.literal_eval(results)
+
+         results_array = []
+         for page_output in parsed_results:
+             page_result = self.process_response(page_output)
+             results_array.append(page_result)
+
+         return results_array
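
To make the string cleanup in `process_response` concrete, here is a worked example with a made-up backend reply. Note that the blanket `replace("'", "")` also strips apostrophes occurring inside values, which is a limitation of this approach:

```python
import json

fence = "`" * 3  # markdown code fence, assembled here to keep the example readable

# Made-up raw output, fenced the way chat-style models often wrap JSON.
raw = fence + "json\n{\"invoice_number\": \"INV-123\", \"total\": \"100.00\"}\n" + fence

# Mirror the steps in process_response above.
cleaned = raw.strip("[]'")
cleaned = cleaned.replace(fence + "json\n", "").replace("\n" + fence, "")
cleaned = cleaned.replace("'", "")

print(json.dumps(json.loads(cleaned), indent=2))
# {
#   "invoice_number": "INV-123",
#   "total": "100.00"
# }
```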
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.6
+ Version: 0.3.8
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
@@ -1,8 +1,11 @@
  rich
- transformers==4.41.2
+ transformers==4.45.1
  sentence-transformers==3.0.1
  numpy==1.26.4
  pypdf==4.3.0
  easyocr==1.7.1
  gradio_client
  pdf2image
+
+ [:sys_platform == "darwin" and platform_machine == "arm64"]
+ mlx-vlm==0.1.1
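
The bracketed `[:marker]` section is the notation setuptools uses in `requires.txt` for dependencies gated by a PEP 508 environment marker; here it restricts `mlx-vlm` to Apple Silicon macOS. As an ordinary requirements line, the same constraint would read (equivalent form, not taken from the package):

```
mlx-vlm==0.1.1; sys_platform == "darwin" and platform_machine == "arm64"
```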
@@ -1 +0,0 @@
- __version__ = '0.3.6'
@@ -1,40 +0,0 @@
- from gradio_client import Client, handle_file
- from sparrow_parse.vllm.inference_base import ModelInference
- import json
-
-
- class HuggingFaceInference(ModelInference):
-     def __init__(self, hf_space, hf_token):
-         self.hf_space = hf_space
-         self.hf_token = hf_token
-
-
-     def process_response(self, output_text):
-         json_string = output_text
-
-         json_string = json_string.strip("[]'")
-         json_string = json_string.replace("```json\n", "").replace("\n```", "")
-         json_string = json_string.replace("'", "")
-
-         try:
-             formatted_json = json.loads(json_string)
-             return json.dumps(formatted_json, indent=2)
-         except json.JSONDecodeError as e:
-             print("Failed to parse JSON:", e)
-             return output_text
-
-
-     def inference(self, input_data, mode=None):
-         if mode == "static":
-             simple_json = self.get_simple_json()
-             return simple_json
-
-         client = Client(self.hf_space, hf_token=self.hf_token)
-
-         result = client.predict(
-             image=handle_file(input_data[0]["file_path"]),
-             text_input=input_data[0]["text_input"],
-             api_name="/run_inference"
-         )
-
-         return self.process_response(result)