sparrow-parse 0.3.8__tar.gz → 0.3.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/PKG-INFO +17 -11
  2. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/README.md +16 -10
  3. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/setup.py +1 -1
  4. sparrow-parse-0.3.10/sparrow_parse/__init__.py +1 -0
  5. sparrow-parse-0.3.10/sparrow_parse/extractors/vllm_extractor.py +87 -0
  6. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/inference_factory.py +3 -0
  7. sparrow-parse-0.3.10/sparrow_parse/vllm/mlx_inference.py +135 -0
  8. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/PKG-INFO +17 -11
  9. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/SOURCES.txt +2 -1
  10. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/requires.txt +4 -4
  11. sparrow-parse-0.3.8/sparrow_parse/__init__.py +0 -1
  12. sparrow-parse-0.3.8/sparrow_parse/extractors/vllm_extractor.py +0 -82
  13. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/setup.cfg +0 -0
  14. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/__main__.py +0 -0
  15. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/extractors/__init__.py +0 -0
  16. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/helpers/__init__.py +0 -0
  17. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  18. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/processors/__init__.py +0 -0
  19. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/processors/table_structure_processor.py +0 -0
  20. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/__init__.py +0 -0
  21. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/huggingface_inference.py +0 -0
  22. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/inference_base.py +0 -0
  23. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  24. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  25. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/entry_points.txt +0 -0
  26. {sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/top_level.txt +0 -0
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.8
+Version: 0.3.10
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -31,7 +31,7 @@ pip install sparrow-parse
 
 ## Parsing and extraction
 
-### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
+### Sparrow Parse VL (vision-language model) extractor with local MLX or Hugging Face Cloud GPU infra
 
 ```
 # run locally: python -m sparrow_parse.extractors.vllm_extractor
@@ -41,13 +41,9 @@ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
 
 extractor = VLLMExtractor()
 
-# export HF_TOKEN="hf_"
 config = {
-    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
-    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
-    "hf_token": os.getenv('HF_TOKEN'),
-    # Additional fields for local GPU inference
-    # "device": "cuda", "model_path": "model.pth"
+    "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
+    "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
 }
 
 # Use the factory to get the correct instance
@@ -56,14 +52,14 @@ model_inference_instance = factory.get_inference_instance()
 
 input_data = [
     {
-        "file_path": "/data/oracle_10k_2014_q1_small.pdf",
-        "text_input": "retrieve {"table": [{"description": "str", "latest_amount": 0, "previous_amount": 0}]}. return response in JSON format"
+        "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.jpg",
+        "text_input": "retrieve all data. return response in JSON format"
     }
 ]
 
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
-                                                    debug_dir="/data/",
+                                                    debug_dir=None,
                                                     debug=True,
                                                     mode=None)
 
@@ -76,6 +72,16 @@ Use `mode="static"` if you want to simulate LLM call, without executing LLM back
 
 Method `run_inference` will return results and number of pages processed.
 
+To run with Hugging Face backend use these config values:
+
+```
+config = {
+    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    "hf_token": os.getenv('HF_TOKEN'),
+}
+```
+
 Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
 
 ## PDF pre-processing
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/README.md

@@ -12,7 +12,7 @@ pip install sparrow-parse
 
 ## Parsing and extraction
 
-### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
+### Sparrow Parse VL (vision-language model) extractor with local MLX or Hugging Face Cloud GPU infra
 
 ```
 # run locally: python -m sparrow_parse.extractors.vllm_extractor
@@ -22,13 +22,9 @@ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
 
 extractor = VLLMExtractor()
 
-# export HF_TOKEN="hf_"
 config = {
-    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
-    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
-    "hf_token": os.getenv('HF_TOKEN'),
-    # Additional fields for local GPU inference
-    # "device": "cuda", "model_path": "model.pth"
+    "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
+    "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
 }
 
 # Use the factory to get the correct instance
@@ -37,14 +33,14 @@ model_inference_instance = factory.get_inference_instance()
 
 input_data = [
     {
-        "file_path": "/data/oracle_10k_2014_q1_small.pdf",
-        "text_input": "retrieve {"table": [{"description": "str", "latest_amount": 0, "previous_amount": 0}]}. return response in JSON format"
+        "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.jpg",
+        "text_input": "retrieve all data. return response in JSON format"
     }
 ]
 
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
-                                                    debug_dir="/data/",
+                                                    debug_dir=None,
                                                     debug=True,
                                                     mode=None)
 
@@ -57,6 +53,16 @@ Use `mode="static"` if you want to simulate LLM call, without executing LLM back
 
 Method `run_inference` will return results and number of pages processed.
 
+To run with Hugging Face backend use these config values:
+
+```
+config = {
+    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    "hf_token": os.getenv('HF_TOKEN'),
+}
+```
+
 Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
 
 ## PDF pre-processing
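For reference, the MLX snippets introduced in the README above assemble into the following self-contained script. This is a sketch, not part of the diff: it assumes an Apple Silicon machine (the `mlx-vlm` dependency is only installed on darwin/arm64), and the image path is a placeholder to replace with your own file.

```
# Sketch assembled from the README snippets above; the image path is a placeholder.
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
from sparrow_parse.vllm.inference_factory import InferenceFactory

extractor = VLLMExtractor()

config = {
    "method": "mlx",  # Could be 'huggingface', 'mlx' or 'local_gpu'
    "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
}

factory = InferenceFactory(config)
model_inference_instance = factory.get_inference_instance()

input_data = [
    {
        "file_path": "bonds_table.jpg",  # placeholder path
        "text_input": "retrieve all data. return response in JSON format",
    }
]

results_array, num_pages = extractor.run_inference(
    model_inference_instance, input_data,
    generic_query=False, debug_dir=None, debug=True, mode=None,
)

for i, result in enumerate(results_array):
    print(f"Result for page {i + 1}:", result)
print(f"Number of pages: {num_pages}")
```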
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/setup.py

@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
 
 setup(
     name="sparrow-parse",
-    version="0.3.8",
+    version="0.3.10",
     author="Andrej Baranovskij",
     author_email="andrejus.baranovskis@gmail.com",
     description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
sparrow-parse-0.3.10/sparrow_parse/__init__.py

@@ -0,0 +1 @@
+__version__ = '0.3.10'
sparrow-parse-0.3.10/sparrow_parse/extractors/vllm_extractor.py

@@ -0,0 +1,87 @@
+from sparrow_parse.vllm.inference_factory import InferenceFactory
+from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
+from rich import print
+import os
+import shutil
+
+
+class VLLMExtractor(object):
+    def __init__(self):
+        pass
+
+    def run_inference(self, model_inference_instance, input_data,
+                      generic_query=False, debug_dir=None, debug=False, mode=None):
+        # Modify input for generic queries
+        if generic_query:
+            input_data[0]["text_input"] = "retrieve document data. return response in JSON format"
+
+        if debug:
+            print("Input Data:", input_data)
+
+        # Check if the input file is a PDF
+        file_path = input_data[0]["file_path"]
+        if self.is_pdf(file_path):
+            return self._process_pdf(model_inference_instance, input_data, debug_dir, mode)
+
+        # Default processing for non-PDF files
+        input_data[0]["file_path"] = [file_path]
+        results_array = model_inference_instance.inference(input_data)
+        return results_array, 1
+
+
+    def _process_pdf(self, model_inference_instance, input_data, debug_dir, mode):
+        """Handles processing and inference for PDF files."""
+        pdf_optimizer = PDFOptimizer()
+        num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
+                                                                             debug_dir,
+                                                                             True)
+        # Update file paths for PDF pages
+        input_data[0]["file_path"] = output_files
+
+        # Run inference on PDF pages
+        results_array = model_inference_instance.inference(input_data, mode)
+
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        return results_array, num_pages
+
+    @staticmethod
+    def is_pdf(file_path):
+        """Checks if a file is a PDF based on its extension."""
+        return file_path.lower().endswith('.pdf')
+
+if __name__ == "__main__":
+    # run locally: python -m sparrow_parse.extractors.vllm_extractor
+
+    extractor = VLLMExtractor()
+
+    # # export HF_TOKEN="hf_"
+    # config = {
+    #     "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
+    #     "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
+    #     # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    #     # "hf_token": os.getenv('HF_TOKEN'),
+    #     # Additional fields for local GPU inference
+    #     # "device": "cuda", "model_path": "model.pth"
+    # }
+    #
+    # # Use the factory to get the correct instance
+    # factory = InferenceFactory(config)
+    # model_inference_instance = factory.get_inference_instance()
+    #
+    # input_data = [
+    #     {
+    #         "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png",
+    #         "text_input": "retrieve all data. return response in JSON format"
+    #     }
+    # ]
+    #
+    # # Now you can run inference without knowing which implementation is used
+    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
+    #                                                    debug_dir=None,
+    #                                                    debug=True,
+    #                                                    mode=None)
+    #
+    # for i, result in enumerate(results_array):
+    #     print(f"Result for page {i + 1}:", result)
+    # print(f"Number of pages: {num_pages}")
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse/vllm/inference_factory.py

@@ -1,5 +1,6 @@
 from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference
 from sparrow_parse.vllm.local_gpu_inference import LocalGPUInference
+from sparrow_parse.vllm.mlx_inference import MLXInference
 
 
 class InferenceFactory:
@@ -12,6 +13,8 @@ class InferenceFactory:
         elif self.config["method"] == "local_gpu":
             model = self._load_local_model()  # Replace with actual model loading logic
             return LocalGPUInference(model=model, device=self.config.get("device", "cuda"))
+        elif self.config["method"] == "mlx":
+            return MLXInference(model_name=self.config["model_name"])
         else:
            raise ValueError(f"Unknown method: {self.config['method']}")
 
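Assuming the branches above live in `get_inference_instance` (as the README snippet suggests), the new `"mlx"` method only needs a `"model_name"` entry, and anything unrecognised still raises `ValueError`. A small sketch of the error path, on a machine where `mlx-vlm` is installed so the module imports cleanly:

```
# Sketch only: building the MLX backend downloads/loads the model, so the happy
# path is shown in the README example above; here just the error path.
from sparrow_parse.vllm.inference_factory import InferenceFactory

try:
    InferenceFactory({"method": "does-not-exist"}).get_inference_instance()
except ValueError as e:
    print(e)  # Unknown method: does-not-exist
```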
sparrow-parse-0.3.10/sparrow_parse/vllm/mlx_inference.py

@@ -0,0 +1,135 @@
+from mlx_vlm import load, generate
+from mlx_vlm.prompt_utils import apply_chat_template
+from mlx_vlm.utils import load_image
+from sparrow_parse.vllm.inference_base import ModelInference
+import os
+import json
+
+
+class MLXInference(ModelInference):
+    """
+    A class for performing inference using the MLX model.
+    Handles image preprocessing, response formatting, and model interaction.
+    """
+
+    def __init__(self, model_name):
+        """
+        Initialize the inference class with the given model name and load the model once.
+
+        :param model_name: Name of the model to load.
+        """
+        self.model, self.processor = self._load_model_and_processor(model_name)
+        self.config = self.model.config
+
+        print(f"Loaded model: {model_name}")
+
+
+    @staticmethod
+    def _load_model_and_processor(model_name):
+        """
+        Load the model and processor for inference.
+
+        :param model_name: Name of the model to load.
+        :return: Tuple containing the loaded model and processor.
+        """
+        return load(model_name)
+
+
+    def process_response(self, output_text):
+        """
+        Process and clean the model's raw output to format as JSON.
+
+        :param output_text: Raw output text from the model.
+        :return: A formatted JSON string or the original text in case of errors.
+        """
+        try:
+            cleaned_text = (
+                output_text.strip("[]'")
+                .replace("```json\n", "")
+                .replace("\n```", "")
+                .replace("'", "")
+            )
+            formatted_json = json.loads(cleaned_text)
+            return json.dumps(formatted_json, indent=2)
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse JSON: {e}")
+            return output_text
+
+
+    def load_image_data(self, image_filepath, max_width=1250, max_height=1750):
+        """
+        Load and resize image while maintaining its aspect ratio.
+
+        :param image_filepath: Path to the image file.
+        :param max_width: Maximum allowed width of the image.
+        :param max_height: Maximum allowed height of the image.
+        :return: Tuple containing the image object and its new dimensions.
+        """
+        image = load_image(image_filepath)  # Assuming load_image is defined elsewhere
+        width, height = image.size
+
+        # Calculate new dimensions while maintaining the aspect ratio
+        if width > max_width or height > max_height:
+            aspect_ratio = width / height
+            new_width = min(max_width, int(max_height * aspect_ratio))
+            new_height = min(max_height, int(max_width / aspect_ratio))
+            return image, new_width, new_height
+
+        return image, width, height
+
+
+    def inference(self, input_data, mode=None):
+        """
+        Perform inference on input data using the specified model.
+
+        :param input_data: A list of dictionaries containing image file paths and text inputs.
+        :param mode: Optional mode for inference ("static" for simple JSON output).
+        :return: List of processed model responses.
+        """
+        if mode == "static":
+            return [self.get_simple_json()]
+
+        # Prepare absolute file paths
+        file_paths = self._extract_file_paths(input_data)
+
+        results = []
+        for file_path in file_paths:
+            image, width, height = self.load_image_data(file_path)
+
+            # Prepare messages for the chat model
+            messages = [
+                {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                {"role": "user", "content": input_data[0]["text_input"]},
+            ]
+
+            # Generate and process response
+            prompt = apply_chat_template(self.processor, self.config, messages)  # Assuming defined
+            response = generate(
+                self.model,
+                self.processor,
+                image,
+                prompt,
+                resize_shape=(width, height),
+                max_tokens=4000,
+                temperature=0.0,
+                verbose=False
+            )
+            results.append(self.process_response(response))
+
+            print("Inference completed successfully for: ", file_path)
+
+        return results
+
+    @staticmethod
+    def _extract_file_paths(input_data):
+        """
+        Extract and resolve absolute file paths from input data.
+
+        :param input_data: List of dictionaries containing image file paths.
+        :return: List of absolute file paths.
+        """
+        return [
+            os.path.abspath(file_path)
+            for data in input_data
+            for file_path in data.get("file_path", [])
+        ]
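The resize rule in `load_image_data` above caps pages at 1250x1750 by default while keeping the aspect ratio, and the resulting dimensions are passed to `generate()` as `resize_shape`. A standalone restatement of just that arithmetic makes the behaviour easy to check; `fit_within` is a hypothetical helper, not library code.

```
# Restates the resize arithmetic from load_image_data; fit_within is a hypothetical helper.
def fit_within(width, height, max_width=1250, max_height=1750):
    if width > max_width or height > max_height:
        aspect_ratio = width / height
        new_width = min(max_width, int(max_height * aspect_ratio))
        new_height = min(max_height, int(max_width / aspect_ratio))
        return new_width, new_height
    return width, height


print(fit_within(2500, 1250))  # (1250, 625)  - wide scan shrunk, 2:1 ratio kept
print(fit_within(1000, 2000))  # (875, 1750)  - tall page shrunk, 1:2 ratio kept
print(fit_within(800, 600))    # (800, 600)   - already within bounds, untouched
```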
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.8
+Version: 0.3.10
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -31,7 +31,7 @@ pip install sparrow-parse
 
 ## Parsing and extraction
 
-### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
+### Sparrow Parse VL (vision-language model) extractor with local MLX or Hugging Face Cloud GPU infra
 
 ```
 # run locally: python -m sparrow_parse.extractors.vllm_extractor
@@ -41,13 +41,9 @@ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
 
 extractor = VLLMExtractor()
 
-# export HF_TOKEN="hf_"
 config = {
-    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
-    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
-    "hf_token": os.getenv('HF_TOKEN'),
-    # Additional fields for local GPU inference
-    # "device": "cuda", "model_path": "model.pth"
+    "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
+    "model_name": "mlx-community/Qwen2-VL-72B-Instruct-4bit",
 }
 
 # Use the factory to get the correct instance
@@ -56,14 +52,14 @@ model_inference_instance = factory.get_inference_instance()
 
 input_data = [
     {
-        "file_path": "/data/oracle_10k_2014_q1_small.pdf",
-        "text_input": "retrieve {"table": [{"description": "str", "latest_amount": 0, "previous_amount": 0}]}. return response in JSON format"
+        "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.jpg",
+        "text_input": "retrieve all data. return response in JSON format"
     }
 ]
 
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
-                                                    debug_dir="/data/",
+                                                    debug_dir=None,
                                                     debug=True,
                                                     mode=None)
 
@@ -76,6 +72,16 @@ Use `mode="static"` if you want to simulate LLM call, without executing LLM back
 
 Method `run_inference` will return results and number of pages processed.
 
+To run with Hugging Face backend use these config values:
+
+```
+config = {
+    "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+    "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+    "hf_token": os.getenv('HF_TOKEN'),
+}
+```
+
 Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
 
 ## PDF pre-processing
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/SOURCES.txt

@@ -18,4 +18,5 @@ sparrow_parse/vllm/__init__.py
 sparrow_parse/vllm/huggingface_inference.py
 sparrow_parse/vllm/inference_base.py
 sparrow_parse/vllm/inference_factory.py
-sparrow_parse/vllm/local_gpu_inference.py
+sparrow_parse/vllm/local_gpu_inference.py
+sparrow_parse/vllm/mlx_inference.py
{sparrow-parse-0.3.8 → sparrow-parse-0.3.10}/sparrow_parse.egg-info/requires.txt

@@ -1,11 +1,11 @@
 rich
-transformers==4.45.1
-sentence-transformers==3.0.1
-numpy==1.26.4
+transformers==4.46.3
+sentence-transformers==3.3.1
+numpy==2.1.3
 pypdf==4.3.0
 easyocr==1.7.1
 gradio_client
 pdf2image
 
 [:sys_platform == "darwin" and platform_machine == "arm64"]
-mlx-vlm==0.1.1
+mlx-vlm==0.1.3
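The `[:sys_platform == "darwin" and platform_machine == "arm64"]` section means `mlx-vlm` is only pulled in on Apple Silicon macOS; the other pins (transformers 4.46.3, sentence-transformers 3.3.1, numpy 2.1.3) apply everywhere. As an illustration only (this is not the package's actual setup.py), the same gate is commonly written with a PEP 508 environment marker:

```
# Illustration only - not sparrow-parse's setup.py. Shows the PEP 508 marker
# equivalent of the platform-gated section in requires.txt above.
from setuptools import setup

setup(
    name="example-package",
    install_requires=[
        "transformers==4.46.3",
        "sentence-transformers==3.3.1",
        "numpy==2.1.3",
        'mlx-vlm==0.1.3; sys_platform == "darwin" and platform_machine == "arm64"',
    ],
)
```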
sparrow-parse-0.3.8/sparrow_parse/__init__.py

@@ -1 +0,0 @@
-__version__ = '0.3.8'
sparrow-parse-0.3.8/sparrow_parse/extractors/vllm_extractor.py

@@ -1,82 +0,0 @@
-from sparrow_parse.vllm.inference_factory import InferenceFactory
-from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
-from rich import print
-import os
-import shutil
-
-
-class VLLMExtractor(object):
-    def __init__(self):
-        pass
-
-    def run_inference(self,
-                      model_inference_instance,
-                      input_data,
-                      generic_query=False,
-                      debug_dir=None,
-                      debug=False,
-                      mode=None):
-        if generic_query:
-            input_data[0]["text_input"] = "retrieve document data. return response in JSON format"
-
-        if debug:
-            print("Input Data:", input_data)
-
-        results_array = []
-
-        if self.is_pdf(input_data[0]["file_path"]):
-            pdf_optimizer = PDFOptimizer()
-            num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
-                                                                                 debug_dir,
-                                                                                 True)
-
-            input_data[0]["file_path"] = output_files
-
-            # Run inference on the page
-            results_array = model_inference_instance.inference(input_data, mode)
-
-            shutil.rmtree(temp_dir, ignore_errors=True)
-            return results_array, num_pages
-
-        input_data[0]["file_path"] = [input_data[0]["file_path"]]
-        results_array = model_inference_instance.inference(input_data)
-
-        return results_array, 1
-
-    def is_pdf(self, file_path):
-        return file_path.lower().endswith('.pdf')
-
-if __name__ == "__main__":
-    # run locally: python -m sparrow_parse.extractors.vllm_extractor
-
-    extractor = VLLMExtractor()
-
-    # # export HF_TOKEN="hf_"
-    # config = {
-    #     "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
-    #     "hf_space": "katanaml/sparrow-qwen2-vl-7b",
-    #     "hf_token": os.getenv('HF_TOKEN'),
-    #     # Additional fields for local GPU inference
-    #     # "device": "cuda", "model_path": "model.pth"
-    # }
-    #
-    # # Use the factory to get the correct instance
-    # factory = InferenceFactory(config)
-    # model_inference_instance = factory.get_inference_instance()
-    #
-    # input_data = [
-    #     {
-    #         "file_path": "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
-    #         "text_input": "retrieve {\"table\": [{\"description\": \"str\", \"latest_amount\": 0, \"previous_amount\": 0}]}. return response in JSON format"
-    #     }
-    # ]
-    #
-    # # Now you can run inference without knowing which implementation is used
-    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
-    #                                                    debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
-    #                                                    debug=True,
-    #                                                    mode=None)
-    #
-    # for i, result in enumerate(results_array):
-    #     print(f"Result for page {i + 1}:", result)
-    # print(f"Number of pages: {num_pages}")
The remaining 14 files listed above are unchanged between the two versions.