sparrow-parse 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/PKG-INFO +6 -2
  2. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/README.md +5 -1
  3. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.py +1 -1
  4. sparrow-parse-0.3.8/sparrow_parse/__init__.py +1 -0
  5. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/vllm_extractor.py +6 -11
  6. sparrow-parse-0.3.8/sparrow_parse/vllm/huggingface_inference.py +60 -0
  7. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/PKG-INFO +6 -2
  8. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/requires.txt +4 -1
  9. sparrow-parse-0.3.6/sparrow_parse/__init__.py +0 -1
  10. sparrow-parse-0.3.6/sparrow_parse/vllm/huggingface_inference.py +0 -40
  11. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.cfg +0 -0
  12. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/__main__.py +0 -0
  13. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/__init__.py +0 -0
  14. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/__init__.py +0 -0
  15. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  16. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/__init__.py +0 -0
  17. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/table_structure_processor.py +0 -0
  18. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/__init__.py +0 -0
  19. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_base.py +0 -0
  20. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_factory.py +0 -0
  21. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  22. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/SOURCES.txt +0 -0
  23. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  24. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/entry_points.txt +0 -0
  25. {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.6
+ Version: 0.3.8
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
@@ -46,7 +46,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -55,6 +55,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
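
Pieced together from the fragments visible in this diff, the updated README flow looks roughly like the sketch below. The Space name and token are placeholders, and constructing `HuggingFaceInference` directly is just one plausible way to obtain `model_inference_instance` (the package also ships an `InferenceFactory`, unchanged in this release):

```python
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference

# Placeholder Space and token: katanaml/sparrow-qwen2-vl-7b is private, so
# this must point at a backend you deployed yourself on Hugging Face Spaces.
model_inference_instance = HuggingFaceInference(hf_space="your-user/your-qwen2-vl-space",
                                                hf_token="hf_...")
extractor = VLLMExtractor()

# Hypothetical input document and query.
input_data = [
    {
        "file_path": "/data/invoice_1.pdf",
        "text_input": "retrieve invoice data. return response in JSON format"
    }
]

# mode=None calls the real backend; mode="static" returns canned JSON instead.
results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                   debug_dir="/data/",
                                                   debug=True,
                                                   mode=None)

for i, result in enumerate(results_array):
    print(f"Result for page {i + 1}:", result)
print(f"Number of pages: {num_pages}")
```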
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:

  setup(
      name="sparrow-parse",
-     version="0.3.6",
+     version="0.3.8",
      author="Andrej Baranovskij",
      author_email="andrejus.baranovskis@gmail.com",
      description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
@@ -0,0 +1 @@
+ __version__ = '0.3.8'
@@ -30,21 +30,16 @@ class VLLMExtractor(object):
                                                   debug_dir,
                                                   True)

-            # Run inference on each page
-            for page_num, output_file in enumerate(output_files):
-                input_data[0]["file_path"] = output_file
-                if debug:
-                    print(f"Running inference on page {page_num + 1}...")
+            input_data[0]["file_path"] = output_files

-                # Run inference on the page
-                result = model_inference_instance.inference(input_data, mode)
-                results_array.append(result)
+            # Run inference on all pages in a single call
+            results_array = model_inference_instance.inference(input_data, mode)

            shutil.rmtree(temp_dir, ignore_errors=True)
            return results_array, num_pages

-        result = model_inference_instance.inference(input_data)
-        results_array.append(result)
+        input_data[0]["file_path"] = [input_data[0]["file_path"]]
+        results_array = model_inference_instance.inference(input_data)

        return results_array, 1

@@ -80,7 +75,7 @@ if __name__ == "__main__":
  # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
  #                                                    debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
  #                                                    debug=True,
- #                                                    mode="static")
+ #                                                    mode=None)
  #
  # for i, result in enumerate(results_array):
  #     print(f"Result for page {i + 1}:", result)
@@ -0,0 +1,60 @@
+ from gradio_client import Client, handle_file
+ from sparrow_parse.vllm.inference_base import ModelInference
+ import json
+ import os
+ import ast
+
+
+ class HuggingFaceInference(ModelInference):
+     def __init__(self, hf_space, hf_token):
+         self.hf_space = hf_space
+         self.hf_token = hf_token
+
+
+     def process_response(self, output_text):
+         json_string = output_text
+
+         json_string = json_string.strip("[]'")
+         json_string = json_string.replace("```json\n", "").replace("\n```", "")
+         json_string = json_string.replace("'", "")
+
+         try:
+             formatted_json = json.loads(json_string)
+             return json.dumps(formatted_json, indent=2)
+         except json.JSONDecodeError as e:
+             print("Failed to parse JSON:", e)
+             return output_text
+
+
+     def inference(self, input_data, mode=None):
+         if mode == "static":
+             simple_json = self.get_simple_json()
+             return [simple_json]
+
+         client = Client(self.hf_space, hf_token=self.hf_token)
+
+         # Extract and prepare the absolute paths for all file paths in input_data
+         file_paths = [
+             os.path.abspath(file_path)
+             for data in input_data
+             for file_path in data["file_path"]
+         ]
+
+         # Validate file existence and prepare files for the Gradio client
+         image_files = [handle_file(path) for path in file_paths if os.path.exists(path)]
+
+         results = client.predict(
+             input_imgs=image_files,
+             text_input=input_data[0]["text_input"],  # Single shared text input for all images
+             api_name="/run_inference"  # Specify the Gradio API endpoint
+         )
+
+         # Convert the string into a Python list
+         parsed_results = ast.literal_eval(results)
+
+         results_array = []
+         for page_output in parsed_results:
+             page_result = self.process_response(page_output)
+             results_array.append(page_result)
+
+         return results_array
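
To make the string cleanup in `process_response` concrete, here is a worked example with a made-up backend reply. Note that the blanket `replace("'", "")` also strips apostrophes occurring inside values, which is a limitation of this approach:

```python
import json

fence = "`" * 3  # markdown code fence, assembled here to keep the example readable

# Made-up raw output, fenced the way chat-style models often wrap JSON.
raw = fence + "json\n{\"invoice_number\": \"INV-123\", \"total\": \"100.00\"}\n" + fence

# Mirror the steps in process_response above.
cleaned = raw.strip("[]'")
cleaned = cleaned.replace(fence + "json\n", "").replace("\n" + fence, "")
cleaned = cleaned.replace("'", "")

print(json.dumps(json.loads(cleaned), indent=2))
# {
#   "invoice_number": "INV-123",
#   "total": "100.00"
# }
```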
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.6
+ Version: 0.3.8
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                     debug_dir="/data/",
                                                     debug=True,
-                                                    mode="static")
+                                                    mode=None)

  for i, result in enumerate(results_array):
      print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")

  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.

+ The `run_inference` method returns the results and the number of pages processed.
+
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private; to run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
  ## PDF pre-processing

  ```
@@ -1,8 +1,11 @@
  rich
- transformers==4.41.2
+ transformers==4.45.1
  sentence-transformers==3.0.1
  numpy==1.26.4
  pypdf==4.3.0
  easyocr==1.7.1
  gradio_client
  pdf2image
+
+ [:sys_platform == "darwin" and platform_machine == "arm64"]
+ mlx-vlm==0.1.1
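
The bracketed `[:marker]` section is the notation setuptools uses in `requires.txt` for dependencies gated by a PEP 508 environment marker; here it restricts `mlx-vlm` to Apple Silicon macOS. As an ordinary requirements line, the same constraint would read (equivalent form, not taken from the package):

```
mlx-vlm==0.1.1; sys_platform == "darwin" and platform_machine == "arm64"
```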
@@ -1 +0,0 @@
- __version__ = '0.3.6'
@@ -1,40 +0,0 @@
- from gradio_client import Client, handle_file
- from sparrow_parse.vllm.inference_base import ModelInference
- import json
-
-
- class HuggingFaceInference(ModelInference):
-     def __init__(self, hf_space, hf_token):
-         self.hf_space = hf_space
-         self.hf_token = hf_token
-
-
-     def process_response(self, output_text):
-         json_string = output_text
-
-         json_string = json_string.strip("[]'")
-         json_string = json_string.replace("```json\n", "").replace("\n```", "")
-         json_string = json_string.replace("'", "")
-
-         try:
-             formatted_json = json.loads(json_string)
-             return json.dumps(formatted_json, indent=2)
-         except json.JSONDecodeError as e:
-             print("Failed to parse JSON:", e)
-             return output_text
-
-
-     def inference(self, input_data, mode=None):
-         if mode == "static":
-             simple_json = self.get_simple_json()
-             return simple_json
-
-         client = Client(self.hf_space, hf_token=self.hf_token)
-
-         result = client.predict(
-             image=handle_file(input_data[0]["file_path"]),
-             text_input=input_data[0]["text_input"],
-             api_name="/run_inference"
-         )
-
-         return self.process_response(result)