sparrow-parse 0.3.6.tar.gz → 0.3.8.tar.gz
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/PKG-INFO +6 -2
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/README.md +5 -1
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.py +1 -1
- sparrow-parse-0.3.8/sparrow_parse/__init__.py +1 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/vllm_extractor.py +6 -11
- sparrow-parse-0.3.8/sparrow_parse/vllm/huggingface_inference.py +60 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/PKG-INFO +6 -2
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/requires.txt +4 -1
- sparrow-parse-0.3.6/sparrow_parse/__init__.py +0 -1
- sparrow-parse-0.3.6/sparrow_parse/vllm/huggingface_inference.py +0 -40
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.cfg +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/__main__.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/__init__.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/__init__.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/__init__.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/processors/table_structure_processor.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/__init__.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_base.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/inference_factory.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/SOURCES.txt +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/dependency_links.txt +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/entry_points.txt +0 -0
- {sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/top_level.txt +0 -0
{sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.6
+Version: 0.3.8
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                    debug_dir="/data/",
                                                    debug=True,
-                                                   mode=
+                                                   mode=None)
 
 for i, result in enumerate(results_array):
     print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")
 
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 
+Method `run_inference` will return results and number of pages processed.
+
+Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
 ## PDF pre-processing
 
 ```
````
{sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/README.md

````diff
@@ -46,7 +46,7 @@ input_data = [
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                    debug_dir="/data/",
                                                    debug=True,
-                                                   mode=
+                                                   mode=None)
 
 for i, result in enumerate(results_array):
     print(f"Result for page {i + 1}:", result)
@@ -55,6 +55,10 @@ print(f"Number of pages: {num_pages}")
 
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 
+Method `run_inference` will return results and number of pages processed.
+
+Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
 ## PDF pre-processing
 
 ```
````
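For orientation, here is a minimal end-to-end sketch of the call documented above, assembled from the pieces this diff shows (`VLLMExtractor`, the `vllm/inference_factory.py` module from the file list, and the `file_path`/`text_input` keys used by the new backend). The config keys, Space name, and paths are illustrative assumptions, not values taken from this diff:

```python
import os

from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
from sparrow_parse.vllm.inference_factory import InferenceFactory

# Assumed factory config: verify the exact keys in inference_factory.py.
# The Space must be your own, since katanaml/sparrow-qwen2-vl-7b is private.
config = {
    "method": "huggingface",                     # hypothetical key
    "hf_space": "your-org/your-qwen2-vl-space",  # placeholder Space
    "hf_token": os.getenv("HF_TOKEN"),
}
model_inference_instance = InferenceFactory(config).get_inference_instance()

extractor = VLLMExtractor()
input_data = [{
    "file_path": "/data/invoice_1.pdf",          # placeholder document
    "text_input": "retrieve invoice number and total",
}]

# mode="static" simulates the LLM call without executing the LLM backend
results_array, num_pages = extractor.run_inference(
    model_inference_instance, input_data,
    generic_query=False, debug_dir="/data/", debug=True, mode="static")

for i, result in enumerate(results_array):
    print(f"Result for page {i + 1}:", result)
print(f"Number of pages: {num_pages}")
```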
{sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/setup.py

````diff
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
 
 setup(
     name="sparrow-parse",
-    version="0.3.6",
+    version="0.3.8",
     author="Andrej Baranovskij",
     author_email="andrejus.baranovskis@gmail.com",
     description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
````
sparrow-parse-0.3.8/sparrow_parse/__init__.py

````diff
@@ -0,0 +1 @@
+__version__ = '0.3.8'
````
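With the version now exposed at package level, it can be checked at runtime; a trivial sketch:

```python
import sparrow_parse

print(sparrow_parse.__version__)  # '0.3.8'
```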
{sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse/extractors/vllm_extractor.py

````diff
@@ -30,21 +30,16 @@ class VLLMExtractor(object):
                                                                       debug_dir,
                                                                       True)
 
-
-            for page_num, output_file in enumerate(output_files):
-                input_data[0]["file_path"] = output_file
-                if debug:
-                    print(f"Running inference on page {page_num + 1}...")
+            input_data[0]["file_path"] = output_files
 
-
-
-                results_array.append(result)
+            # Run inference on the page
+            results_array = model_inference_instance.inference(input_data, mode)
 
             shutil.rmtree(temp_dir, ignore_errors=True)
             return results_array, num_pages
 
-
-        results_array.
+        input_data[0]["file_path"] = [input_data[0]["file_path"]]
+        results_array = model_inference_instance.inference(input_data)
 
         return results_array, 1
 
@@ -80,7 +75,7 @@ if __name__ == "__main__":
     # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
     #                                                    debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
     #                                                    debug=True,
-    #                                                    mode=
+    #                                                    mode=None)
     #
     # for i, result in enumerate(results_array):
     #     print(f"Result for page {i + 1}:", result)
````
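The substance of this change: the per-page loop is gone, and the backend's `inference()` now receives every page path in a single call, with single-file inputs wrapped so `file_path` is always a list. A minimal sketch of the contract a backend must now satisfy (the class and data below are illustrative stand-ins, not package code):

```python
from typing import List, Optional

class FakeInference:
    """Stand-in backend: gets ALL page paths in one call, returns one result per page."""

    def inference(self, input_data: List[dict], mode: Optional[str] = None) -> List[str]:
        pages = input_data[0]["file_path"]  # always a list in 0.3.8
        return [f'{{"page": {i + 1}}}' for i in range(len(pages))]

# Two pages in, two results out, matching run_inference's results_array
input_data = [{"file_path": ["/tmp/page_1.jpg", "/tmp/page_2.jpg"],
               "text_input": "retrieve total"}]
assert len(FakeInference().inference(input_data)) == 2
```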
sparrow-parse-0.3.8/sparrow_parse/vllm/huggingface_inference.py

````diff
@@ -0,0 +1,60 @@
+from gradio_client import Client, handle_file
+from sparrow_parse.vllm.inference_base import ModelInference
+import json
+import os
+import ast
+
+
+class HuggingFaceInference(ModelInference):
+    def __init__(self, hf_space, hf_token):
+        self.hf_space = hf_space
+        self.hf_token = hf_token
+
+
+    def process_response(self, output_text):
+        json_string = output_text
+
+        json_string = json_string.strip("[]'")
+        json_string = json_string.replace("```json\n", "").replace("\n```", "")
+        json_string = json_string.replace("'", "")
+
+        try:
+            formatted_json = json.loads(json_string)
+            return json.dumps(formatted_json, indent=2)
+        except json.JSONDecodeError as e:
+            print("Failed to parse JSON:", e)
+            return output_text
+
+
+    def inference(self, input_data, mode=None):
+        if mode == "static":
+            simple_json = self.get_simple_json()
+            return [simple_json]
+
+        client = Client(self.hf_space, hf_token=self.hf_token)
+
+        # Extract and prepare the absolute paths for all file paths in input_data
+        file_paths = [
+            os.path.abspath(file_path)
+            for data in input_data
+            for file_path in data["file_path"]
+        ]
+
+        # Validate file existence and prepare files for the Gradio client
+        image_files = [handle_file(path) for path in file_paths if os.path.exists(path)]
+
+        results = client.predict(
+            input_imgs=image_files,
+            text_input=input_data[0]["text_input"],  # Single shared text input for all images
+            api_name="/run_inference"  # Specify the Gradio API endpoint
+        )
+
+        # Convert the string into a Python list
+        parsed_results = ast.literal_eval(results)
+
+        results_array = []
+        for page_output in parsed_results:
+            page_result = self.process_response(page_output)
+            results_array.append(page_result)
+
+        return results_array
````
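A direct-usage sketch of the new class, assuming your own Gradio Space exposing the `/run_inference` endpoint shown above (the Space name, token handling, and file paths are placeholders):

```python
import os

from sparrow_parse.vllm.huggingface_inference import HuggingFaceInference

inference = HuggingFaceInference(
    hf_space="your-org/your-qwen2-vl-space",  # the katanaml Space is private
    hf_token=os.getenv("HF_TOKEN"),
)

# In 0.3.8, "file_path" must be a list; VLLMExtractor wraps single paths for you.
input_data = [{
    "file_path": ["/data/invoice_1.jpg"],
    "text_input": "retrieve invoice number and total",
}]

for page_result in inference.inference(input_data):
    print(page_result)  # pretty-printed JSON, or raw text if parsing failed
```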
{sparrow-parse-0.3.6 → sparrow-parse-0.3.8}/sparrow_parse.egg-info/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.3.6
+Version: 0.3.8
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -65,7 +65,7 @@ input_data = [
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
                                                    debug_dir="/data/",
                                                    debug=True,
-                                                   mode=
+                                                   mode=None)
 
 for i, result in enumerate(results_array):
     print(f"Result for page {i + 1}:", result)
@@ -74,6 +74,10 @@ print(f"Number of pages: {num_pages}")
 
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 
+Method `run_inference` will return results and number of pages processed.
+
+Note: GPU backend `katanaml/sparrow-qwen2-vl-7b` is private, to be able to run below command, you need to create your own backend on Hugging Face space using [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
+
 ## PDF pre-processing
 
 ```
````
sparrow-parse-0.3.6/sparrow_parse/__init__.py

````diff
@@ -1 +0,0 @@
-__version__ = '0.3.6'
````
sparrow-parse-0.3.6/sparrow_parse/vllm/huggingface_inference.py

````diff
@@ -1,40 +0,0 @@
-from gradio_client import Client, handle_file
-from sparrow_parse.vllm.inference_base import ModelInference
-import json
-
-
-class HuggingFaceInference(ModelInference):
-    def __init__(self, hf_space, hf_token):
-        self.hf_space = hf_space
-        self.hf_token = hf_token
-
-
-    def process_response(self, output_text):
-        json_string = output_text
-
-        json_string = json_string.strip("[]'")
-        json_string = json_string.replace("```json\n", "").replace("\n```", "")
-        json_string = json_string.replace("'", "")
-
-        try:
-            formatted_json = json.loads(json_string)
-            return json.dumps(formatted_json, indent=2)
-        except json.JSONDecodeError as e:
-            print("Failed to parse JSON:", e)
-            return output_text
-
-
-    def inference(self, input_data, mode=None):
-        if mode == "static":
-            simple_json = self.get_simple_json()
-            return simple_json
-
-        client = Client(self.hf_space, hf_token=self.hf_token)
-
-        result = client.predict(
-            image=handle_file(input_data[0]["file_path"]),
-            text_input=input_data[0]["text_input"],
-            api_name="/run_inference"
-        )
-
-        return self.process_response(result)
````
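Comparing this removed 0.3.6 file with its 0.3.8 replacement above, the Gradio call contract changed from one image per request to a batch of images per request. A schematic sketch of the two contracts, with the kwarg names taken from the diffs and everything else assumed:

```python
from gradio_client import Client, handle_file

def predict_v036(client: Client, page_path: str, query: str):
    # 0.3.6: one page per call, single `image` kwarg, plain-string response
    return client.predict(image=handle_file(page_path),
                          text_input=query,
                          api_name="/run_inference")

def predict_v038(client: Client, page_paths: list, query: str):
    # 0.3.8: all pages in one call via `input_imgs`; the response is the string
    # repr of a Python list, hence ast.literal_eval in the new class
    return client.predict(input_imgs=[handle_file(p) for p in page_paths],
                          text_input=query,
                          api_name="/run_inference")
```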