sparrow-parse 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +13 -4
- sparrow_parse/vllm/mlx_inference.py +10 -8
- {sparrow_parse-0.4.1.dist-info → sparrow_parse-0.4.3.dist-info}/METADATA +1 -1
- {sparrow_parse-0.4.1.dist-info → sparrow_parse-0.4.3.dist-info}/RECORD +8 -8
- {sparrow_parse-0.4.1.dist-info → sparrow_parse-0.4.3.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.4.1.dist-info → sparrow_parse-0.4.3.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-0.4.1.dist-info → sparrow_parse-0.4.3.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = '0.4.1'
|
1
|
+
__version__ = '0.4.3'
|
@@ -54,7 +54,7 @@ class VLLMExtractor(object):
|
|
54
54
|
"""
|
55
55
|
file_path = input_data[0]["file_path"]
|
56
56
|
if tables_only:
|
57
|
-
return
|
57
|
+
return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
|
58
58
|
else:
|
59
59
|
input_data[0]["file_path"] = [file_path]
|
60
60
|
results = model_inference_instance.inference(input_data)
|
@@ -85,7 +85,8 @@ class VLLMExtractor(object):
|
|
85
85
|
tables_result = self._extract_tables(
|
86
86
|
model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
|
87
87
|
)
|
88
|
-
|
88
|
+
# Since _extract_tables returns a list with one JSON string, unpack it
|
89
|
+
results_array.extend(tables_result) # Unpack the single JSON string
|
89
90
|
else:
|
90
91
|
if debug:
|
91
92
|
print(f"Processing {len(output_files)} pages for inference at once.")
|
@@ -118,7 +119,15 @@ class VLLMExtractor(object):
|
|
118
119
|
results_array.append(result)
|
119
120
|
|
120
121
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
121
|
-
|
122
|
+
|
123
|
+
# Merge results_array elements into a single JSON structure
|
124
|
+
merged_results = {"page_tables": results_array}
|
125
|
+
|
126
|
+
# Format the merged results as a JSON string with indentation
|
127
|
+
formatted_results = json.dumps(merged_results, indent=4)
|
128
|
+
|
129
|
+
# Return the formatted JSON string wrapped in a list
|
130
|
+
return [formatted_results]
|
122
131
|
|
123
132
|
|
124
133
|
@staticmethod
|
@@ -166,7 +175,7 @@ if __name__ == "__main__":
|
|
166
175
|
# ]
|
167
176
|
#
|
168
177
|
# # Now you can run inference without knowing which implementation is used
|
169
|
-
# results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=
|
178
|
+
# results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
|
170
179
|
# generic_query=False,
|
171
180
|
# debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
|
172
181
|
# debug=True,
|
@@ -14,14 +14,12 @@ class MLXInference(ModelInference):
|
|
14
14
|
|
15
15
|
def __init__(self, model_name):
|
16
16
|
"""
|
17
|
-
Initialize the inference class with the given model name
|
17
|
+
Initialize the inference class with the given model name.
|
18
18
|
|
19
19
|
:param model_name: Name of the model to load.
|
20
20
|
"""
|
21
|
-
self.
|
22
|
-
|
23
|
-
|
24
|
-
print(f"Loaded model: {model_name}")
|
21
|
+
self.model_name = model_name
|
22
|
+
print(f"MLXInference initialized with model: {model_name}")
|
25
23
|
|
26
24
|
|
27
25
|
@staticmethod
|
@@ -89,6 +87,10 @@ class MLXInference(ModelInference):
|
|
89
87
|
if mode == "static":
|
90
88
|
return [self.get_simple_json()]
|
91
89
|
|
90
|
+
# Load the model and processor
|
91
|
+
model, processor = self._load_model_and_processor(self.model_name)
|
92
|
+
config = model.config
|
93
|
+
|
92
94
|
# Prepare absolute file paths
|
93
95
|
file_paths = self._extract_file_paths(input_data)
|
94
96
|
|
@@ -103,10 +105,10 @@ class MLXInference(ModelInference):
|
|
103
105
|
]
|
104
106
|
|
105
107
|
# Generate and process response
|
106
|
-
prompt = apply_chat_template(
|
108
|
+
prompt = apply_chat_template(processor, config, messages) # Assuming defined
|
107
109
|
response = generate(
|
108
|
-
|
109
|
-
|
110
|
+
model,
|
111
|
+
processor,
|
110
112
|
image,
|
111
113
|
prompt,
|
112
114
|
resize_shape=(width, height),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.4.1
|
3
|
+
Version: 0.4.3
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -1,7 +1,7 @@
|
|
1
|
-
sparrow_parse/__init__.py,sha256=
|
1
|
+
sparrow_parse/__init__.py,sha256=udnlByVnFcZDwWir50pEbTU0bIwgBrpNtAiVExFEzu0,21
|
2
2
|
sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
|
3
3
|
sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
sparrow_parse/extractors/vllm_extractor.py,sha256=
|
4
|
+
sparrow_parse/extractors/vllm_extractor.py,sha256=ybWpRpDH0YHoYpHkjIJtm7DQoHJBKNsirK2YIAlMvGo,7863
|
5
5
|
sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
|
7
7
|
sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -11,9 +11,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
|
|
11
11
|
sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
|
12
12
|
sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
|
13
13
|
sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
|
14
|
-
sparrow_parse/vllm/mlx_inference.py,sha256=
|
15
|
-
sparrow_parse-0.4.
|
16
|
-
sparrow_parse-0.4.
|
17
|
-
sparrow_parse-0.4.
|
18
|
-
sparrow_parse-0.4.
|
19
|
-
sparrow_parse-0.4.
|
14
|
+
sparrow_parse/vllm/mlx_inference.py,sha256=cx-PLXf1t8ro50YALddj70FiR7s0gk_Ddp-I9XlPQQU,4788
|
15
|
+
sparrow_parse-0.4.3.dist-info/METADATA,sha256=W7zeOHa09rgn-58aIdTkNOSqBLgpziDF7sZ_059jaoo,6432
|
16
|
+
sparrow_parse-0.4.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
17
|
+
sparrow_parse-0.4.3.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
|
18
|
+
sparrow_parse-0.4.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
|
19
|
+
sparrow_parse-0.4.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|