sparrow-parse 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +19 -2
- sparrow_parse/vllm/mlx_inference.py +72 -17
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.4.dist-info}/METADATA +5 -5
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.4.dist-info}/RECORD +8 -8
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.4.dist-info}/WHEEL +0 -0
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.4.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.4.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = '1.0.2'
|
1
|
+
__version__ = '1.0.4'
|
@@ -25,11 +25,21 @@ class VLLMExtractor(object):
|
|
25
25
|
if debug:
|
26
26
|
print("Input data:", input_data)
|
27
27
|
|
28
|
+
# Handle both missing file_path and file_path=None as text-only inference
|
29
|
+
is_text_only = "file_path" not in input_data[0] or input_data[0]["file_path"] is None
|
30
|
+
|
31
|
+
if is_text_only:
|
32
|
+
# Ensure file_path exists and is None for consistency
|
33
|
+
input_data[0]["file_path"] = None
|
34
|
+
results = model_inference_instance.inference(input_data)
|
35
|
+
return results, 0
|
36
|
+
|
37
|
+
# Document data extraction inference (file_path exists and is not None)
|
28
38
|
file_path = input_data[0]["file_path"]
|
29
39
|
if self.is_pdf(file_path):
|
30
40
|
return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
|
31
|
-
|
32
|
-
|
41
|
+
else:
|
42
|
+
return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
|
33
43
|
|
34
44
|
|
35
45
|
def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
|
@@ -224,6 +234,13 @@ if __name__ == "__main__":
|
|
224
234
|
# }
|
225
235
|
# ]
|
226
236
|
#
|
237
|
+
# # input_data = [
|
238
|
+
# # {
|
239
|
+
# # "file_path": None,
|
240
|
+
# # "text_input": "why earth is spinning around the sun?"
|
241
|
+
# # }
|
242
|
+
# # ]
|
243
|
+
#
|
227
244
|
# # Now you can run inference without knowing which implementation is used
|
228
245
|
# results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
|
229
246
|
# generic_query=False,
|
@@ -106,31 +106,71 @@ class MLXInference(ModelInference):
|
|
106
106
|
:param mode: Optional mode for inference ("static" for simple JSON output).
|
107
107
|
:return: List of processed model responses.
|
108
108
|
"""
|
109
|
+
# Handle static mode
|
109
110
|
if mode == "static":
|
110
111
|
return [self.get_simple_json()]
|
111
112
|
|
112
113
|
# Load the model and processor
|
113
114
|
model, processor = self._load_model_and_processor(self.model_name)
|
114
115
|
config = model.config
|
116
|
+
|
117
|
+
# Determine if we're doing text-only or image-based inference
|
118
|
+
is_text_only = input_data[0].get("file_path") is None
|
119
|
+
|
120
|
+
if is_text_only:
|
121
|
+
# Text-only inference
|
122
|
+
messages = input_data[0]["text_input"]
|
123
|
+
response = self._generate_text_response(model, processor, config, messages)
|
124
|
+
results = [response]
|
125
|
+
else:
|
126
|
+
# Image-based inference
|
127
|
+
file_paths = self._extract_file_paths(input_data)
|
128
|
+
results = self._process_images(model, processor, config, file_paths, input_data)
|
129
|
+
|
130
|
+
return results
|
115
131
|
|
116
|
-
|
117
|
-
|
118
|
-
|
132
|
+
def _generate_text_response(self, model, processor, config, messages):
|
133
|
+
"""
|
134
|
+
Generate a text response for text-only inputs.
|
135
|
+
|
136
|
+
:param model: The loaded model
|
137
|
+
:param processor: The loaded processor
|
138
|
+
:param config: Model configuration
|
139
|
+
:param messages: Input messages
|
140
|
+
:return: Generated response
|
141
|
+
"""
|
142
|
+
prompt = apply_chat_template(processor, config, messages)
|
143
|
+
response = generate(
|
144
|
+
model,
|
145
|
+
processor,
|
146
|
+
prompt,
|
147
|
+
max_tokens=4000,
|
148
|
+
temperature=0.0,
|
149
|
+
verbose=False
|
150
|
+
)
|
151
|
+
print("Inference completed successfully")
|
152
|
+
return response
|
153
|
+
|
154
|
+
def _process_images(self, model, processor, config, file_paths, input_data):
|
155
|
+
"""
|
156
|
+
Process images and generate responses for each.
|
157
|
+
|
158
|
+
:param model: The loaded model
|
159
|
+
:param processor: The loaded processor
|
160
|
+
:param config: Model configuration
|
161
|
+
:param file_paths: List of image file paths
|
162
|
+
:param input_data: Original input data
|
163
|
+
:return: List of processed responses
|
164
|
+
"""
|
119
165
|
results = []
|
120
166
|
for file_path in file_paths:
|
121
167
|
image, width, height = self.load_image_data(file_path)
|
122
|
-
|
123
|
-
# Prepare messages
|
124
|
-
|
125
|
-
|
126
|
-
else:
|
127
|
-
messages = [
|
128
|
-
{"role": "system", "content": "You are an expert at extracting structured text from image documents."},
|
129
|
-
{"role": "user", "content": input_data[0]["text_input"]},
|
130
|
-
]
|
131
|
-
|
168
|
+
|
169
|
+
# Prepare messages based on model type
|
170
|
+
messages = self._prepare_messages(input_data, file_path)
|
171
|
+
|
132
172
|
# Generate and process response
|
133
|
-
prompt = apply_chat_template(processor, config, messages)
|
173
|
+
prompt = apply_chat_template(processor, config, messages)
|
134
174
|
response = generate(
|
135
175
|
model,
|
136
176
|
processor,
|
@@ -142,11 +182,26 @@ class MLXInference(ModelInference):
|
|
142
182
|
verbose=False
|
143
183
|
)
|
144
184
|
results.append(self.process_response(response))
|
145
|
-
|
146
|
-
|
147
|
-
|
185
|
+
print(f"Inference completed successfully for: {file_path}")
|
186
|
+
|
148
187
|
return results
|
149
188
|
|
189
|
+
def _prepare_messages(self, input_data, file_path):
|
190
|
+
"""
|
191
|
+
Prepare the appropriate messages based on the model type.
|
192
|
+
|
193
|
+
:param input_data: Original input data
|
194
|
+
:param file_path: Current file path being processed
|
195
|
+
:return: Properly formatted messages
|
196
|
+
"""
|
197
|
+
if "mistral" in self.model_name.lower():
|
198
|
+
return input_data[0]["text_input"]
|
199
|
+
else:
|
200
|
+
return [
|
201
|
+
{"role": "system", "content": "You are an expert at extracting structured text from image documents."},
|
202
|
+
{"role": "user", "content": input_data[0]["text_input"]},
|
203
|
+
]
|
204
|
+
|
150
205
|
@staticmethod
|
151
206
|
def _extract_file_paths(input_data):
|
152
207
|
"""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 1.0.2
|
3
|
+
Version: 1.0.4
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -15,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
|
|
15
15
|
Requires-Python: >=3.10
|
16
16
|
Description-Content-Type: text/markdown
|
17
17
|
Requires-Dist: rich
|
18
|
-
Requires-Dist: transformers >=4.
|
18
|
+
Requires-Dist: transformers >=4.51.3
|
19
19
|
Requires-Dist: torchvision >=0.21.0
|
20
20
|
Requires-Dist: torch >=2.6.0
|
21
|
-
Requires-Dist: sentence-transformers >=4.
|
21
|
+
Requires-Dist: sentence-transformers >=4.1.0
|
22
22
|
Requires-Dist: numpy >=2.2.4
|
23
23
|
Requires-Dist: pypdf >=5.4.0
|
24
24
|
Requires-Dist: gradio-client >=1.7.2
|
25
25
|
Requires-Dist: pdf2image >=1.17.0
|
26
|
-
Requires-Dist: mlx >=0.
|
27
|
-
Requires-Dist: mlx-vlm ==0.1.
|
26
|
+
Requires-Dist: mlx >=0.25.0 ; sys_platform == "darwin" and platform_machine == "arm64"
|
27
|
+
Requires-Dist: mlx-vlm ==0.1.23 ; sys_platform == "darwin" and platform_machine == "arm64"
|
28
28
|
|
29
29
|
# Sparrow Parse
|
30
30
|
|
@@ -1,8 +1,8 @@
|
|
1
|
-
sparrow_parse/__init__.py,sha256=
|
1
|
+
sparrow_parse/__init__.py,sha256=Oi2b5pm3sFbESQW0xgj8kqwDPX_Hxmx4gNILYpLzYqI,21
|
2
2
|
sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
|
3
3
|
sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
|
4
4
|
sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
sparrow_parse/extractors/vllm_extractor.py,sha256=
|
5
|
+
sparrow_parse/extractors/vllm_extractor.py,sha256=ZxYiSrdKWLcBXn4LUuvEcDH0q_Ua8xTzqmEF15puP08,10557
|
6
6
|
sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
|
8
8
|
sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
|
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
|
|
13
13
|
sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
|
14
14
|
sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
|
15
15
|
sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
|
16
|
-
sparrow_parse/vllm/mlx_inference.py,sha256=
|
17
|
-
sparrow_parse-1.0.
|
18
|
-
sparrow_parse-1.0.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
19
|
-
sparrow_parse-1.0.2.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
|
20
|
-
sparrow_parse-1.0.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
|
21
|
-
sparrow_parse-1.0.2.dist-info/RECORD,,
|
16
|
+
sparrow_parse/vllm/mlx_inference.py,sha256=StjoeTqhy_CKh8lM0BLIBrDeYlWY17FrxKBpdOtqD5g,7996
|
17
|
+
sparrow_parse-1.0.4.dist-info/METADATA,sha256=6WD5U1d66RXvijQojfnJ35eiN3nfBgWQ98E94_Zsglg,7229
|
18
|
+
sparrow_parse-1.0.4.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
19
|
+
sparrow_parse-1.0.4.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
|
20
|
+
sparrow_parse-1.0.4.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
|
21
|
+
sparrow_parse-1.0.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|