sparrow_parse-1.0.2-py3-none-any.whl → sparrow_parse-1.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +19 -2
- sparrow_parse/vllm/mlx_inference.py +71 -17
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/METADATA +5 -5
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/RECORD +8 -8
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/WHEEL +0 -0
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.0.2'
+__version__ = '1.0.3'
sparrow_parse/extractors/vllm_extractor.py
CHANGED
@@ -25,11 +25,21 @@ class VLLMExtractor(object):
         if debug:
             print("Input data:", input_data)
 
+        # Handle both missing file_path and file_path=None as text-only inference
+        is_text_only = "file_path" not in input_data[0] or input_data[0]["file_path"] is None
+
+        if is_text_only:
+            # Ensure file_path exists and is None for consistency
+            input_data[0]["file_path"] = None
+            results = model_inference_instance.inference(input_data)
+            return results, 0
+
+        # Document data extraction inference (file_path exists and is not None)
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
             return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
-
-
+        else:
+            return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
 
 
     def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
@@ -224,6 +234,13 @@ if __name__ == "__main__":
     # }
     # ]
     #
+    # # input_data = [
+    # #     {
+    # #         "file_path": None,
+    # #         "text_input": "why earth is spinning around the sun?"
+    # #     }
+    # # ]
+    #
     # # Now you can run inference without knowing which implementation is used
     # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
     #                                                    generic_query=False,
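Taken together, these changes let callers push a text-only query through the same entry point as document extraction. A minimal sketch of such a call is shown below; the inference-instance setup (via the package's InferenceFactory, as in the commented-out __main__ example) is assumed rather than shown, and only parameters visible in this diff are used.

# Sketch only: text-only inference via the new file_path handling in 1.0.3.
# model_inference_instance is assumed to be created elsewhere, e.g. through
# InferenceFactory as in the commented-out example in __main__.
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

extractor = VLLMExtractor()

input_data = [
    {
        "file_path": None,  # may also be omitted entirely; both mean text-only
        "text_input": "why earth is spinning around the sun?"
    }
]

results_array, num_pages = extractor.run_inference(
    model_inference_instance,  # assumed to exist; not constructed here
    input_data,
    tables_only=False,
    generic_query=False
)
print(results_array)  # backend response
print(num_pages)      # 0 for text-only runs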
sparrow_parse/vllm/mlx_inference.py
CHANGED
@@ -106,31 +106,70 @@ class MLXInference(ModelInference):
         :param mode: Optional mode for inference ("static" for simple JSON output).
         :return: List of processed model responses.
         """
+        # Handle static mode
         if mode == "static":
             return [self.get_simple_json()]
 
         # Load the model and processor
         model, processor = self._load_model_and_processor(self.model_name)
         config = model.config
+
+        # Determine if we're doing text-only or image-based inference
+        is_text_only = input_data[0].get("file_path") is None
+
+        if is_text_only:
+            # Text-only inference
+            messages = input_data[0]["text_input"]
+            response = self._generate_text_response(model, processor, config, messages)
+            results = [self.process_response(response)]
+            print("Agent inference completed successfully")
+        else:
+            # Image-based inference
+            file_paths = self._extract_file_paths(input_data)
+            results = self._process_images(model, processor, config, file_paths, input_data)
+
+        return results
 
-
-
-
+    def _generate_text_response(self, model, processor, config, messages):
+        """
+        Generate a text response for text-only inputs.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param messages: Input messages
+        :return: Generated response
+        """
+        prompt = apply_chat_template(processor, config, messages)
+        return generate(
+            model,
+            processor,
+            prompt,
+            max_tokens=4000,
+            temperature=0.0,
+            verbose=False
+        )
+
+    def _process_images(self, model, processor, config, file_paths, input_data):
+        """
+        Process images and generate responses for each.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param file_paths: List of image file paths
+        :param input_data: Original input data
+        :return: List of processed responses
+        """
         results = []
         for file_path in file_paths:
             image, width, height = self.load_image_data(file_path)
-
-            # Prepare messages
-
-
-            else:
-                messages = [
-                    {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
-                    {"role": "user", "content": input_data[0]["text_input"]},
-                ]
-
+
+            # Prepare messages based on model type
+            messages = self._prepare_messages(input_data, file_path)
+
             # Generate and process response
-            prompt = apply_chat_template(processor, config, messages)
+            prompt = apply_chat_template(processor, config, messages)
             response = generate(
                 model,
                 processor,
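For orientation, the new text-only branch boils down to a plain mlx-vlm text generation call with the decoding settings visible in the diff (max_tokens=4000, temperature=0.0). A rough standalone equivalent is sketched below; the checkpoint name is an assumption for illustration, not something this diff pins down.

# Rough standalone sketch of what the text-only path does (not the package API).
# apply_chat_template and generate come from mlx-vlm, as used in mlx_inference.py.
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template

model, processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-4bit")  # assumed checkpoint
config = model.config

prompt = apply_chat_template(processor, config, "why earth is spinning around the sun?")
response = generate(model, processor, prompt, max_tokens=4000, temperature=0.0, verbose=False)
print(response)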
@@ -142,11 +181,26 @@ class MLXInference(ModelInference):
                 verbose=False
             )
             results.append(self.process_response(response))
-
-
-
+            print(f"Inference completed successfully for: {file_path}")
+
         return results
 
+    def _prepare_messages(self, input_data, file_path):
+        """
+        Prepare the appropriate messages based on the model type.
+
+        :param input_data: Original input data
+        :param file_path: Current file path being processed
+        :return: Properly formatted messages
+        """
+        if "mistral" in self.model_name.lower():
+            return input_data[0]["text_input"]
+        else:
+            return [
+                {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                {"role": "user", "content": input_data[0]["text_input"]},
+            ]
+
     @staticmethod
     def _extract_file_paths(input_data):
         """
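The effect of _prepare_messages is easiest to see in isolation: Mistral-style checkpoints get the raw query string, every other model gets a system/user chat list. A small standalone sketch follows; the model names are illustrative assumptions, and the real method lives on MLXInference and reads self.model_name.

# Standalone sketch of the _prepare_messages branching (illustration only).
def prepare_messages(model_name, text_input):
    if "mistral" in model_name.lower():
        return text_input  # raw prompt string, no chat scaffolding
    return [
        {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
        {"role": "user", "content": text_input},
    ]

print(prepare_messages("mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit", "extract the table"))
print(prepare_messages("mlx-community/Qwen2.5-VL-7B-Instruct-4bit", "extract the table"))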
{sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 1.0.2
+Version: 1.0.3
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -15,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: rich
-Requires-Dist: transformers >=4.
+Requires-Dist: transformers >=4.51.3
 Requires-Dist: torchvision >=0.21.0
 Requires-Dist: torch >=2.6.0
-Requires-Dist: sentence-transformers >=4.
+Requires-Dist: sentence-transformers >=4.1.0
 Requires-Dist: numpy >=2.2.4
 Requires-Dist: pypdf >=5.4.0
 Requires-Dist: gradio-client >=1.7.2
 Requires-Dist: pdf2image >=1.17.0
-Requires-Dist: mlx >=0.
-Requires-Dist: mlx-vlm ==0.1.
+Requires-Dist: mlx >=0.25.0 ; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: mlx-vlm ==0.1.23 ; sys_platform == "darwin" and platform_machine == "arm64"
 
 # Sparrow Parse
 
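The new environment markers confine the MLX stack to Apple Silicon macOS; on any other platform pip simply skips both requirements. A quick way to see what the marker evaluates to on the current machine (a sketch, not part of the package):

# Sketch: evaluate the same condition the new environment markers use.
# mlx >=0.25.0 and mlx-vlm ==0.1.23 are only installed where this is True.
import platform
import sys

on_apple_silicon = sys.platform == "darwin" and platform.machine() == "arm64"
print("mlx / mlx-vlm would be installed here:", on_apple_silicon)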
{sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
-sparrow_parse/__init__.py,sha256=
+sparrow_parse/__init__.py,sha256=MpVHFFoITiYyPltTb_qFrdeX2entdTm4x0PczXi3txY,21
 sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
 sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
 sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=
+sparrow_parse/extractors/vllm_extractor.py,sha256=ZxYiSrdKWLcBXn4LUuvEcDH0q_Ua8xTzqmEF15puP08,10557
 sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
 sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
 sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
 sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
 sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
-sparrow_parse/vllm/mlx_inference.py,sha256=
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
+sparrow_parse/vllm/mlx_inference.py,sha256=vqIkfTd5rP8bnZ8K_CGVEWe_G3E4i3rwN9MfLBDiE3c,8000
+sparrow_parse-1.0.3.dist-info/METADATA,sha256=dIGBhBhtR5rSKj4RbT1PhyrWxKUVUq5AxbJ33FsKNlE,7229
+sparrow_parse-1.0.3.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+sparrow_parse-1.0.3.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
+sparrow_parse-1.0.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-1.0.3.dist-info/RECORD,,
{sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/WHEEL
File without changes
{sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/entry_points.txt
File without changes
{sparrow_parse-1.0.2.dist-info → sparrow_parse-1.0.3.dist-info}/top_level.txt
File without changes