sparrow-parse 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +27 -2
- sparrow_parse/vllm/mlx_inference.py +71 -17
- {sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/METADATA +5 -5
- {sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/RECORD +8 -8
- {sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/WHEEL +0 -0
- {sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '1.0.1'
+__version__ = '1.0.3'
sparrow_parse/extractors/vllm_extractor.py
CHANGED
@@ -25,11 +25,21 @@ class VLLMExtractor(object):
         if debug:
             print("Input data:", input_data)

+        # Handle both missing file_path and file_path=None as text-only inference
+        is_text_only = "file_path" not in input_data[0] or input_data[0]["file_path"] is None
+
+        if is_text_only:
+            # Ensure file_path exists and is None for consistency
+            input_data[0]["file_path"] = None
+            results = model_inference_instance.inference(input_data)
+            return results, 0
+
+        # Document data extraction inference (file_path exists and is not None)
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
             return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
-
-
+        else:
+            return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)


     def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
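Taken together with the commented example further down in this file, the new branch lets a caller run pure text prompts through the same entry point. Below is a minimal sketch of that flow; the stub backend and the keyword arguments shown are assumptions for illustration, since this diff does not show run_inference's full signature or the real backend construction.

    from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

    class StubInference:
        """Hypothetical stand-in for a real model backend (e.g. MLXInference)."""
        def inference(self, input_data):
            return ["stub answer for: " + input_data[0]["text_input"]]

    extractor = VLLMExtractor()
    # No "file_path" key at all: run_inference normalises it to None,
    # calls inference() directly, and reports zero processed pages.
    input_data = [{"text_input": "why earth is spinning around the sun?"}]

    results, num_pages = extractor.run_inference(StubInference(), input_data,
                                                 tables_only=False, debug=True)
    print(results, num_pages)  # -> ['stub answer for: ...'] 0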
@@ -147,6 +157,14 @@ class VLLMExtractor(object):
         table_detector = TableDetector()
         cropped_tables = table_detector.detect_tables(file_path, local=False, debug_dir=debug_dir, debug=debug)
         results_array = []
+
+        # Check if no tables were found
+        if cropped_tables is None:
+            if debug:
+                print(f"No tables detected in {file_path}")
+            # Return a structured no-tables-found response instead of failing
+            return [json.dumps({"message": "No tables detected in the document", "status": "empty"})]
+
         temp_dir = tempfile.mkdtemp()

         for i, table in enumerate(cropped_tables):
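Callers that previously assumed a per-table result list now need to recognise this sentinel. A small sketch of the check; the results value below is just the JSON string produced above, reproduced for illustration.

    import json

    results = ['{"message": "No tables detected in the document", "status": "empty"}']

    first = json.loads(results[0])
    if isinstance(first, dict) and first.get("status") == "empty":
        # No tables in the document: nothing to post-process.
        print(first["message"])
    else:
        # Normal table-extraction output: handle each entry as before.
        for entry in results:
            print(entry)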
@@ -216,6 +234,13 @@ if __name__ == "__main__":
     # }
     # ]
     #
+    # # input_data = [
+    # #     {
+    # #         "file_path": None,
+    # #         "text_input": "why earth is spinning around the sun?"
+    # #     }
+    # # ]
+    #
     # # Now you can run inference without knowing which implementation is used
     # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
     #                                                    generic_query=False,
sparrow_parse/vllm/mlx_inference.py
CHANGED
@@ -106,31 +106,70 @@ class MLXInference(ModelInference):
         :param mode: Optional mode for inference ("static" for simple JSON output).
         :return: List of processed model responses.
         """
+        # Handle static mode
         if mode == "static":
             return [self.get_simple_json()]

         # Load the model and processor
         model, processor = self._load_model_and_processor(self.model_name)
         config = model.config
+
+        # Determine if we're doing text-only or image-based inference
+        is_text_only = input_data[0].get("file_path") is None
+
+        if is_text_only:
+            # Text-only inference
+            messages = input_data[0]["text_input"]
+            response = self._generate_text_response(model, processor, config, messages)
+            results = [self.process_response(response)]
+            print("Agent inference completed successfully")
+        else:
+            # Image-based inference
+            file_paths = self._extract_file_paths(input_data)
+            results = self._process_images(model, processor, config, file_paths, input_data)
+
+        return results

-
-
-
+    def _generate_text_response(self, model, processor, config, messages):
+        """
+        Generate a text response for text-only inputs.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param messages: Input messages
+        :return: Generated response
+        """
+        prompt = apply_chat_template(processor, config, messages)
+        return generate(
+            model,
+            processor,
+            prompt,
+            max_tokens=4000,
+            temperature=0.0,
+            verbose=False
+        )
+
+    def _process_images(self, model, processor, config, file_paths, input_data):
+        """
+        Process images and generate responses for each.
+
+        :param model: The loaded model
+        :param processor: The loaded processor
+        :param config: Model configuration
+        :param file_paths: List of image file paths
+        :param input_data: Original input data
+        :return: List of processed responses
+        """
         results = []
         for file_path in file_paths:
             image, width, height = self.load_image_data(file_path)
-
-            # Prepare messages
-
-
-            else:
-                messages = [
-                    {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
-                    {"role": "user", "content": input_data[0]["text_input"]},
-                ]
-
+
+            # Prepare messages based on model type
+            messages = self._prepare_messages(input_data, file_path)
+
             # Generate and process response
-            prompt = apply_chat_template(processor, config, messages)
+            prompt = apply_chat_template(processor, config, messages)
             response = generate(
                 model,
                 processor,
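The refactored inference() now dispatches on file_path alone, so the two branches expect input_data shaped roughly as follows; the values are illustrative and not taken from this diff.

    # Text-only ("agent") call: no image is loaded; text_input goes straight
    # into apply_chat_template, so a plain prompt string or a chat-style
    # message list can work, depending on what the underlying template expects.
    text_only_input = [
        {
            "file_path": None,
            "text_input": "why earth is spinning around the sun?"
        }
    ]

    # Image call: each file_path is loaded with load_image_data() and answered
    # separately inside _process_images.
    image_input = [
        {
            "file_path": "/tmp/invoice_page_1.png",                   # hypothetical path
            "text_input": "retrieve all invoice fields, return JSON"  # hypothetical query
        }
    ]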
@@ -142,11 +181,26 @@ class MLXInference(ModelInference):
                 verbose=False
             )
             results.append(self.process_response(response))
-
-
-
+            print(f"Inference completed successfully for: {file_path}")
+
         return results

+    def _prepare_messages(self, input_data, file_path):
+        """
+        Prepare the appropriate messages based on the model type.
+
+        :param input_data: Original input data
+        :param file_path: Current file path being processed
+        :return: Properly formatted messages
+        """
+        if "mistral" in self.model_name.lower():
+            return input_data[0]["text_input"]
+        else:
+            return [
+                {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                {"role": "user", "content": input_data[0]["text_input"]},
+            ]
+
     @staticmethod
     def _extract_file_paths(input_data):
         """
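To make the model-type switch concrete, here is a standalone mirror of the _prepare_messages logic; only the "mistral" substring check and the system prompt come from the code above, and the model names are hypothetical examples.

    def choose_messages(model_name: str, text_input: str):
        # Mirrors MLXInference._prepare_messages: Mistral-style checkpoints get the
        # raw prompt, everything else gets an explicit system + user chat structure.
        if "mistral" in model_name.lower():
            return text_input
        return [
            {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
            {"role": "user", "content": text_input},
        ]

    print(choose_messages("mlx-community/Mistral-Small-3.1-24B-Instruct-8bit", "extract totals"))  # plain string
    print(choose_messages("mlx-community/Qwen2.5-VL-7B-Instruct-4bit", "extract totals"))          # chat list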
{sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 1.0.1
+Version: 1.0.3
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -15,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: rich
-Requires-Dist: transformers >=4.
+Requires-Dist: transformers >=4.51.3
 Requires-Dist: torchvision >=0.21.0
 Requires-Dist: torch >=2.6.0
-Requires-Dist: sentence-transformers >=4.
+Requires-Dist: sentence-transformers >=4.1.0
 Requires-Dist: numpy >=2.2.4
 Requires-Dist: pypdf >=5.4.0
 Requires-Dist: gradio-client >=1.7.2
 Requires-Dist: pdf2image >=1.17.0
-Requires-Dist: mlx >=0.
-Requires-Dist: mlx-vlm ==0.1.
+Requires-Dist: mlx >=0.25.0 ; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: mlx-vlm ==0.1.23 ; sys_platform == "darwin" and platform_machine == "arm64"

 # Sparrow Parse

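The new environment markers restrict mlx and mlx-vlm to Apple Silicon macOS: pip evaluates the marker at install time and simply skips both requirements elsewhere. A quick way to see how that marker evaluates on the current machine, using the packaging library that pip itself relies on (shown for illustration only):

    from packaging.markers import Marker

    marker = Marker('sys_platform == "darwin" and platform_machine == "arm64"')
    # True only on Apple Silicon macOS; on Linux, Windows, or Intel macOS
    # pip skips the mlx and mlx-vlm requirements entirely.
    print(marker.evaluate())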
{sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
-sparrow_parse/__init__.py,sha256=
+sparrow_parse/__init__.py,sha256=MpVHFFoITiYyPltTb_qFrdeX2entdTm4x0PczXi3txY,21
 sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
 sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
 sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=
+sparrow_parse/extractors/vllm_extractor.py,sha256=ZxYiSrdKWLcBXn4LUuvEcDH0q_Ua8xTzqmEF15puP08,10557
 sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
 sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
 sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
 sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
 sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
-sparrow_parse/vllm/mlx_inference.py,sha256=
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
-sparrow_parse-1.0.
+sparrow_parse/vllm/mlx_inference.py,sha256=vqIkfTd5rP8bnZ8K_CGVEWe_G3E4i3rwN9MfLBDiE3c,8000
+sparrow_parse-1.0.3.dist-info/METADATA,sha256=dIGBhBhtR5rSKj4RbT1PhyrWxKUVUq5AxbJ33FsKNlE,7229
+sparrow_parse-1.0.3.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+sparrow_parse-1.0.3.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
+sparrow_parse-1.0.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-1.0.3.dist-info/RECORD,,
{sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/WHEEL
File without changes
{sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/entry_points.txt
File without changes
{sparrow_parse-1.0.1.dist-info → sparrow_parse-1.0.3.dist-info}/top_level.txt
File without changes