sparrow-parse 0.5.5__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +13 -4
- sparrow_parse/text_extraction.py +16 -11
- sparrow_parse/vllm/mlx_inference.py +38 -16
- {sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/METADATA +11 -15
- {sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/RECORD +9 -9
- {sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/WHEEL +1 -1
- {sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/entry_points.txt +0 -1
- {sparrow_parse-0.5.5.dist-info → sparrow_parse-1.0.2.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = '0.5.5'
|
1
|
+
__version__ = '1.0.2'
|
@@ -147,6 +147,14 @@ class VLLMExtractor(object):
|
|
147
147
|
table_detector = TableDetector()
|
148
148
|
cropped_tables = table_detector.detect_tables(file_path, local=False, debug_dir=debug_dir, debug=debug)
|
149
149
|
results_array = []
|
150
|
+
|
151
|
+
# Check if no tables were found
|
152
|
+
if cropped_tables is None:
|
153
|
+
if debug:
|
154
|
+
print(f"No tables detected in {file_path}")
|
155
|
+
# Return a structured no-tables-found response instead of failing
|
156
|
+
return [json.dumps({"message": "No tables detected in the document", "status": "empty"})]
|
157
|
+
|
150
158
|
temp_dir = tempfile.mkdtemp()
|
151
159
|
|
152
160
|
for i, table in enumerate(cropped_tables):
|
@@ -198,7 +206,7 @@ if __name__ == "__main__":
|
|
198
206
|
# # export HF_TOKEN="hf_"
|
199
207
|
# config = {
|
200
208
|
# "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
|
201
|
-
# "model_name": "mlx-community/
|
209
|
+
# "model_name": "mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit",
|
202
210
|
# # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
|
203
211
|
# # "hf_token": os.getenv('HF_TOKEN'),
|
204
212
|
# # Additional fields for local GPU inference
|
@@ -211,8 +219,8 @@ if __name__ == "__main__":
|
|
211
219
|
#
|
212
220
|
# input_data = [
|
213
221
|
# {
|
214
|
-
# "file_path": "/
|
215
|
-
# "text_input": "retrieve
|
222
|
+
# "file_path": "sparrow_parse/images/bonds_table.png",
|
223
|
+
# "text_input": "retrieve all data. return response in JSON format"
|
216
224
|
# }
|
217
225
|
# ]
|
218
226
|
#
|
@@ -226,4 +234,5 @@ if __name__ == "__main__":
|
|
226
234
|
#
|
227
235
|
# for i, result in enumerate(results_array):
|
228
236
|
# print(f"Result for page {i + 1}:", result)
|
229
|
-
# print(f"Number of pages: {num_pages}")
|
237
|
+
# print(f"Number of pages: {num_pages}")
|
238
|
+
|
sparrow_parse/text_extraction.py
CHANGED
@@ -1,30 +1,35 @@
|
|
1
1
|
from mlx_vlm import load, apply_chat_template, generate
|
2
2
|
from mlx_vlm.utils import load_image
|
3
3
|
|
4
|
-
# For test purposes, we will use a sample image
|
5
4
|
|
6
5
|
# Load model and processor
|
7
|
-
|
8
|
-
|
6
|
+
# vl_model, vl_processor = load("mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit")
|
7
|
+
vl_model, vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
|
8
|
+
vl_config = vl_model.config
|
9
9
|
|
10
|
-
image = load_image("images/
|
10
|
+
image = load_image("images/bonds_table.png")
|
11
11
|
|
12
12
|
messages = [
|
13
13
|
{"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
|
14
|
-
{"role": "user", "content": "
|
14
|
+
{"role": "user", "content": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"}
|
15
15
|
]
|
16
16
|
|
17
|
+
# message = "retrieve all data. return response in JSON format"
|
18
|
+
# message = "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
|
19
|
+
|
17
20
|
# Apply chat template
|
18
|
-
prompt = apply_chat_template(
|
21
|
+
prompt = apply_chat_template(vl_processor, vl_config, messages)
|
22
|
+
# prompt = apply_chat_template(vl_processor, vl_config, message)
|
19
23
|
|
20
24
|
# Generate text
|
21
|
-
|
22
|
-
|
23
|
-
|
25
|
+
vl_output = generate(
|
26
|
+
vl_model,
|
27
|
+
vl_processor,
|
24
28
|
prompt,
|
25
29
|
image,
|
26
30
|
max_tokens=1000,
|
27
|
-
temperature=0
|
31
|
+
temperature=0,
|
32
|
+
verbose=False
|
28
33
|
)
|
29
34
|
|
30
|
-
print(
|
35
|
+
print(vl_output)
|
@@ -39,21 +39,40 @@ class MLXInference(ModelInference):
|
|
39
39
|
def process_response(self, output_text):
|
40
40
|
"""
|
41
41
|
Process and clean the model's raw output to format as JSON.
|
42
|
-
|
43
|
-
:param output_text: Raw output text from the model.
|
44
|
-
:return: A formatted JSON string or the original text in case of errors.
|
45
42
|
"""
|
46
43
|
try:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
.
|
51
|
-
|
52
|
-
|
53
|
-
|
44
|
+
# Check if we have markdown code block markers
|
45
|
+
if "```" in output_text:
|
46
|
+
# Handle markdown-formatted output
|
47
|
+
json_start = output_text.find("```json")
|
48
|
+
if json_start != -1:
|
49
|
+
# Extract content between ```json and ```
|
50
|
+
content = output_text[json_start + 7:]
|
51
|
+
json_end = content.rfind("```")
|
52
|
+
if json_end != -1:
|
53
|
+
content = content[:json_end].strip()
|
54
|
+
formatted_json = json.loads(content)
|
55
|
+
return json.dumps(formatted_json, indent=2)
|
56
|
+
|
57
|
+
# Handle raw JSON (no markdown formatting)
|
58
|
+
# First try to find JSON array or object patterns
|
59
|
+
for pattern in [r'\[\s*\{.*\}\s*\]', r'\{.*\}']:
|
60
|
+
import re
|
61
|
+
matches = re.search(pattern, output_text, re.DOTALL)
|
62
|
+
if matches:
|
63
|
+
potential_json = matches.group(0)
|
64
|
+
try:
|
65
|
+
formatted_json = json.loads(potential_json)
|
66
|
+
return json.dumps(formatted_json, indent=2)
|
67
|
+
except:
|
68
|
+
pass
|
69
|
+
|
70
|
+
# Last resort: try to parse the whole text as JSON
|
71
|
+
formatted_json = json.loads(output_text.strip())
|
54
72
|
return json.dumps(formatted_json, indent=2)
|
55
|
-
|
56
|
-
|
73
|
+
|
74
|
+
except Exception as e:
|
75
|
+
print(f"Failed to parse JSON: {e}")
|
57
76
|
return output_text
|
58
77
|
|
59
78
|
|
@@ -102,10 +121,13 @@ class MLXInference(ModelInference):
|
|
102
121
|
image, width, height = self.load_image_data(file_path)
|
103
122
|
|
104
123
|
# Prepare messages for the chat model
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
124
|
+
if "mistral" in self.model_name.lower():
|
125
|
+
messages = input_data[0]["text_input"]
|
126
|
+
else:
|
127
|
+
messages = [
|
128
|
+
{"role": "system", "content": "You are an expert at extracting structured text from image documents."},
|
129
|
+
{"role": "user", "content": input_data[0]["text_input"]},
|
130
|
+
]
|
109
131
|
|
110
132
|
# Generate and process response
|
111
133
|
prompt = apply_chat_template(processor, config, messages) # Assuming defined
|
@@ -1,15 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.5.5
|
3
|
+
Version: 1.0.2
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
7
7
|
Author-email: andrejus.baranovskis@gmail.com
|
8
|
-
License: UNKNOWN
|
9
8
|
Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
10
9
|
Project-URL: Repository, https://github.com/katanaml/sparrow
|
11
10
|
Keywords: llm,vllm,ocr,vision
|
12
|
-
Platform: UNKNOWN
|
13
11
|
Classifier: Operating System :: OS Independent
|
14
12
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
15
13
|
Classifier: Topic :: Software Development
|
@@ -17,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
|
|
17
15
|
Requires-Python: >=3.10
|
18
16
|
Description-Content-Type: text/markdown
|
19
17
|
Requires-Dist: rich
|
20
|
-
Requires-Dist: transformers>=4.
|
21
|
-
Requires-Dist: torchvision>=0.21.0
|
22
|
-
Requires-Dist: torch>=2.6.0
|
23
|
-
Requires-Dist: sentence-transformers>=
|
24
|
-
Requires-Dist: numpy>=2.
|
25
|
-
Requires-Dist: pypdf>=5.
|
26
|
-
Requires-Dist: gradio-client>=1.7.2
|
27
|
-
Requires-Dist: pdf2image>=1.17.0
|
28
|
-
Requires-Dist: mlx>=0.
|
29
|
-
Requires-Dist: mlx-vlm==0.1.
|
18
|
+
Requires-Dist: transformers >=4.50.1
|
19
|
+
Requires-Dist: torchvision >=0.21.0
|
20
|
+
Requires-Dist: torch >=2.6.0
|
21
|
+
Requires-Dist: sentence-transformers >=4.0.0
|
22
|
+
Requires-Dist: numpy >=2.2.4
|
23
|
+
Requires-Dist: pypdf >=5.4.0
|
24
|
+
Requires-Dist: gradio-client >=1.7.2
|
25
|
+
Requires-Dist: pdf2image >=1.17.0
|
26
|
+
Requires-Dist: mlx >=0.24.1 ; sys_platform == "darwin" and platform_machine == "arm64"
|
27
|
+
Requires-Dist: mlx-vlm ==0.1.21 ; sys_platform == "darwin" and platform_machine == "arm64"
|
30
28
|
|
31
29
|
# Sparrow Parse
|
32
30
|
|
@@ -187,5 +185,3 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
|
|
187
185
|
## License
|
188
186
|
|
189
187
|
Licensed under the GPL 3.0. Copyright 2020-2025 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
|
190
|
-
|
191
|
-
|
@@ -1,8 +1,8 @@
|
|
1
|
-
sparrow_parse/__init__.py,sha256=
|
1
|
+
sparrow_parse/__init__.py,sha256=C8nyPP5-54GgYCcP38Lbel_pRimOW-Ra4bw6Vzp2lmE,21
|
2
2
|
sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
|
3
|
-
sparrow_parse/text_extraction.py,sha256=
|
3
|
+
sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
|
4
4
|
sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
sparrow_parse/extractors/vllm_extractor.py,sha256=
|
5
|
+
sparrow_parse/extractors/vllm_extractor.py,sha256=uRSXzCQzjXujg1n1ozDitSPQoCfO435Nog7yO1IxWiU,9874
|
6
6
|
sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
|
8
8
|
sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
|
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
|
|
13
13
|
sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
|
14
14
|
sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
|
15
15
|
sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
|
16
|
-
sparrow_parse/vllm/mlx_inference.py,sha256=
|
17
|
-
sparrow_parse-0.
|
18
|
-
sparrow_parse-0.
|
19
|
-
sparrow_parse-0.
|
20
|
-
sparrow_parse-0.
|
21
|
-
sparrow_parse-0.
|
16
|
+
sparrow_parse/vllm/mlx_inference.py,sha256=KjAftUIAWxYfctE3n1BKXA8jETM4WT3ESyx97eMA_8U,5954
|
17
|
+
sparrow_parse-1.0.2.dist-info/METADATA,sha256=K4XNgj-PpegO8aLAe32aOZ3D8kh6lnMX0po2wXTxn-w,7229
|
18
|
+
sparrow_parse-1.0.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
19
|
+
sparrow_parse-1.0.2.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
|
20
|
+
sparrow_parse-1.0.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
|
21
|
+
sparrow_parse-1.0.2.dist-info/RECORD,,
|
File without changes
|