sparrow-parse 0.5.5__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.5.5'
1
+ __version__ = '1.0.1'
@@ -198,7 +198,7 @@ if __name__ == "__main__":
198
198
  # # export HF_TOKEN="hf_"
199
199
  # config = {
200
200
  # "method": "mlx", # Could be 'huggingface', 'mlx' or 'local_gpu'
201
- # "model_name": "mlx-community/Qwen2.5-VL-7B-Instruct-8bit",
201
+ # "model_name": "mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit",
202
202
  # # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
203
203
  # # "hf_token": os.getenv('HF_TOKEN'),
204
204
  # # Additional fields for local GPU inference
@@ -211,8 +211,8 @@ if __name__ == "__main__":
211
211
  #
212
212
  # input_data = [
213
213
  # {
214
- # "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png",
215
- # "text_input": "retrieve document data. return response in JSON format"
214
+ # "file_path": "sparrow_parse/images/bonds_table.png",
215
+ # "text_input": "retrieve all data. return response in JSON format"
216
216
  # }
217
217
  # ]
218
218
  #
@@ -226,4 +226,5 @@ if __name__ == "__main__":
226
226
  #
227
227
  # for i, result in enumerate(results_array):
228
228
  # print(f"Result for page {i + 1}:", result)
229
- # print(f"Number of pages: {num_pages}")
229
+ # print(f"Number of pages: {num_pages}")
230
+
@@ -1,30 +1,35 @@
1
1
  from mlx_vlm import load, apply_chat_template, generate
2
2
  from mlx_vlm.utils import load_image
3
3
 
4
- # For test purposes, we will use a sample image
5
4
 
6
5
  # Load model and processor
7
- qwen_vl_model, qwen_vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
8
- qwen_vl_config = qwen_vl_model.config
6
+ # vl_model, vl_processor = load("mlx-community/Mistral-Small-3.1-24B-Instruct-2503-8bit")
7
+ vl_model, vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
8
+ vl_config = vl_model.config
9
9
 
10
- image = load_image("images/graph.png")
10
+ image = load_image("images/bonds_table.png")
11
11
 
12
12
  messages = [
13
13
  {"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
14
- {"role": "user", "content": "Extract the names, labels and y coordinates from the image."}
14
+ {"role": "user", "content": "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"}
15
15
  ]
16
16
 
17
+ # message = "retrieve all data. return response in JSON format"
18
+ # message = "retrieve [{\"instrument_name\":\"str\", \"valuation\":\"int\"}]. return response in JSON format"
19
+
17
20
  # Apply chat template
18
- prompt = apply_chat_template(qwen_vl_processor, qwen_vl_config, messages)
21
+ prompt = apply_chat_template(vl_processor, vl_config, messages)
22
+ # prompt = apply_chat_template(vl_processor, vl_config, message)
19
23
 
20
24
  # Generate text
21
- qwen_vl_output = generate(
22
- qwen_vl_model,
23
- qwen_vl_processor,
25
+ vl_output = generate(
26
+ vl_model,
27
+ vl_processor,
24
28
  prompt,
25
29
  image,
26
30
  max_tokens=1000,
27
- temperature=0.7,
31
+ temperature=0,
32
+ verbose=False
28
33
  )
29
34
 
30
- print(qwen_vl_output)
35
+ print(vl_output)
@@ -39,21 +39,40 @@ class MLXInference(ModelInference):
39
39
  def process_response(self, output_text):
40
40
  """
41
41
  Process and clean the model's raw output to format as JSON.
42
-
43
- :param output_text: Raw output text from the model.
44
- :return: A formatted JSON string or the original text in case of errors.
45
42
  """
46
43
  try:
47
- cleaned_text = (
48
- output_text.strip("[]'")
49
- .replace("```json\n", "")
50
- .replace("\n```", "")
51
- .replace("'", "")
52
- )
53
- formatted_json = json.loads(cleaned_text)
44
+ # Check if we have markdown code block markers
45
+ if "```" in output_text:
46
+ # Handle markdown-formatted output
47
+ json_start = output_text.find("```json")
48
+ if json_start != -1:
49
+ # Extract content between ```json and ```
50
+ content = output_text[json_start + 7:]
51
+ json_end = content.rfind("```")
52
+ if json_end != -1:
53
+ content = content[:json_end].strip()
54
+ formatted_json = json.loads(content)
55
+ return json.dumps(formatted_json, indent=2)
56
+
57
+ # Handle raw JSON (no markdown formatting)
58
+ # First try to find JSON array or object patterns
59
+ for pattern in [r'\[\s*\{.*\}\s*\]', r'\{.*\}']:
60
+ import re
61
+ matches = re.search(pattern, output_text, re.DOTALL)
62
+ if matches:
63
+ potential_json = matches.group(0)
64
+ try:
65
+ formatted_json = json.loads(potential_json)
66
+ return json.dumps(formatted_json, indent=2)
67
+ except:
68
+ pass
69
+
70
+ # Last resort: try to parse the whole text as JSON
71
+ formatted_json = json.loads(output_text.strip())
54
72
  return json.dumps(formatted_json, indent=2)
55
- except json.JSONDecodeError as e:
56
- print(f"Failed to parse JSON in MLX inference backend: {e}")
73
+
74
+ except Exception as e:
75
+ print(f"Failed to parse JSON: {e}")
57
76
  return output_text
58
77
 
59
78
 
@@ -102,10 +121,13 @@ class MLXInference(ModelInference):
102
121
  image, width, height = self.load_image_data(file_path)
103
122
 
104
123
  # Prepare messages for the chat model
105
- messages = [
106
- {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
107
- {"role": "user", "content": input_data[0]["text_input"]},
108
- ]
124
+ if "mistral" in self.model_name.lower():
125
+ messages = input_data[0]["text_input"]
126
+ else:
127
+ messages = [
128
+ {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
129
+ {"role": "user", "content": input_data[0]["text_input"]},
130
+ ]
109
131
 
110
132
  # Generate and process response
111
133
  prompt = apply_chat_template(processor, config, messages) # Assuming defined
@@ -1,15 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.5.5
3
+ Version: 1.0.1
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
7
7
  Author-email: andrejus.baranovskis@gmail.com
8
- License: UNKNOWN
9
8
  Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
9
  Project-URL: Repository, https://github.com/katanaml/sparrow
11
10
  Keywords: llm,vllm,ocr,vision
12
- Platform: UNKNOWN
13
11
  Classifier: Operating System :: OS Independent
14
12
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
15
13
  Classifier: Topic :: Software Development
@@ -17,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
17
15
  Requires-Python: >=3.10
18
16
  Description-Content-Type: text/markdown
19
17
  Requires-Dist: rich
20
- Requires-Dist: transformers>=4.49.0
21
- Requires-Dist: torchvision>=0.21.0
22
- Requires-Dist: torch>=2.6.0
23
- Requires-Dist: sentence-transformers>=3.3.1
24
- Requires-Dist: numpy>=2.1.3
25
- Requires-Dist: pypdf>=5.2.0
26
- Requires-Dist: gradio-client>=1.7.2
27
- Requires-Dist: pdf2image>=1.17.0
28
- Requires-Dist: mlx>=0.23.1; sys_platform == "darwin" and platform_machine == "arm64"
29
- Requires-Dist: mlx-vlm==0.1.14; sys_platform == "darwin" and platform_machine == "arm64"
18
+ Requires-Dist: transformers >=4.50.1
19
+ Requires-Dist: torchvision >=0.21.0
20
+ Requires-Dist: torch >=2.6.0
21
+ Requires-Dist: sentence-transformers >=4.0.0
22
+ Requires-Dist: numpy >=2.2.4
23
+ Requires-Dist: pypdf >=5.4.0
24
+ Requires-Dist: gradio-client >=1.7.2
25
+ Requires-Dist: pdf2image >=1.17.0
26
+ Requires-Dist: mlx >=0.24.1 ; sys_platform == "darwin" and platform_machine == "arm64"
27
+ Requires-Dist: mlx-vlm ==0.1.21 ; sys_platform == "darwin" and platform_machine == "arm64"
30
28
 
31
29
  # Sparrow Parse
32
30
 
@@ -187,5 +185,3 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
187
185
  ## License
188
186
 
189
187
  Licensed under the GPL 3.0. Copyright 2020-2025 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
190
-
191
-
@@ -1,8 +1,8 @@
1
- sparrow_parse/__init__.py,sha256=nrKeY2xA6SXRPdgHDxi2HLkFNpXRuW6MkqwC0reZpy8,21
1
+ sparrow_parse/__init__.py,sha256=bMIenWosteoeUs51RbaWVZetIuzRhWymuyy-n0rfK0I,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/text_extraction.py,sha256=JtUU7swvV12xBai5S9ICxWWWrUlkpZTZqvUnbz1h5Mk,834
3
+ sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
4
4
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- sparrow_parse/extractors/vllm_extractor.py,sha256=Cf2sVgxDExj2ud4G6z9JnirVclTgPIEe9YSoCfTkW4k,9563
5
+ sparrow_parse/extractors/vllm_extractor.py,sha256=MYaT8ITaxEDH6TpSwMZl6WbOiLYqyFN_VQsCmVe4YkI,9535
6
6
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
8
8
  sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
13
13
  sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
14
14
  sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
15
15
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
16
- sparrow_parse/vllm/mlx_inference.py,sha256=MUuW56f-aKnVmeMAATxKLxsovEMmp1qlgtlmW8J2C7M,4899
17
- sparrow_parse-0.5.5.dist-info/METADATA,sha256=l9s3Vi-5KVtryw0a1Z7AXqQHIKuUMbPQu0XNeilRofU,7254
18
- sparrow_parse-0.5.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
19
- sparrow_parse-0.5.5.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
20
- sparrow_parse-0.5.5.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
21
- sparrow_parse-0.5.5.dist-info/RECORD,,
16
+ sparrow_parse/vllm/mlx_inference.py,sha256=KjAftUIAWxYfctE3n1BKXA8jETM4WT3ESyx97eMA_8U,5954
17
+ sparrow_parse-1.0.1.dist-info/METADATA,sha256=IADhZaH5VgE530Czablge_kn3mEAtlq8Zq1TubTkMv8,7229
18
+ sparrow_parse-1.0.1.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
19
+ sparrow_parse-1.0.1.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
20
+ sparrow_parse-1.0.1.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
21
+ sparrow_parse-1.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.1)
2
+ Generator: bdist_wheel (0.41.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,3 +1,2 @@
1
1
  [console_scripts]
2
2
  sparrow-parse = sparrow_parse:main
3
-