sparrow-parse 1.0.2-py3-none-any.whl → 1.0.3-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
- __version__ = '1.0.2'
+ __version__ = '1.0.3'
sparrow_parse/extractors/vllm_extractor.py CHANGED
@@ -25,11 +25,21 @@ class VLLMExtractor(object):
          if debug:
              print("Input data:", input_data)
 
+         # Handle both missing file_path and file_path=None as text-only inference
+         is_text_only = "file_path" not in input_data[0] or input_data[0]["file_path"] is None
+
+         if is_text_only:
+             # Ensure file_path exists and is None for consistency
+             input_data[0]["file_path"] = None
+             results = model_inference_instance.inference(input_data)
+             return results, 0
+
+         # Document data extraction inference (file_path exists and is not None)
          file_path = input_data[0]["file_path"]
          if self.is_pdf(file_path):
              return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
-
-         return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
+         else:
+             return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
 
 
      def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
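With this change a single entry point covers both chat-style and document workloads. A minimal sketch of the two input shapes the new dispatch distinguishes; the commented-out call assumes a model_inference_instance built elsewhere, e.g. via the package's inference factory:

from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

extractor = VLLMExtractor()

# Text-only inference: "file_path" omitted or explicitly None.
# run_inference() normalizes the entry and returns (results, 0):
# zero pages, since no document was processed.
text_query = [{"text_input": "Summarize the key fields on an invoice."}]

# Document inference: "file_path" set; PDFs are routed to _process_pdf,
# everything else to _process_non_pdf.
doc_query = [{"file_path": "invoice.png", "text_input": "Extract all table data."}]

# results, num_pages = extractor.run_inference(model_inference_instance, text_query)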
@@ -224,6 +234,13 @@ if __name__ == "__main__":
      # }
      # ]
      #
+     # # input_data = [
+     # #     {
+     # #         "file_path": None,
+     # #         "text_input": "why earth is spinning around the sun?"
+     # #     }
+     # # ]
+     #
      # # Now you can run inference without knowing which implementation is used
      # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
      #                                                    generic_query=False,
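Uncommented and trimmed, the new text-only flow from the demo would look roughly like this. The factory config keys and the model id are assumptions based on the package README, not verified against this release:

from sparrow_parse.vllm.inference_factory import InferenceFactory
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

# Assumed config shape; "mlx" should select MLXInference on Apple Silicon
config = {
    "method": "mlx",
    "model_name": "mlx-community/Qwen2.5-VL-7B-Instruct-8bit",  # hypothetical model id
}
model_inference_instance = InferenceFactory(config).get_inference_instance()

extractor = VLLMExtractor()
input_data = [{"file_path": None, "text_input": "why earth is spinning around the sun?"}]

# num_pages is 0 on the text-only path introduced in this release
results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, debug=True)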
sparrow_parse/vllm/mlx_inference.py CHANGED
@@ -106,31 +106,70 @@ class MLXInference(ModelInference):
          :param mode: Optional mode for inference ("static" for simple JSON output).
          :return: List of processed model responses.
          """
+         # Handle static mode
          if mode == "static":
              return [self.get_simple_json()]
 
          # Load the model and processor
          model, processor = self._load_model_and_processor(self.model_name)
          config = model.config
+
+         # Determine if we're doing text-only or image-based inference
+         is_text_only = input_data[0].get("file_path") is None
+
+         if is_text_only:
+             # Text-only inference
+             messages = input_data[0]["text_input"]
+             response = self._generate_text_response(model, processor, config, messages)
+             results = [self.process_response(response)]
+             print("Agent inference completed successfully")
+         else:
+             # Image-based inference
+             file_paths = self._extract_file_paths(input_data)
+             results = self._process_images(model, processor, config, file_paths, input_data)
+
+         return results
 
-         # Prepare absolute file paths
-         file_paths = self._extract_file_paths(input_data)
-
+     def _generate_text_response(self, model, processor, config, messages):
+         """
+         Generate a text response for text-only inputs.
+
+         :param model: The loaded model
+         :param processor: The loaded processor
+         :param config: Model configuration
+         :param messages: Input messages
+         :return: Generated response
+         """
+         prompt = apply_chat_template(processor, config, messages)
+         return generate(
+             model,
+             processor,
+             prompt,
+             max_tokens=4000,
+             temperature=0.0,
+             verbose=False
+         )
+
+     def _process_images(self, model, processor, config, file_paths, input_data):
+         """
+         Process images and generate responses for each.
+
+         :param model: The loaded model
+         :param processor: The loaded processor
+         :param config: Model configuration
+         :param file_paths: List of image file paths
+         :param input_data: Original input data
+         :return: List of processed responses
+         """
          results = []
          for file_path in file_paths:
              image, width, height = self.load_image_data(file_path)
-
-             # Prepare messages for the chat model
-             if "mistral" in self.model_name.lower():
-                 messages = input_data[0]["text_input"]
-             else:
-                 messages = [
-                     {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
-                     {"role": "user", "content": input_data[0]["text_input"]},
-                 ]
-
+
+             # Prepare messages based on model type
+             messages = self._prepare_messages(input_data, file_path)
+
              # Generate and process response
-             prompt = apply_chat_template(processor, config, messages)  # Assuming defined
+             prompt = apply_chat_template(processor, config, messages)
              response = generate(
                  model,
                  processor,
@@ -142,11 +181,26 @@ class MLXInference(ModelInference):
                  verbose=False
              )
              results.append(self.process_response(response))
-
-             print("Inference completed successfully for: ", file_path)
-
+             print(f"Inference completed successfully for: {file_path}")
+
          return results
 
+     def _prepare_messages(self, input_data, file_path):
+         """
+         Prepare the appropriate messages based on the model type.
+
+         :param input_data: Original input data
+         :param file_path: Current file path being processed
+         :return: Properly formatted messages
+         """
+         if "mistral" in self.model_name.lower():
+             return input_data[0]["text_input"]
+         else:
+             return [
+                 {"role": "system", "content": "You are an expert at extracting structured text from image documents."},
+                 {"role": "user", "content": input_data[0]["text_input"]},
+             ]
+
      @staticmethod
      def _extract_file_paths(input_data):
          """
sparrow_parse-1.0.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 1.0.2
+ Version: 1.0.3
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
@@ -15,16 +15,16 @@ Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  Requires-Dist: rich
- Requires-Dist: transformers >=4.50.1
+ Requires-Dist: transformers >=4.51.3
  Requires-Dist: torchvision >=0.21.0
  Requires-Dist: torch >=2.6.0
- Requires-Dist: sentence-transformers >=4.0.0
+ Requires-Dist: sentence-transformers >=4.1.0
  Requires-Dist: numpy >=2.2.4
  Requires-Dist: pypdf >=5.4.0
  Requires-Dist: gradio-client >=1.7.2
  Requires-Dist: pdf2image >=1.17.0
- Requires-Dist: mlx >=0.24.1 ; sys_platform == "darwin" and platform_machine == "arm64"
- Requires-Dist: mlx-vlm ==0.1.21 ; sys_platform == "darwin" and platform_machine == "arm64"
+ Requires-Dist: mlx >=0.25.0 ; sys_platform == "darwin" and platform_machine == "arm64"
+ Requires-Dist: mlx-vlm ==0.1.23 ; sys_platform == "darwin" and platform_machine == "arm64"
 
  # Sparrow Parse
 
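Both mlx pins carry an environment marker, so pip installs them only on Apple Silicon macOS; other platforms rely on the torch-based dependencies alone. A quick way to evaluate such a marker yourself, using the packaging library (not part of sparrow-parse):

from packaging.markers import Marker

marker = Marker('sys_platform == "darwin" and platform_machine == "arm64"')
# True only on Apple Silicon macOS; elsewhere pip skips mlx and mlx-vlm
print(marker.evaluate())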
sparrow_parse-1.0.3.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
- sparrow_parse/__init__.py,sha256=C8nyPP5-54GgYCcP38Lbel_pRimOW-Ra4bw6Vzp2lmE,21
+ sparrow_parse/__init__.py,sha256=MpVHFFoITiYyPltTb_qFrdeX2entdTm4x0PczXi3txY,21
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
  sparrow_parse/text_extraction.py,sha256=lirPpvz8tnwCMGmoHPK94-vCviybuRyQM-mpvhtp3uY,1124
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/extractors/vllm_extractor.py,sha256=uRSXzCQzjXujg1n1ozDitSPQoCfO435Nog7yO1IxWiU,9874
+ sparrow_parse/extractors/vllm_extractor.py,sha256=ZxYiSrdKWLcBXn4LUuvEcDH0q_Ua8xTzqmEF15puP08,10557
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
  sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
@@ -13,9 +13,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
  sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
  sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
- sparrow_parse/vllm/mlx_inference.py,sha256=KjAftUIAWxYfctE3n1BKXA8jETM4WT3ESyx97eMA_8U,5954
- sparrow_parse-1.0.2.dist-info/METADATA,sha256=K4XNgj-PpegO8aLAe32aOZ3D8kh6lnMX0po2wXTxn-w,7229
- sparrow_parse-1.0.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
- sparrow_parse-1.0.2.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
- sparrow_parse-1.0.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
- sparrow_parse-1.0.2.dist-info/RECORD,,
+ sparrow_parse/vllm/mlx_inference.py,sha256=vqIkfTd5rP8bnZ8K_CGVEWe_G3E4i3rwN9MfLBDiE3c,8000
+ sparrow_parse-1.0.3.dist-info/METADATA,sha256=dIGBhBhtR5rSKj4RbT1PhyrWxKUVUq5AxbJ33FsKNlE,7229
+ sparrow_parse-1.0.3.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+ sparrow_parse-1.0.3.dist-info/entry_points.txt,sha256=HV5nnQVtr2m-kn6hzY_ynp0zugNCcGovbmnfmQgOyhw,53
+ sparrow_parse-1.0.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+ sparrow_parse-1.0.3.dist-info/RECORD,,
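Every RECORD line is path,sha256=<digest>,<size>, where the digest is the urlsafe base64 of the file's SHA-256 with trailing "=" padding stripped (PEP 427 / PEP 376). A small sketch to recompute the digest for an installed file and compare it against its RECORD entry:

import base64
import hashlib

def record_digest(path: str) -> str:
    """Compute the sha256=... value that a wheel RECORD uses for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For the 1.0.3 wheel, record_digest(".../sparrow_parse/__init__.py") should
# return "sha256=MpVHFFoITiYyPltTb_qFrdeX2entdTm4x0PczXi3txY" per the RECORD above.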