sparrow-parse 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.4.1'
1
+ __version__ = '0.4.3'
@@ -54,7 +54,7 @@ class VLLMExtractor(object):
54
54
  """
55
55
  file_path = input_data[0]["file_path"]
56
56
  if tables_only:
57
- return [self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir)], 1
57
+ return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
58
58
  else:
59
59
  input_data[0]["file_path"] = [file_path]
60
60
  results = model_inference_instance.inference(input_data)
@@ -85,7 +85,8 @@ class VLLMExtractor(object):
85
85
  tables_result = self._extract_tables(
86
86
  model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
87
87
  )
88
- results_array.append(tables_result)
88
+ # Since _extract_tables returns a list with one JSON string, unpack it
89
+ results_array.extend(tables_result) # Unpack the single JSON string
89
90
  else:
90
91
  if debug:
91
92
  print(f"Processing {len(output_files)} pages for inference at once.")
@@ -118,7 +119,15 @@ class VLLMExtractor(object):
118
119
  results_array.append(result)
119
120
 
120
121
  shutil.rmtree(temp_dir, ignore_errors=True)
121
- return json.dumps(results_array, indent=4)
122
+
123
+ # Merge results_array elements into a single JSON structure
124
+ merged_results = {"page_tables": results_array}
125
+
126
+ # Format the merged results as a JSON string with indentation
127
+ formatted_results = json.dumps(merged_results, indent=4)
128
+
129
+ # Return the formatted JSON string wrapped in a list
130
+ return [formatted_results]
122
131
 
123
132
 
124
133
  @staticmethod
@@ -166,7 +175,7 @@ if __name__ == "__main__":
166
175
  # ]
167
176
  #
168
177
  # # Now you can run inference without knowing which implementation is used
169
- # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
178
+ # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
170
179
  # generic_query=False,
171
180
  # debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
172
181
  # debug=True,
@@ -14,14 +14,12 @@ class MLXInference(ModelInference):
14
14
 
15
15
  def __init__(self, model_name):
16
16
  """
17
- Initialize the inference class with the given model name and load the model once.
17
+ Initialize the inference class with the given model name.
18
18
 
19
19
  :param model_name: Name of the model to load.
20
20
  """
21
- self.model, self.processor = self._load_model_and_processor(model_name)
22
- self.config = self.model.config
23
-
24
- print(f"Loaded model: {model_name}")
21
+ self.model_name = model_name
22
+ print(f"MLXInference initialized with model: {model_name}")
25
23
 
26
24
 
27
25
  @staticmethod
@@ -89,6 +87,10 @@ class MLXInference(ModelInference):
89
87
  if mode == "static":
90
88
  return [self.get_simple_json()]
91
89
 
90
+ # Load the model and processor
91
+ model, processor = self._load_model_and_processor(self.model_name)
92
+ config = model.config
93
+
92
94
  # Prepare absolute file paths
93
95
  file_paths = self._extract_file_paths(input_data)
94
96
 
@@ -103,10 +105,10 @@ class MLXInference(ModelInference):
103
105
  ]
104
106
 
105
107
  # Generate and process response
106
- prompt = apply_chat_template(self.processor, self.config, messages) # Assuming defined
108
+ prompt = apply_chat_template(processor, config, messages) # Assuming defined
107
109
  response = generate(
108
- self.model,
109
- self.processor,
110
+ model,
111
+ processor,
110
112
  image,
111
113
  prompt,
112
114
  resize_shape=(width, height),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -1,7 +1,7 @@
1
- sparrow_parse/__init__.py,sha256=8yPI9dbwQUYqhMtA3RfAi5yJOhZBnz-g8966ssrYXiU,21
1
+ sparrow_parse/__init__.py,sha256=udnlByVnFcZDwWir50pEbTU0bIwgBrpNtAiVExFEzu0,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
3
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/extractors/vllm_extractor.py,sha256=QIg7AMCfw81YHQN6CutF2ipV_DZ3txSGduPIcvQRmiA,7439
4
+ sparrow_parse/extractors/vllm_extractor.py,sha256=ybWpRpDH0YHoYpHkjIJtm7DQoHJBKNsirK2YIAlMvGo,7863
5
5
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
7
7
  sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -11,9 +11,9 @@ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAg
11
11
  sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
12
12
  sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
13
13
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
14
- sparrow_parse/vllm/mlx_inference.py,sha256=xR40qwjIR0HvrN8x58oOq6F4r1hEANRB-9kcokUQHHU,4748
15
- sparrow_parse-0.4.1.dist-info/METADATA,sha256=4rmJ1CURKtyTs-ZH1eyHn_VptHosJZwhQFB5Fssr5e0,6432
16
- sparrow_parse-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
- sparrow_parse-0.4.1.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
- sparrow_parse-0.4.1.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
- sparrow_parse-0.4.1.dist-info/RECORD,,
14
+ sparrow_parse/vllm/mlx_inference.py,sha256=cx-PLXf1t8ro50YALddj70FiR7s0gk_Ddp-I9XlPQQU,4788
15
+ sparrow_parse-0.4.3.dist-info/METADATA,sha256=W7zeOHa09rgn-58aIdTkNOSqBLgpziDF7sZ_059jaoo,6432
16
+ sparrow_parse-0.4.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
+ sparrow_parse-0.4.3.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
+ sparrow_parse-0.4.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
+ sparrow_parse-0.4.3.dist-info/RECORD,,