sparrow-parse 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.4.0'
1
+ __version__ = '0.4.2'
@@ -54,7 +54,7 @@ class VLLMExtractor(object):
54
54
  """
55
55
  file_path = input_data[0]["file_path"]
56
56
  if tables_only:
57
- return [self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir)], 1
57
+ return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
58
58
  else:
59
59
  input_data[0]["file_path"] = [file_path]
60
60
  results = model_inference_instance.inference(input_data)
@@ -85,7 +85,8 @@ class VLLMExtractor(object):
85
85
  tables_result = self._extract_tables(
86
86
  model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
87
87
  )
88
- results_array.append(tables_result)
88
+ # Since _extract_tables returns a list with one JSON string, unpack it
89
+ results_array.extend(tables_result) # Unpack the single JSON string
89
90
  else:
90
91
  if debug:
91
92
  print(f"Processing {len(output_files)} pages for inference at once.")
@@ -115,11 +116,18 @@ class VLLMExtractor(object):
115
116
 
116
117
  input_data[0]["file_path"] = [output_filename]
117
118
  result = self._run_model_inference(model_inference_instance, input_data)
118
- result = self.add_table_info_to_data(result, "table_nr", i + 1)
119
119
  results_array.append(result)
120
120
 
121
121
  shutil.rmtree(temp_dir, ignore_errors=True)
122
- return json.dumps(results_array, indent=4)
122
+
123
+ # Merge results_array elements into a single JSON structure
124
+ merged_results = {"page_tables": results_array}
125
+
126
+ # Format the merged results as a JSON string with indentation
127
+ formatted_results = json.dumps(merged_results, indent=4)
128
+
129
+ # Return the formatted JSON string wrapped in a list
130
+ return [formatted_results]
123
131
 
124
132
 
125
133
  @staticmethod
@@ -140,35 +148,6 @@ class VLLMExtractor(object):
140
148
  return file_path.lower().endswith('.pdf')
141
149
 
142
150
 
143
- @staticmethod
144
- def add_table_info_to_data(data: Union[Dict, List], key: str, message: Any) -> Dict:
145
- """
146
- Add a key-value pair to a dictionary or wrap a list in a dictionary.
147
- If a 'table' key exists, add or update the key-value pair inside it.
148
-
149
- Args:
150
- data (Union[Dict, List]): The input data (either a dictionary or list).
151
- key (str): The key to add.
152
- message (Any): The value to associate with the key.
153
-
154
- Returns:
155
- Dict: The modified data.
156
- """
157
- if isinstance(data, dict):
158
- if "table" in data and isinstance(data["table"], list):
159
- # Add or update the key-value pair in the existing structure
160
- data[key] = message
161
- else:
162
- # Wrap the dictionary inside a `table` key and include the additional key-value pair
163
- data = {"table": [data], key: message}
164
- elif isinstance(data, list):
165
- # Wrap the list in a dictionary with the additional key-value pair
166
- data = {"table": data, key: message}
167
- else:
168
- raise TypeError("Data must be a dictionary or a list.")
169
- return data
170
-
171
-
172
151
  if __name__ == "__main__":
173
152
  # run locally: python -m sparrow_parse.extractors.vllm_extractor
174
153
 
@@ -196,7 +175,7 @@ if __name__ == "__main__":
196
175
  # ]
197
176
  #
198
177
  # # Now you can run inference without knowing which implementation is used
199
- # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
178
+ # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
200
179
  # generic_query=False,
201
180
  # debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
202
181
  # debug=True,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -1,7 +1,7 @@
1
- sparrow_parse/__init__.py,sha256=DObMj8zITWgJRRICOQXNFEgLDtZ9uQZUVwbNAU-P3oc,21
1
+ sparrow_parse/__init__.py,sha256=ZQn_AcWYegaUtOl4-txMtrEFR2pE4wBpoPlmrITggnY,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
3
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/extractors/vllm_extractor.py,sha256=SCqxdr8V_cm0COfs0TelTcBXapVcz2WffhESJ1fry0g,8716
4
+ sparrow_parse/extractors/vllm_extractor.py,sha256=ybWpRpDH0YHoYpHkjIJtm7DQoHJBKNsirK2YIAlMvGo,7863
5
5
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
7
7
  sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,8 +12,8 @@ sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzK
12
12
  sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
13
13
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
14
14
  sparrow_parse/vllm/mlx_inference.py,sha256=xR40qwjIR0HvrN8x58oOq6F4r1hEANRB-9kcokUQHHU,4748
15
- sparrow_parse-0.4.0.dist-info/METADATA,sha256=IQqfUUKnpA0ystjBmrrpSWw4b1hDYnLO4sqKdoNYEHk,6432
16
- sparrow_parse-0.4.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
- sparrow_parse-0.4.0.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
- sparrow_parse-0.4.0.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
- sparrow_parse-0.4.0.dist-info/RECORD,,
15
+ sparrow_parse-0.4.2.dist-info/METADATA,sha256=JTSkKdB2X5o3tXovyjnLSEANrPnkvJLtTfeYH7PZRDw,6432
16
+ sparrow_parse-0.4.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
+ sparrow_parse-0.4.2.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
+ sparrow_parse-0.4.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
+ sparrow_parse-0.4.2.dist-info/RECORD,,