sparrow-parse 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.4.1'
1
+ __version__ = '0.4.2'
sparrow_parse/extractors/vllm_extractor.py CHANGED
@@ -54,7 +54,7 @@ class VLLMExtractor(object):
54
54
  """
55
55
  file_path = input_data[0]["file_path"]
56
56
  if tables_only:
57
- return [self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir)], 1
57
+ return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
58
58
  else:
59
59
  input_data[0]["file_path"] = [file_path]
60
60
  results = model_inference_instance.inference(input_data)
@@ -85,7 +85,8 @@ class VLLMExtractor(object):
85
85
  tables_result = self._extract_tables(
86
86
  model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
87
87
  )
88
- results_array.append(tables_result)
88
+ # Since _extract_tables returns a list with one JSON string, unpack it
89
+ results_array.extend(tables_result) # Unpack the single JSON string
89
90
  else:
90
91
  if debug:
91
92
  print(f"Processing {len(output_files)} pages for inference at once.")
@@ -118,7 +119,15 @@ class VLLMExtractor(object):
118
119
  results_array.append(result)
119
120
 
120
121
  shutil.rmtree(temp_dir, ignore_errors=True)
121
- return json.dumps(results_array, indent=4)
122
+
123
+ # Merge results_array elements into a single JSON structure
124
+ merged_results = {"page_tables": results_array}
125
+
126
+ # Format the merged results as a JSON string with indentation
127
+ formatted_results = json.dumps(merged_results, indent=4)
128
+
129
+ # Return the formatted JSON string wrapped in a list
130
+ return [formatted_results]
122
131
 
123
132
 
124
133
  @staticmethod
@@ -166,7 +175,7 @@ if __name__ == "__main__":
166
175
  # ]
167
176
  #
168
177
  # # Now you can run inference without knowing which implementation is used
169
- # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
178
+ # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
170
179
  # generic_query=False,
171
180
  # debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
172
181
  # debug=True,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -1,7 +1,7 @@
1
- sparrow_parse/__init__.py,sha256=8yPI9dbwQUYqhMtA3RfAi5yJOhZBnz-g8966ssrYXiU,21
1
+ sparrow_parse/__init__.py,sha256=ZQn_AcWYegaUtOl4-txMtrEFR2pE4wBpoPlmrITggnY,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
3
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/extractors/vllm_extractor.py,sha256=QIg7AMCfw81YHQN6CutF2ipV_DZ3txSGduPIcvQRmiA,7439
4
+ sparrow_parse/extractors/vllm_extractor.py,sha256=ybWpRpDH0YHoYpHkjIJtm7DQoHJBKNsirK2YIAlMvGo,7863
5
5
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
7
7
  sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,8 +12,8 @@ sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzK
12
12
  sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
13
13
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
14
14
  sparrow_parse/vllm/mlx_inference.py,sha256=xR40qwjIR0HvrN8x58oOq6F4r1hEANRB-9kcokUQHHU,4748
15
- sparrow_parse-0.4.1.dist-info/METADATA,sha256=4rmJ1CURKtyTs-ZH1eyHn_VptHosJZwhQFB5Fssr5e0,6432
16
- sparrow_parse-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
- sparrow_parse-0.4.1.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
- sparrow_parse-0.4.1.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
- sparrow_parse-0.4.1.dist-info/RECORD,,
15
+ sparrow_parse-0.4.2.dist-info/METADATA,sha256=JTSkKdB2X5o3tXovyjnLSEANrPnkvJLtTfeYH7PZRDw,6432
16
+ sparrow_parse-0.4.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
17
+ sparrow_parse-0.4.2.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
18
+ sparrow_parse-0.4.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
19
+ sparrow_parse-0.4.2.dist-info/RECORD,,