sparrow-parse 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.3.6'
1
+ __version__ = '0.3.7'
@@ -30,21 +30,16 @@ class VLLMExtractor(object):
30
30
  debug_dir,
31
31
  True)
32
32
 
33
- # Run inference on each page
34
- for page_num, output_file in enumerate(output_files):
35
- input_data[0]["file_path"] = output_file
36
- if debug:
37
- print(f"Running inference on page {page_num + 1}...")
33
+ input_data[0]["file_path"] = output_files
38
34
 
39
- # Run inference on the page
40
- result = model_inference_instance.inference(input_data, mode)
41
- results_array.append(result)
35
+ # Run inference on the page
36
+ results_array = model_inference_instance.inference(input_data, mode)
42
37
 
43
38
  shutil.rmtree(temp_dir, ignore_errors=True)
44
39
  return results_array, num_pages
45
40
 
46
- result = model_inference_instance.inference(input_data)
47
- results_array.append(result)
41
+ input_data[0]["file_path"] = [input_data[0]["file_path"]]
42
+ results_array = model_inference_instance.inference(input_data)
48
43
 
49
44
  return results_array, 1
50
45
 
@@ -80,7 +75,7 @@ if __name__ == "__main__":
80
75
  # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
81
76
  # debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
82
77
  # debug=True,
83
- # mode="static")
78
+ # mode=None)
84
79
  #
85
80
  # for i, result in enumerate(results_array):
86
81
  # print(f"Result for page {i + 1}:", result)
@@ -1,6 +1,8 @@
1
1
  from gradio_client import Client, handle_file
2
2
  from sparrow_parse.vllm.inference_base import ModelInference
3
3
  import json
4
+ import os
5
+ import ast
4
6
 
5
7
 
6
8
  class HuggingFaceInference(ModelInference):
@@ -27,14 +29,32 @@ class HuggingFaceInference(ModelInference):
27
29
  def inference(self, input_data, mode=None):
28
30
  if mode == "static":
29
31
  simple_json = self.get_simple_json()
30
- return simple_json
32
+ return [simple_json]
31
33
 
32
34
  client = Client(self.hf_space, hf_token=self.hf_token)
33
35
 
34
- result = client.predict(
35
- image=handle_file(input_data[0]["file_path"]),
36
- text_input=input_data[0]["text_input"],
37
- api_name="/run_inference"
36
+ # Extract and prepare the absolute paths for all file paths in input_data
37
+ file_paths = [
38
+ os.path.abspath(file_path)
39
+ for data in input_data
40
+ for file_path in data["file_path"]
41
+ ]
42
+
43
+ # Validate file existence and prepare files for the Gradio client
44
+ image_files = [handle_file(path) for path in file_paths if os.path.exists(path)]
45
+
46
+ results = client.predict(
47
+ input_imgs=image_files,
48
+ text_input=input_data[0]["text_input"], # Single shared text input for all images
49
+ api_name="/run_inference" # Specify the Gradio API endpoint
38
50
  )
39
51
 
40
- return self.process_response(result)
52
+ # Convert the string into a Python list
53
+ parsed_results = ast.literal_eval(results)
54
+
55
+ results_array = []
56
+ for page_output in parsed_results:
57
+ page_result = self.process_response(page_output)
58
+ results_array.append(page_result)
59
+
60
+ return results_array
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.3.6
3
+ Version: 0.3.7
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -73,7 +73,7 @@ input_data = [
73
73
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
74
74
  debug_dir="/data/",
75
75
  debug=True,
76
- mode="static")
76
+ mode=None)
77
77
 
78
78
  for i, result in enumerate(results_array):
79
79
  print(f"Result for page {i + 1}:", result)
@@ -82,6 +82,10 @@ print(f"Number of pages: {num_pages}")
82
82
 
83
83
  Use `mode="static"` if you want to simulate an LLM call without executing the LLM backend.
84
84
 
85
+ Method `run_inference` returns the results and the number of pages processed.
86
+
87
+ Note: the GPU backend `katanaml/sparrow-qwen2-vl-7b` is private. To run the command below, you need to create your own backend on a Hugging Face Space using the [code](https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse/sparrow_parse/vllm/infra/qwen2_vl_7b) from Sparrow Parse.
88
+
85
89
  ## PDF pre-processing
86
90
 
87
91
  ```
@@ -1,18 +1,18 @@
1
- sparrow_parse/__init__.py,sha256=IbpUPwvtjLOqowcOFsWQ6LKq-FH6cI19IpvfQlxufq0,21
1
+ sparrow_parse/__init__.py,sha256=V3RDzgFfGW_qKkRklGT6eISHLybQsgfScnd9neXG7Cs,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
3
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/extractors/vllm_extractor.py,sha256=Wo8sOvsQt6YHd7bvB_DB8MUa71FioO9xcQOWA3PQ6eU,3415
4
+ sparrow_parse/extractors/vllm_extractor.py,sha256=mBPgeyMuHUa6jN_OZLVE-426tD4zYnFT61oxebk7XJc,3191
5
5
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
7
7
  sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
9
9
  sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- sparrow_parse/vllm/huggingface_inference.py,sha256=nalmPJFfrFlRnfd4yTq4HvIwDvIXjhKUlEyZ6gzMqe0,1239
10
+ sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAgFu0XjCbaLCNVyM,1980
11
11
  sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
12
12
  sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
13
13
  sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
14
- sparrow_parse-0.3.6.dist-info/METADATA,sha256=ANS8eWCx07bQOOFFnJUKwsiPo-ZT42b8DvMwP9o-jf4,5827
15
- sparrow_parse-0.3.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
16
- sparrow_parse-0.3.6.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
17
- sparrow_parse-0.3.6.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
18
- sparrow_parse-0.3.6.dist-info/RECORD,,
14
+ sparrow_parse-0.3.7.dist-info/METADATA,sha256=ErE4fDTkcyOrVbgpc6x9AO9cU3Gf8HbEGsbKmK-F0RA,6187
15
+ sparrow_parse-0.3.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
16
+ sparrow_parse-0.3.7.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
17
+ sparrow_parse-0.3.7.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
18
+ sparrow_parse-0.3.7.dist-info/RECORD,,