sparrow-parse 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
- __version__ = '0.3.4'
+ __version__ = '0.3.6'
sparrow_parse/extractors/vllm_extractor.py CHANGED
@@ -1,46 +1,87 @@
  from sparrow_parse.vllm.inference_factory import InferenceFactory
+ from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
  from rich import print
  import os
+ import shutil


  class VLLMExtractor(object):
      def __init__(self):
          pass

-     def run_inference(self, model_inference_instance, input_data, generic_query=False, debug=False):
+     def run_inference(self,
+                       model_inference_instance,
+                       input_data,
+                       generic_query=False,
+                       debug_dir=None,
+                       debug=False,
+                       mode=None):
          if generic_query:
              input_data[0]["text_input"] = "retrieve document data. return response in JSON format"

          if debug:
              print("Input Data:", input_data)

+         results_array = []
+
+         if self.is_pdf(input_data[0]["file_path"]):
+             pdf_optimizer = PDFOptimizer()
+             num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
+                                                                                  debug_dir,
+                                                                                  True)
+
+             # Run inference on each page
+             for page_num, output_file in enumerate(output_files):
+                 input_data[0]["file_path"] = output_file
+                 if debug:
+                     print(f"Running inference on page {page_num + 1}...")
+
+                 # Run inference on the page
+                 result = model_inference_instance.inference(input_data, mode)
+                 results_array.append(result)
+
+             shutil.rmtree(temp_dir, ignore_errors=True)
+             return results_array, num_pages
+
          result = model_inference_instance.inference(input_data)
+         results_array.append(result)
+
+         return results_array, 1

-         return result
+     def is_pdf(self, file_path):
+         return file_path.lower().endswith('.pdf')


  if __name__ == "__main__":
+     # run locally: python -m sparrow_parse.extractors.vllm_extractor
+
      extractor = VLLMExtractor()

-     # export HF_TOKEN="hf_"
-     config = {
-         "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
-         "hf_space": "katanaml/sparrow-qwen2-vl-7b",
-         "hf_token": os.getenv('HF_TOKEN'),
-         # Additional fields for local GPU inference
-         # "device": "cuda", "model_path": "model.pth"
-     }
-
-     # Use the factory to get the correct instance
-     factory = InferenceFactory(config)
-     model_inference_instance = factory.get_inference_instance()
-
-     input_data = [
-         {
-             "image": "/Users/andrejb/Documents/work/epik/bankstatement/bonds_table.png",
-             "text_input": "retrieve financial instruments data. return response in JSON format"
-         }
-     ]
-
-     # Now you can run inference without knowing which implementation is used
-     result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
-     print("Inference Result:", result)
+     # # export HF_TOKEN="hf_"
+     # config = {
+     #     "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
+     #     "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+     #     "hf_token": os.getenv('HF_TOKEN'),
+     #     # Additional fields for local GPU inference
+     #     # "device": "cuda", "model_path": "model.pth"
+     # }
+     #
+     # # Use the factory to get the correct instance
+     # factory = InferenceFactory(config)
+     # model_inference_instance = factory.get_inference_instance()
+     #
+     # input_data = [
+     #     {
+     #         "file_path": "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
+     #         "text_input": "retrieve {\"table\": [{\"description\": \"str\", \"latest_amount\": 0, \"previous_amount\": 0}]}. return response in JSON format"
+     #     }
+     # ]
+     #
+     # # Now you can run inference without knowing which implementation is used
+     # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
+     #                                                    debug_dir="/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/",
+     #                                                    debug=True,
+     #                                                    mode="static")
+     #
+     # for i, result in enumerate(results_array):
+     #     print(f"Result for page {i + 1}:", result)
+     # print(f"Number of pages: {num_pages}")
sparrow_parse/helpers/pdf_optimizer.py CHANGED
@@ -40,17 +40,18 @@ class PDFOptimizer(object):
              return number_of_pages, output_files, temp_dir
          else:
              # Convert the PDF to images
-             images = convert_from_path(file_path, dpi=400)
+             images = convert_from_path(file_path, dpi=300)
+             base_name = os.path.splitext(os.path.basename(file_path))[0]

              # Save the images to the temporary directory
              for i, image in enumerate(images):
-                 output_filename = os.path.join(temp_dir, f'page_{i + 1}.jpg')
+                 output_filename = os.path.join(temp_dir, f'{base_name}_page_{i + 1}.jpg')
                  image.save(output_filename, 'JPEG')
                  output_files.append(output_filename)

                  if output_dir:
                      # Save each image to the debug folder
-                     debug_output_filename = os.path.join(output_dir, f'page_{i + 1}.jpg')
+                     debug_output_filename = os.path.join(output_dir, f'{base_name}_page_{i + 1}.jpg')
                      image.save(debug_output_filename, 'JPEG')

              # Return the number of pages, the list of file paths, and the temporary directory
@@ -60,13 +61,17 @@ class PDFOptimizer(object):
  if __name__ == "__main__":
      pdf_optimizer = PDFOptimizer()

-     # output_directory = "/Users/andrejb/Documents/work/bankstatement/output_pages"
+     # output_directory = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
      # # Ensure the output directory exists
      # os.makedirs(output_directory, exist_ok=True)
      #
      # # Split the optimized PDF into separate pages
-     # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/bankstatement/statement.pdf",
+     # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
      #                                                                      output_directory,
-     #                                                                      False)
+     #                                                                      True)
+     #
+     # print(f"Number of pages: {num_pages}")
+     # print(f"Output files: {output_files}")
+     # print(f"Temporary directory: {temp_dir}")
      #
      # shutil.rmtree(temp_dir, ignore_errors=True)
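A condensed sketch of the helper call from the commented-out `__main__` block above. The PDF path and debug directory are placeholders; the third positional flag is passed as `True` exactly as in the updated example (its precise meaning is not spelled out in this diff).

```
import shutil

from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer

pdf_optimizer = PDFOptimizer()

# Returns the page count, the generated per-page files, and the temp dir holding them;
# the second argument is a debug output directory and may be None.
num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(
    "data/report.pdf",  # placeholder input PDF
    "data/debug/",      # placeholder debug directory
    True,
)
print(f"Number of pages: {num_pages}")
print(f"Output files: {output_files}")
print(f"Temporary directory: {temp_dir}")

# The caller cleans up the temporary directory, as VLLMExtractor does
shutil.rmtree(temp_dir, ignore_errors=True)
```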
sparrow_parse/vllm/huggingface_inference.py CHANGED
@@ -24,11 +24,15 @@ class HuggingFaceInference(ModelInference):
          return output_text


-     def inference(self, input_data):
+     def inference(self, input_data, mode=None):
+         if mode == "static":
+             simple_json = self.get_simple_json()
+             return simple_json
+
          client = Client(self.hf_space, hf_token=self.hf_token)

          result = client.predict(
-             image=handle_file(input_data[0]["image"]),
+             image=handle_file(input_data[0]["file_path"]),
              text_input=input_data[0]["text_input"],
              api_name="/run_inference"
          )
sparrow_parse/vllm/inference_base.py CHANGED
@@ -1,7 +1,30 @@
  from abc import ABC, abstractmethod
+ import json
+

  class ModelInference(ABC):
      @abstractmethod
-     def inference(self, input_data):
+     def inference(self, input_data, mode=None):
          """This method should be implemented by subclasses."""
          pass
+
+     def get_simple_json(self):
+         # Define a simple data structure
+         data = {
+             "table": [
+                 {
+                     "description": "Revenues",
+                     "latest_amount": 12453,
+                     "previous_amount": 11445
+                 },
+                 {
+                     "description": "Operating expenses",
+                     "latest_amount": 9157,
+                     "previous_amount": 8822
+                 }
+             ]
+         }
+
+         # Convert the dictionary to a JSON string
+         json_data = json.dumps(data, indent=4)
+         return json_data
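To make the new base-class contract concrete, here is a hypothetical subclass (not part of the package) that follows the same pattern the diff applies to `HuggingFaceInference`: accept the extra `mode` argument and short-circuit to `get_simple_json()` when `mode == "static"`.

```
import json

from sparrow_parse.vllm.inference_base import ModelInference


class EchoInference(ModelInference):
    """Hypothetical backend used only to illustrate the new mode parameter."""

    def inference(self, input_data, mode=None):
        # Same short-circuit the diff adds to HuggingFaceInference
        if mode == "static":
            return self.get_simple_json()

        # Stand-in behaviour: echo the query back as JSON instead of calling a real model
        return json.dumps({"echo": input_data[0]["text_input"]}, indent=4)


print(EchoInference().inference([{"text_input": "retrieve document data"}], mode="static"))
```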
sparrow_parse/vllm/local_gpu_inference.py CHANGED
@@ -8,7 +8,7 @@ class LocalGPUInference(ModelInference):
          self.device = device
          self.model.to(self.device)

-     def inference(self, input_data):
+     def inference(self, input_data, mode=None):
          self.model.eval()  # Set the model to evaluation mode
          with torch.no_grad():  # No need to calculate gradients
              input_tensor = torch.tensor(input_data).to(self.device)
sparrow_parse-0.3.6.dist-info/METADATA CHANGED
@@ -1,14 +1,14 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.4
- Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
+ Version: 0.3.6
+ Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
  Author-email: andrejus.baranovskis@gmail.com
  License: UNKNOWN
  Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Project-URL: Repository, https://github.com/katanaml/sparrow
- Keywords: llm,rag,vision
+ Keywords: llm,vllm,ocr,vision
  Platform: UNKNOWN
  Classifier: Operating System :: OS Independent
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -16,23 +16,20 @@ Classifier: Topic :: Software Development
  Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
- Requires-Dist: torch ==2.2.2
- Requires-Dist: unstructured[all-docs] ==0.14.5
- Requires-Dist: unstructured-inference ==0.7.33
  Requires-Dist: rich
- Requires-Dist: pymupdf4llm ==0.0.9
- Requires-Dist: transformers ==4.41.2
- Requires-Dist: sentence-transformers ==3.0.1
- Requires-Dist: numpy ==1.26.4
- Requires-Dist: pypdf ==4.3.0
- Requires-Dist: easyocr ==1.7.1
+ Requires-Dist: transformers==4.41.2
+ Requires-Dist: sentence-transformers==3.0.1
+ Requires-Dist: numpy==1.26.4
+ Requires-Dist: pypdf==4.3.0
+ Requires-Dist: easyocr==1.7.1
  Requires-Dist: gradio-client
+ Requires-Dist: pdf2image

  # Sparrow Parse

  ## Description

- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality and Table Transformers, and is part of Sparrow. Check the main [README](https://github.com/katanaml/sparrow).

  ## Install

@@ -40,101 +37,16 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
  pip install sparrow-parse
  ```

- ## Pre-processing
-
- ### Unstructured
-
- ```
- from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
-
- processor = UnstructuredProcessor()
-
- content, table_content = processor.extract_data(
-     file_path, # file to process
-     strategy, # data processing strategy supported by unstructured
-     model_name, # model supported by unstructured
-     options, # table extraction into HTML format
-     local, # True if running from CLI, or False if running from FastAPI
-     debug) # Debug
- ```
-
- Example:
-
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
-
- *strategy* - `hi_res`
-
- *model_name* - `yolox`
-
- *options* - `['tables', 'unstructured']`
-
- *local* - `True`
-
- *debug* - `True`
-
- ### Markdown
-
- ```
- from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
-
- processor = MarkdownProcessor()
-
- content, table_content = processor.extract_data(
-     file_path, # file to process
-     options, # table extraction into HTML format
-     local, # True if running from CLI, or False if running from FastAPI
-     debug) # Debug
- ```
-
- Example:
-
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
-
- *options* - `['tables', 'markdown']`
-
- *local* - `True`
-
- *debug* - `True`
-
  ## Parsing and extraction

- ### HTML extractor
+ ### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra

  ```
- from sparrow_parse.extractor.html_extractor import HTMLExtractor
-
- extractor = HTMLExtractor()
-
- answer, targets_unprocessed = extractor.read_data(
-     target_columns, # list of table columns data to fetch
-     data, # list of HTML tables
-     column_keywords, # list of valid column names, can be empty. Useful to filter junk content
-     group_by_rows, # JSON result grouping
-     update_targets, # Set to true, if page contains multiple tables with the same columns
-     local, # True if running from CLI, or False if running from FastAPI
-     debug) # Debug
-
- ```
-
- Example:
+ # run locally: python -m sparrow_parse.extractors.vllm_extractor

- *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
+ from sparrow_parse.vllm.inference_factory import InferenceFactory
+ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

- *data* - `list of HTML tables`
-
- *column_keywords* - `None`
-
- *group_by_rows* - `True`
-
- *update_targets* - `True`
-
- *local* - `True`
-
- *debug* - `True`
-
- ### Sparrow Parse VL (vision-language) extractor
-
- ```
  extractor = VLLMExtractor()

  # export HF_TOKEN="hf_"
@@ -152,17 +64,25 @@ model_inference_instance = factory.get_inference_instance()

  input_data = [
      {
-         "image": "/Users/andrejb/Documents/work/epik/bankstatement/bonds_table.png",
-         "text_input": "retrieve financial instruments data. return response in JSON format"
+         "file_path": "/data/oracle_10k_2014_q1_small.pdf",
+         "text_input": "retrieve {\"table\": [{\"description\": \"str\", \"latest_amount\": 0, \"previous_amount\": 0}]}. return response in JSON format"
      }
  ]

  # Now you can run inference without knowing which implementation is used
- result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
- print("Inference Result:", result)
+ results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, generic_query=False,
+                                                    debug_dir="/data/",
+                                                    debug=True,
+                                                    mode="static")
+
+ for i, result in enumerate(results_array):
+     print(f"Result for page {i + 1}:", result)
+ print(f"Number of pages: {num_pages}")
  ```

- ## PDF optimization
+ Use `mode="static"` if you want to simulate the LLM call without executing the LLM backend.
+
+ ## PDF pre-processing

  ```
  from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
@@ -177,7 +97,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,

  Example:

- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+ *file_path* - `/data/invoice_1.pdf`

  *output_directory* - set to not `None`, for debug purposes only

sparrow_parse-0.3.6.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+ sparrow_parse/__init__.py,sha256=IbpUPwvtjLOqowcOFsWQ6LKq-FH6cI19IpvfQlxufq0,21
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+ sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sparrow_parse/extractors/vllm_extractor.py,sha256=Wo8sOvsQt6YHd7bvB_DB8MUa71FioO9xcQOWA3PQ6eU,3415
+ sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
+ sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
+ sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sparrow_parse/vllm/huggingface_inference.py,sha256=nalmPJFfrFlRnfd4yTq4HvIwDvIXjhKUlEyZ6gzMqe0,1239
+ sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
+ sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
+ sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
+ sparrow_parse-0.3.6.dist-info/METADATA,sha256=ANS8eWCx07bQOOFFnJUKwsiPo-ZT42b8DvMwP9o-jf4,5827
+ sparrow_parse-0.3.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+ sparrow_parse-0.3.6.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+ sparrow_parse-0.3.6.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+ sparrow_parse-0.3.6.dist-info/RECORD,,
sparrow_parse-0.3.6.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: bdist_wheel (0.44.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
