sparrow-parse 0.5.0__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +57 -13
- sparrow_parse/helpers/image_optimizer.py +59 -0
- sparrow_parse/helpers/pdf_optimizer.py +9 -7
- sparrow_parse/processors/table_structure_processor.py +2 -2
- sparrow_parse/text_extraction.py +30 -0
- sparrow_parse/vllm/mlx_inference.py +1 -1
- {sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/METADATA +33 -7
- sparrow_parse-0.5.3.dist-info/RECORD +21 -0
- sparrow_parse-0.5.0.dist-info/RECORD +0 -19
- {sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/top_level.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.5.0'
+__version__ = '0.5.3'
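A quick sanity check of the version bump (a minimal sketch; assumes the 0.5.3 wheel is installed):

```
import sparrow_parse

# The wheel in this diff ships __version__ = '0.5.3'
print(sparrow_parse.__version__)  # expected: 0.5.3
```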
sparrow_parse/extractors/vllm_extractor.py
CHANGED
@@ -1,7 +1,7 @@
 import json
-
 from sparrow_parse.vllm.inference_factory import InferenceFactory
 from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
 from sparrow_parse.processors.table_structure_processor import TableDetector
 from rich import print
 import os
@@ -14,7 +14,7 @@ class VLLMExtractor(object):
         pass
 
     def run_inference(self, model_inference_instance, input_data, tables_only=False,
-                      generic_query=False, debug_dir=None, debug=False, mode=None):
+                      generic_query=False, crop_size=None, debug_dir=None, debug=False, mode=None):
         """
         Main entry point for processing input data using a model inference instance.
         Handles generic queries, PDFs, and table extraction.
@@ -27,12 +27,12 @@ class VLLMExtractor(object):
 
         file_path = input_data[0]["file_path"]
         if self.is_pdf(file_path):
-            return self._process_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir, mode)
+            return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
 
-        return self._process_non_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir)
+        return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
 
 
-    def _process_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir, mode):
+    def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
         """
         Handles processing and inference for PDF files, including page splitting and optional table extraction.
         """
@@ -40,26 +40,40 @@ class VLLMExtractor(object):
         num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
                                                                              debug_dir, convert_to_images=True)
 
-        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, debug, debug_dir)
+        results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir)
 
         # Clean up temporary directory
         shutil.rmtree(temp_dir, ignore_errors=True)
         return results, num_pages
 
 
-    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir):
+    def _process_non_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir):
         """
         Handles processing and inference for non-PDF files, with optional table extraction.
         """
         file_path = input_data[0]["file_path"]
+
         if tables_only:
             return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
         else:
+            temp_dir = tempfile.mkdtemp()
+
+            if crop_size:
+                if debug:
+                    print(f"Cropping image borders by {crop_size} pixels.")
+                image_optimizer = ImageOptimizer()
+                cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+                input_data[0]["file_path"] = cropped_file_path
+
+            file_path = input_data[0]["file_path"]
             input_data[0]["file_path"] = [file_path]
             results = model_inference_instance.inference(input_data)
+
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
             return results, 1
 
-    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, debug, debug_dir):
+    def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir):
         """
         Processes individual pages (PDF split) and handles table extraction or inference.
 
@@ -68,6 +82,7 @@ class VLLMExtractor(object):
             output_files: List of file paths for the split PDF pages.
             input_data: Input data for inference.
             tables_only: Whether to only process tables.
+            crop_size: Size for cropping image borders.
             debug: Debug flag for logging.
             debug_dir: Directory for saving debug information.
 
@@ -89,11 +104,39 @@ class VLLMExtractor(object):
         else:
             if debug:
                 print(f"Processing {len(output_files)} pages for inference at once.")
-
-
+
+            temp_dir = tempfile.mkdtemp()
+            cropped_files = []
+
+            if crop_size:
+                if debug:
+                    print(f"Cropping image borders by {crop_size} pixels from {len(output_files)} images.")
+
+                image_optimizer = ImageOptimizer()
+
+                # Process each file in the output_files array
+                for file_path in output_files:
+                    cropped_file_path = image_optimizer.crop_image_borders(
+                        file_path,
+                        temp_dir,
+                        debug_dir,
+                        crop_size
+                    )
+                    cropped_files.append(cropped_file_path)
+
+                # Use the cropped files for inference
+                input_data[0]["file_path"] = cropped_files
+            else:
+                # If no cropping needed, use original files directly
+                input_data[0]["file_path"] = output_files
+
+            # Process all files at once
             results = model_inference_instance.inference(input_data)
             results_array.extend(results)
 
+            # Clean up temporary directory
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
         return results_array
 
 
@@ -155,7 +198,7 @@ if __name__ == "__main__":
     # # export HF_TOKEN="hf_"
     # config = {
     #     "method": "mlx",  # Could be 'huggingface', 'mlx' or 'local_gpu'
-    #     "model_name": "mlx-community/Qwen2-VL-7B-Instruct-8bit",
+    #     "model_name": "mlx-community/Qwen2.5-VL-7B-Instruct-8bit",
     #     # "hf_space": "katanaml/sparrow-qwen2-vl-7b",
     #     # "hf_token": os.getenv('HF_TOKEN'),
     #     # Additional fields for local GPU inference
@@ -168,14 +211,15 @@ if __name__ == "__main__":
     #
     # input_data = [
     #     {
-    #         "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/
+    #         "file_path": "/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/bonds_table.png",
    #         "text_input": "retrieve document data. return response in JSON format"
     #     }
     # ]
     #
     # # Now you can run inference without knowing which implementation is used
-    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=
+    # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
     #                                                    generic_query=False,
+    #                                                    crop_size=0,
     #                                                    debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
     #                                                    debug=True,
     #                                                    mode=None)
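For reference, a minimal end-to-end sketch of the new `crop_size` parameter, adapted from the commented example above. The config values and file path are placeholders, and `get_inference_instance()` is assumed from the package README rather than shown in this diff:

```
from sparrow_parse.vllm.inference_factory import InferenceFactory
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

extractor = VLLMExtractor()

config = {
    "method": "mlx",  # could be 'huggingface', 'mlx' or 'local_gpu'
    "model_name": "mlx-community/Qwen2.5-VL-7B-Instruct-8bit",
}
factory = InferenceFactory(config)
model_inference_instance = factory.get_inference_instance()

input_data = [
    {
        "file_path": "data/bonds_table.png",  # placeholder path
        "text_input": "retrieve document data. return response in JSON format"
    }
]

# crop_size=60 trims 60 pixels from every border before inference;
# crop_size=None (the default) leaves images untouched
results_array, num_pages = extractor.run_inference(model_inference_instance, input_data,
                                                   tables_only=False,
                                                   generic_query=False,
                                                   crop_size=60,
                                                   debug_dir=None,
                                                   debug=True,
                                                   mode=None)
print(results_array, num_pages)
```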
sparrow_parse/helpers/image_optimizer.py
ADDED
@@ -0,0 +1,59 @@
+from PIL import Image
+import os
+
+
+class ImageOptimizer(object):
+    def __init__(self):
+        pass
+
+    def crop_image_borders(self, file_path, temp_dir, debug_dir=None, crop_size=60):
+        """
+        Crops all four borders of an image by the specified size.
+
+        Args:
+            file_path (str): Path to the input image
+            temp_dir (str): Temporary directory to store the cropped image
+            debug_dir (str, optional): Directory to save a debug copy of the cropped image
+            crop_size (int): Number of pixels to crop from each border
+
+        Returns:
+            str: Path to the cropped image in temp_dir
+        """
+        try:
+            # Open the image
+            with Image.open(file_path) as img:
+                # Get image dimensions
+                width, height = img.size
+
+                # Calculate the crop box
+                left = crop_size
+                top = crop_size
+                right = width - crop_size
+                bottom = height - crop_size
+
+                # Ensure we're not trying to crop more than the image size
+                if right <= left or bottom <= top:
+                    raise ValueError("Crop size is too large for the image dimensions")
+
+                # Perform the crop
+                cropped_img = img.crop((left, top, right, bottom))
+
+                # Get original filename without path
+                filename = os.path.basename(file_path)
+                name, ext = os.path.splitext(filename)
+
+                # Save cropped image in temp_dir
+                output_path = os.path.join(temp_dir, f"{name}_cropped{ext}")
+                cropped_img.save(output_path)
+
+                # If debug_dir is provided, save a debug copy
+                if debug_dir:
+                    os.makedirs(debug_dir, exist_ok=True)
+                    debug_path = os.path.join(debug_dir, f"{name}_cropped_debug{ext}")
+                    cropped_img.save(debug_path)
+                    print(f"Debug cropped image saved to: {debug_path}")
+
+                return output_path
+
+        except Exception as e:
+            raise Exception(f"Error processing image: {str(e)}")
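The new helper can also be used standalone; a minimal sketch based on the signature above (the input path is a placeholder):

```
import tempfile

from sparrow_parse.helpers.image_optimizer import ImageOptimizer

image_optimizer = ImageOptimizer()
temp_dir = tempfile.mkdtemp()

# Trim 60 pixels from each border; the cropped copy is written to temp_dir
cropped_file_path = image_optimizer.crop_image_borders(
    "data/invoice_1.jpg",  # placeholder input image
    temp_dir,
    debug_dir=None,
    crop_size=60,
)
print(cropped_file_path)
```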
sparrow_parse/helpers/pdf_optimizer.py
CHANGED
@@ -9,7 +9,7 @@ class PDFOptimizer(object):
     def __init__(self):
         pass
 
-    def split_pdf_to_pages(self, file_path,
+    def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False):
         # Create a temporary directory
         temp_dir = tempfile.mkdtemp()
         output_files = []
@@ -30,9 +30,9 @@ class PDFOptimizer(object):
                 writer.write(output_file)
             output_files.append(output_filename)
 
-            if
+            if debug_dir:
                 # Save each page to the debug folder
-                debug_output_filename = os.path.join(
+                debug_output_filename = os.path.join(debug_dir, f'page_{page_num + 1}.pdf')
                 with open(debug_output_filename, 'wb') as output_file:
                     writer.write(output_file)
 
@@ -49,10 +49,12 @@ class PDFOptimizer(object):
             image.save(output_filename, 'JPEG')
             output_files.append(output_filename)
 
-            if
+            if debug_dir:
                 # Save each image to the debug folder
-
+                os.makedirs(debug_dir, exist_ok=True)
+                debug_output_filename = os.path.join(debug_dir, f'{base_name}_page_{i + 1}_debug.jpg')
                 image.save(debug_output_filename, 'JPEG')
+                print(f"Debug image saved to: {debug_output_filename}")
 
         # Return the number of pages, the list of file paths, and the temporary directory
         return len(images), output_files, temp_dir
@@ -61,13 +63,13 @@ class PDFOptimizer(object):
 if __name__ == "__main__":
     pdf_optimizer = PDFOptimizer()
 
-    #
+    # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
     # # Ensure the output directory exists
     # os.makedirs(output_directory, exist_ok=True)
     #
     # # Split the optimized PDF into separate pages
     # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
-    #
+    #                                                                      debug_dir,
     #                                                                      True)
     #
     # print(f"Number of pages: {num_pages}")
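With the completed signature, the splitter is called like this (a sketch; the input path is a placeholder, and `debug_dir=None` skips the debug copies):

```
from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer

pdf_optimizer = PDFOptimizer()

# convert_to_images=True renders each page to JPEG instead of per-page PDFs
num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(
    "data/invoice_1.pdf",  # placeholder input PDF
    debug_dir=None,
    convert_to_images=True,
)
print(f"Number of pages: {num_pages}")
```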
sparrow_parse/processors/table_structure_processor.py
CHANGED
@@ -131,7 +131,7 @@ class TableDetector(object):
                 cropped_tables.append(cropped_table)
 
                 if debug_dir:
-                    file_name_table = self.append_filename(file_path, debug_dir, f"
+                    file_name_table = self.append_filename(file_path, debug_dir, f"table_cropped_{i + 1}")
                     cropped_table.save(file_name_table)
         else:
             if debug:
@@ -141,7 +141,7 @@ class TableDetector(object):
             cropped_tables.append(cropped_table)
 
             if debug_dir:
-                file_name_table = self.append_filename(file_path, debug_dir, "
+                file_name_table = self.append_filename(file_path, debug_dir, "table_cropped")
                 cropped_table.save(file_name_table)
 
         return cropped_tables
sparrow_parse/text_extraction.py
ADDED
@@ -0,0 +1,30 @@
+from mlx_vlm import load, apply_chat_template, generate
+from mlx_vlm.utils import load_image
+
+# For test purposes, we will use a sample image
+
+# Load model and processor
+qwen_vl_model, qwen_vl_processor = load("mlx-community/Qwen2.5-VL-7B-Instruct-8bit")
+qwen_vl_config = qwen_vl_model.config
+
+image = load_image("images/graph.png")
+
+messages = [
+    {"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
+    {"role": "user", "content": "Extract the names, labels and y coordinates from the image."}
+]
+
+# Apply chat template
+prompt = apply_chat_template(qwen_vl_processor, qwen_vl_config, messages)
+
+# Generate text
+qwen_vl_output = generate(
+    qwen_vl_model,
+    qwen_vl_processor,
+    prompt,
+    image,
+    max_tokens=1000,
+    temperature=0.7,
+)
+
+print(qwen_vl_output)
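Note that this new script imports `mlx_vlm` at module level, so it only runs where the Apple-Silicon-only `mlx`/`mlx-vlm` dependencies added to METADATA below are installable (macOS on arm64).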
{sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.5.0
+Version: 0.5.3
 Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 Author: Andrej Baranovskij
@@ -17,19 +17,22 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: rich
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.48.2
+Requires-Dist: torchvision==0.21.0
+Requires-Dist: torch==2.6.0
 Requires-Dist: sentence-transformers==3.3.1
 Requires-Dist: numpy==2.1.3
 Requires-Dist: pypdf==4.3.0
 Requires-Dist: gradio-client
 Requires-Dist: pdf2image
-Requires-Dist: mlx
+Requires-Dist: mlx>=0.22.0; sys_platform == "darwin" and platform_machine == "arm64"
+Requires-Dist: mlx-vlm==0.1.12; sys_platform == "darwin" and platform_machine == "arm64"
 
 # Sparrow Parse
 
 ## Description
 
-This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) library with helpful methods for data pre-processing, parsing and extracting information.
+This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) library with helpful methods for data pre-processing, parsing and extracting information. Library relies on Visual LLM functionality, Table Transformers and is part of Sparrow. Check main [README](https://github.com/katanaml/sparrow)
 
 ## Install
 
@@ -68,6 +71,7 @@ input_data = [
 # Now you can run inference without knowing which implementation is used
 results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
                                                    generic_query=False,
+                                                   crop_size=80,
                                                    debug_dir=None,
                                                    debug=True,
                                                    mode=None)
@@ -79,6 +83,8 @@ print(f"Number of pages: {num_pages}")
 
 Use `tables_only=True` if you want to extract only tables.
 
+Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
+
 Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
 
 Method `run_inference` will return results and number of pages processed.
@@ -103,7 +109,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
 pdf_optimizer = PDFOptimizer()
 
 num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
-
+                                                                     debug_dir,
                                                                      convert_to_images)
 
 ```
@@ -112,10 +118,30 @@ Example:
 
 *file_path* - `/data/invoice_1.pdf`
 
-*
+*debug_dir* - set to not `None`, for debug purposes only
 
 *convert_to_images* - default `False`, to split into PDF files
 
+## Image cropping
+
+```
+from sparrow_parse.helpers.image_optimizer import ImageOptimizer
+
+image_optimizer = ImageOptimizer()
+
+cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
+```
+
+Example:
+
+*file_path* - `/data/invoice_1.jpg`
+
+*temp_dir* - directory to store cropped files
+
+*debug_dir* - set to not `None`, for debug purposes only
+
+*crop_size* - Number of pixels to crop from each border
+
 ## Library build
 
 Create Python virtual environment
@@ -160,6 +186,6 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
 
 ## License
 
-Licensed under the GPL 3.0. Copyright 2020-
+Licensed under the GPL 3.0. Copyright 2020-2025 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
 
 
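The `mlx` and `mlx-vlm` requirements now carry PEP 508 environment markers, restricting them to Apple Silicon macOS. A small sketch (assuming the `packaging` library is available) showing how such a marker evaluates on the current machine:

```
from packaging.markers import Marker

marker = Marker('sys_platform == "darwin" and platform_machine == "arm64"')

# True only on macOS running on an arm64 (Apple Silicon) machine
print(marker.evaluate())
```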
sparrow_parse-0.5.3.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+sparrow_parse/__init__.py,sha256=IIIADjPr2y0W_XfgU1cH-K2HswMouXAPagGe6_twaIk,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/text_extraction.py,sha256=JtUU7swvV12xBai5S9ICxWWWrUlkpZTZqvUnbz1h5Mk,834
+sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractors/vllm_extractor.py,sha256=Cf2sVgxDExj2ud4G6z9JnirVclTgPIEe9YSoCfTkW4k,9563
+sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/helpers/image_optimizer.py,sha256=gUAJuNzRAB5ipgfhxTNss4MHbCPPkV5y-BSyrEHcJ0Y,2164
+sparrow_parse/helpers/pdf_optimizer.py,sha256=A2BVkb2JMqTJUz6bdfVzMmFSYaWn1QMav7UadMi0XJg,3423
+sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/processors/table_structure_processor.py,sha256=BCYnrsqngEu0WpBORcefdnCUgCCT12fFWdrFqvdXAwc,9787
+sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAgFu0XjCbaLCNVyM,1980
+sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
+sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
+sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
+sparrow_parse/vllm/mlx_inference.py,sha256=MUuW56f-aKnVmeMAATxKLxsovEMmp1qlgtlmW8J2C7M,4899
+sparrow_parse-0.5.3.dist-info/METADATA,sha256=NOwPut-aOo6gdWH44k_Ei3WP3-bvkc-Dl7qyKE3r2FQ,7239
+sparrow_parse-0.5.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+sparrow_parse-0.5.3.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+sparrow_parse-0.5.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-0.5.3.dist-info/RECORD,,
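Each RECORD entry has the form `path,sha256=<digest>,<size>`, where the digest is an unpadded URL-safe base64 sha256 (the standard wheel RECORD convention). A minimal sketch for recomputing a file's entry hash:

```
import base64
import hashlib

def record_hash(path):
    # sha256 digest encoded as unpadded URL-safe base64, per the wheel spec
    digest = hashlib.sha256(open(path, "rb").read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

print(record_hash("sparrow_parse/__init__.py"))
```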
sparrow_parse-0.5.0.dist-info/RECORD
REMOVED
@@ -1,19 +0,0 @@
-sparrow_parse/__init__.py,sha256=LyVsN6QRbZCjxbel-HtG6unyJHf29KfzURL0WnqwB_I,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/vllm_extractor.py,sha256=PDLgLlKiq3Bv-UOQTzX3AgxNOLcEU2EniGAXLjMC30U,7820
-sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/helpers/pdf_optimizer.py,sha256=GIqQYWtixFeZGCRFXL0lQfQByapCDuQzzRHAkzcPwLE,3302
-sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/processors/table_structure_processor.py,sha256=PQHHFdQUuTin3Mm2USuUga2n4fGWMLwiBJYq4CVD67o,9775
-sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/vllm/huggingface_inference.py,sha256=EJnG6PesGKMc_0qGPN8ufE6pSnhAgFu0XjCbaLCNVyM,1980
-sparrow_parse/vllm/inference_base.py,sha256=4mwGoAY63MB4cHZpV0czTkJWEzimmiTzqqzKmLNzgjw,820
-sparrow_parse/vllm/inference_factory.py,sha256=FTM65O-dW2WZchHOrNN7_Q3-FlVoAc65iSptuuUuClM,1166
-sparrow_parse/vllm/local_gpu_inference.py,sha256=aHoJTejb5xrXjWDIGu5RBQWEyRCOBCB04sMvO2Wyvg8,628
-sparrow_parse/vllm/mlx_inference.py,sha256=eFyi42-ju28Eb6bzFj0xa5UUaOV68SR12vL4O3vaD9s,4899
-sparrow_parse-0.5.0.dist-info/METADATA,sha256=9Y-dPyKPITLJflxIlN0zDC607aJbsbQcgpaB6MS4qFQ,6433
-sparrow_parse-0.5.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-sparrow_parse-0.5.0.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
-sparrow_parse-0.5.0.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
-sparrow_parse-0.5.0.dist-info/RECORD,,
{sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/WHEEL
File without changes
{sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/entry_points.txt
File without changes
{sparrow_parse-0.5.0.dist-info → sparrow_parse-0.5.3.dist-info}/top_level.txt
File without changes