sparrow-parse 0.3.4-py3-none-any.whl → 0.3.6-py3-none-any.whl
This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in that public registry.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractors/vllm_extractor.py +66 -25
- sparrow_parse/helpers/pdf_optimizer.py +11 -6
- sparrow_parse/vllm/huggingface_inference.py +6 -2
- sparrow_parse/vllm/inference_base.py +24 -1
- sparrow_parse/vllm/local_gpu_inference.py +1 -1
- {sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/METADATA +28 -108
- sparrow_parse-0.3.6.dist-info/RECORD +18 -0
- {sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/WHEEL +1 -1
- sparrow_parse/extractors/html_extractor.py +0 -251
- sparrow_parse/helpers/html_extractor_helper.py +0 -374
- sparrow_parse/processors/markdown_processor.py +0 -137
- sparrow_parse/processors/unstructured_processor.py +0 -178
- sparrow_parse/temp.py +0 -27
- sparrow_parse-0.3.4.dist-info/RECORD +0 -23
- {sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/entry_points.txt +0 -0
- {sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/top_level.txt +0 -0
sparrow_parse/processors/markdown_processor.py
DELETED
@@ -1,137 +0,0 @@
-import pymupdf4llm
-import pandas as pd
-import re
-from rich.progress import Progress, SpinnerColumn, TextColumn
-from rich import print
-from bs4 import BeautifulSoup
-
-
-class MarkdownProcessor(object):
-    def __init__(self):
-        pass
-
-    def extract_data(self, file_path, options, local=True, debug=False):
-        markdown_text = self.invoke_pipeline_step(
-            lambda: pymupdf4llm.to_markdown(file_path),
-            "Extracting markdown text from the document...",
-            local
-        )
-
-        content, table_content = self.invoke_pipeline_step(
-            lambda: self.load_text_data(markdown_text, options),
-            "Loading text data...",
-            local
-        )
-
-        if debug:
-            print("Data extracted from the document:")
-            print(content)
-            print("\n")
-            print("Table content extracted from the document:")
-            if table_content:
-                print(len(table_content))
-            print(table_content)
-
-        return content, table_content
-
-    def load_text_data(self, markdown_text, options):
-        content, table_content = None, None
-
-        if options is None:
-            content = markdown_text
-
-        if options and "tables" in options and "markdown" in options:
-            content = self.extract_form_data(markdown_text)
-            table_content = self.extract_tables(markdown_text)
-
-        return content, table_content
-
-    def extract_form_data(self, markdown_text):
-        return markdown_text
-
-    def extract_tables(self, markdown_text):
-        # Regular expression to match markdown tables
-        table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)
-
-        # Find all tables in the markdown text
-        tables = table_pattern.findall(markdown_text)
-
-        html_tables = []
-        for table_text in tables:
-            # Split the table into lines
-            lines = table_text.strip().split('\n')
-
-            # Extract headers and rows
-            headers = [self.clean_column_name(header.strip()) for header in lines[0].split('|') if header]
-            rows = []
-            for line in lines[2:]:  # Skip header and separator lines
-                row = [cell.strip() for cell in line.split('|') if cell]
-                rows.append(row)
-
-            # Convert to Pandas DataFrame
-            df = pd.DataFrame(rows, columns=headers)
-
-            # Convert DataFrame to HTML and append to the list
-            html_table = df.to_html(index=False)
-            if self.table_has_header(html_table):
-                html_tables.append(html_table)
-
-        return html_tables
-
-    def clean_column_name(self, name):
-        """
-        Cleans the column name by removing spaces if the name is a single word with spaces between letters.
-
-        Args:
-            name (str): The column name to clean.
-
-        Returns:
-            str: The cleaned column name.
-        """
-        # Check if the name contains only letters and spaces
-        if all(char.isalpha() or char.isspace() for char in name):
-            # Check if it is a single word with spaces between letters
-            parts = name.split()
-            if len(parts) > 1 and all(len(part) == 1 for part in parts):
-                return ''.join(parts)
-        return name
-
-    def invoke_pipeline_step(self, task_call, task_description, local):
-        if local:
-            with Progress(
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                transient=False,
-            ) as progress:
-                progress.add_task(description=task_description, total=None)
-                ret = task_call()
-        else:
-            print(task_description)
-            ret = task_call()
-
-        return ret
-
-    def table_has_header(self, table_html):
-        soup = BeautifulSoup(table_html, 'html.parser')
-        table = soup.find('table')
-
-        # Check if the table contains a <thead> tag
-        if table.find('thead'):
-            return True
-
-        # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
-        if table.find_all('th'):
-            return True
-
-        return False
-
-
-if __name__ == "__main__":
-    processor = MarkdownProcessor()
-
-    # content, table_content = processor.extract_data(
-    #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
-    #     ['tables', 'markdown'],
-    #     True,
-    #     True)
-
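For reference, the commented-out __main__ block above shows how this deleted MarkdownProcessor was driven in 0.3.4. A minimal usage sketch of that call (the invoice path is a placeholder, not part of the package); note that extract_tables only captures markdown tables that are terminated by a row containing "TOTAL":

    from sparrow_parse.processors.markdown_processor import MarkdownProcessor

    processor = MarkdownProcessor()

    # With ['tables', 'markdown'], `content` is the raw markdown text and
    # `table_content` is a list of HTML <table> strings built via pandas,
    # keeping only tables that have a header row.
    content, table_content = processor.extract_data(
        "data/invoice_1.pdf",      # placeholder path
        ["tables", "markdown"],
        local=True,                # show a rich progress spinner
        debug=True,                # print the extracted content
    )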
sparrow_parse/processors/unstructured_processor.py
DELETED
@@ -1,178 +0,0 @@
-import tempfile
-import os
-from unstructured.partition.pdf import partition_pdf
-from unstructured.partition.image import partition_image
-import json
-from unstructured.staging.base import elements_to_json
-from rich.progress import Progress, SpinnerColumn, TextColumn
-from rich import print
-from bs4 import BeautifulSoup
-
-
-class UnstructuredProcessor(object):
-    def __init__(self):
-        pass
-
-    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
-        # Extracts the elements from the PDF
-        elements = self.invoke_pipeline_step(
-            lambda: self.process_file(file_path, strategy, model_name),
-            "Extracting elements from the document...",
-            local
-        )
-
-        if debug:
-            new_extension = 'json'  # You can change this to any extension you want
-            new_file_path = self.change_file_extension(file_path, new_extension)
-
-            content, table_content = self.invoke_pipeline_step(
-                lambda: self.load_text_data(elements, new_file_path, options),
-                "Loading text data...",
-                local
-            )
-        else:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                temp_file_path = os.path.join(temp_dir, "file_data.json")
-
-                content, table_content = self.invoke_pipeline_step(
-                    lambda: self.load_text_data(elements, temp_file_path, options),
-                    "Loading text data...",
-                    local
-                )
-
-        if debug:
-            print("Data extracted from the document:")
-            print(content)
-            print("\n")
-            print("Table content extracted from the document:")
-            if table_content:
-                print(len(table_content))
-            print(table_content)
-
-        return content, table_content
-
-    def process_file(self, file_path, strategy, model_name):
-        elements = None
-
-        if file_path.lower().endswith('.pdf'):
-            elements = partition_pdf(
-                filename=file_path,
-                strategy=strategy,
-                infer_table_structure=True,
-                hi_res_model_name=model_name,
-                languages=['en']
-            )
-        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
-            elements = partition_image(
-                filename=file_path,
-                strategy=strategy,
-                infer_table_structure=True,
-                hi_res_model_name=model_name,
-                languages=['en']
-            )
-
-        return elements
-
-    def change_file_extension(self, file_path, new_extension, suffix=None):
-        # Check if the new extension starts with a dot and add one if not
-        if not new_extension.startswith('.'):
-            new_extension = '.' + new_extension
-
-        # Split the file path into two parts: the base (everything before the last dot) and the extension
-        # If there's no dot in the filename, it'll just return the original filename without an extension
-        base = file_path.rsplit('.', 1)[0]
-
-        # Concatenate the base with the new extension
-        if suffix is None:
-            new_file_path = base + new_extension
-        else:
-            new_file_path = base + "_" + suffix + new_extension
-
-        return new_file_path
-
-    def load_text_data(self, elements, file_path, options):
-        elements_to_json(elements, filename=file_path)
-
-        content, table_content = None, None
-
-        if options is None:
-            content = self.process_json_file(file_path)
-
-        if options and "tables" in options and "unstructured" in options:
-            content = self.process_json_file(file_path, "form")
-
-            table_content = self.process_json_file(file_path, "table")
-
-        return content, table_content
-
-    def process_json_file(self, file_path, option=None):
-        # Read the JSON file
-        with open(file_path, 'r') as file:
-            data = json.load(file)
-
-        # Iterate over the JSON data and extract required elements
-        extracted_elements = []
-        for entry in data:
-            if entry["type"] == "Table" and (option is None or option == "table" or option == "form"):
-                table_data = entry["metadata"]["text_as_html"]
-                if option == "table" and self.table_has_header(table_data):
-                    extracted_elements.append(table_data)
-                if option is None or option == "form":
-                    extracted_elements.append(table_data)
-            elif entry["type"] == "Title" and (option is None or option == "form"):
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "NarrativeText" and (option is None or option == "form"):
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "UncategorizedText" and (option is None or option == "form"):
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "ListItem" and (option is None or option == "form"):
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "Image" and (option is None or option == "form"):
-                extracted_elements.append(entry["text"])
-
-        if option is None or option == "form":
-            # Convert list to single string with two new lines between each element
-            extracted_data = "\n\n".join(extracted_elements)
-            return extracted_data
-
-        return extracted_elements
-
-    def invoke_pipeline_step(self, task_call, task_description, local):
-        if local:
-            with Progress(
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                transient=False,
-            ) as progress:
-                progress.add_task(description=task_description, total=None)
-                ret = task_call()
-        else:
-            print(task_description)
-            ret = task_call()
-
-        return ret
-
-    def table_has_header(self, table_html):
-        soup = BeautifulSoup(table_html, 'html.parser')
-        table = soup.find('table')
-
-        # Check if the table contains a <thead> tag
-        if table.find('thead'):
-            return True
-
-        # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
-        if table.find_all('th'):
-            return True
-
-        return False
-
-
-if __name__ == "__main__":
-    processor = UnstructuredProcessor()
-    # content, table_content = processor.extract_data(
-    #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
-    #     'hi_res',
-    #     'yolox',
-    #     ['tables', 'unstructured'],
-    #     True,
-    #     True)
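Similarly, a minimal sketch of how the deleted UnstructuredProcessor was invoked, mirroring the commented-out call above; 'hi_res' and 'yolox' are the strategy and hi-res model names taken from the original comments, and the path is a placeholder:

    from sparrow_parse.processors.unstructured_processor import UnstructuredProcessor

    processor = UnstructuredProcessor()

    # With ['tables', 'unstructured'], `content` joins titles, narrative text,
    # tables and other elements into one string, while `table_content` keeps
    # only HTML tables that have a header row.
    content, table_content = processor.extract_data(
        "data/invoice_1.pdf",        # placeholder path
        "hi_res",                    # unstructured partitioning strategy
        "yolox",                     # hi_res table-structure model
        ["tables", "unstructured"],
        local=True,
        debug=True,
    )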
sparrow_parse/temp.py
DELETED
@@ -1,27 +0,0 @@
-# content, table_content = processor.extract_data(
-#     '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
-#     'hi_res',
-#     'yolox',
-#     # 'detectron2_onnx',
-#     ['tables', 'unstructured'],
-#     True,
-#     True)
-
-# content, table_content = processor.extract_data(
-#     '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
-#     'hi_res',
-#     'yolox',
-#     ['tables', 'unstructured'],
-#     True,
-#     True)
-
-# output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
-# # Ensure the output directory exists
-# os.makedirs(output_directory, exist_ok=True)
-#
-# # Split the optimized PDF into separate pages
-# num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
-#                                                                      output_directory,
-#                                                                      False)
-#
-# shutil.rmtree(temp_dir, ignore_errors=True)
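The deleted temp.py was developer scratch: the same extract_data calls plus a page-splitting snippet. A runnable sketch of that last snippet, assuming the helper in sparrow_parse/helpers/pdf_optimizer.py exposes split_pdf_to_pages with the signature and return values implied by the comments (the class name PDFOptimizer and all paths below are assumptions):

    import os
    import shutil

    # Class name assumed from the pdf_optimizer.py module listed in RECORD below
    from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer

    output_directory = "output_pages"          # placeholder
    os.makedirs(output_directory, exist_ok=True)  # ensure the output directory exists

    pdf_optimizer = PDFOptimizer()

    # Split a multi-page PDF into one file per page; the comments imply the call
    # returns the page count, the per-page paths, and a temp directory.
    num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(
        "statement.pdf",   # placeholder input path
        output_directory,
        False,             # trailing flag as in the original comment; meaning not shown
    )

    # Clean up the temporary directory, as the scratch code did
    shutil.rmtree(temp_dir, ignore_errors=True)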
sparrow_parse-0.3.4.dist-info/RECORD
DELETED
@@ -1,23 +0,0 @@
-sparrow_parse/__init__.py,sha256=SH0xuWVUkyLHZJwWBZ8GJoeliTeYFcqA6TWJgrkLv-U,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
-sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractors/html_extractor.py,sha256=qe9Oz7J-GiIE8G1kIDMOeh96xe6P59Gyh5SjgV3v2c8,9977
-sparrow_parse/extractors/vllm_extractor.py,sha256=Qwmf-SW4z_UstiiynX5TkyovlkokVhLuzcbUVZ16TXM,1540
-sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/helpers/html_extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
-sparrow_parse/helpers/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
-sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/processors/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
-sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
-sparrow_parse/processors/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
-sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/vllm/huggingface_inference.py,sha256=Q2Ju65LDzbO-8RWW7cXzrR-pbZ1zKuPVODlKOTWKg_E,1114
-sparrow_parse/vllm/inference_base.py,sha256=W0N2khehGdF1XHzZACG3I1UZaydHMk6BZgWNvaJD4Ck,197
-sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
-sparrow_parse/vllm/local_gpu_inference.py,sha256=I_uWYiFAQhRrykOKbVz69NzftDxuemDKtAye4kWhtnU,617
-sparrow_parse-0.3.4.dist-info/METADATA,sha256=L7qXKxktk42gUQlBlZAdzHQqfORoC6vBwRCd-VSwv3Y,7444
-sparrow_parse-0.3.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-sparrow_parse-0.3.4.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
-sparrow_parse-0.3.4.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
-sparrow_parse-0.3.4.dist-info/RECORD,,
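Each row in a wheel RECORD is the standard triple of path, `sha256=` followed by the urlsafe-base64 digest with padding stripped, and size in bytes; the final row lists RECORD itself with hash and size left blank. A minimal verification sketch (hypothetical helper; a real installer would parse with the csv module to handle commas in paths):

    import base64
    import hashlib

    def verify_record_row(row: str) -> bool:
        """Check one RECORD row, e.g. 'pkg/mod.py,sha256=AbC...,123', against disk."""
        path, hash_field, size = row.rsplit(",", 2)
        if not hash_field:              # RECORD lists itself with empty hash and size
            return True
        algo, _, expected = hash_field.partition("=")
        with open(path, "rb") as f:
            data = f.read()
        # Wheel digests are urlsafe base64 with trailing '=' padding removed
        digest = base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
        digest = digest.rstrip(b"=").decode("ascii")
        return digest == expected and len(data) == int(size)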
{sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/entry_points.txt
File without changes

{sparrow_parse-0.3.4.dist-info → sparrow_parse-0.3.6.dist-info}/top_level.txt
File without changes