sparrow-parse 0.2.7.tar.gz → 0.2.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/PKG-INFO +1 -1
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/pyproject.toml +1 -1
- sparrow_parse-0.2.8/sparrow_parse/__init__.py +1 -0
- sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/__init__.cpython-310.pyc +0 -0
- sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/extractor_helper.cpython-310.pyc +0 -0
- sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/html_extractor.cpython-310.pyc +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/extractor_helper.py +10 -9
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/html_extractor.py +33 -29
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/unstructured_processor.py +8 -0
- sparrow_parse-0.2.7/sparrow_parse/__init__.py +0 -1
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/README.md +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/__main__.py +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/data/invoice_1_table.txt +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/__init__.py +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/markdown_processor.py +0 -0
- {sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/temp.py +0 -0
{sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.2.7
+Version: 0.2.8
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 License: GPL-3.0
{sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/pyproject.toml RENAMED

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sparrow-parse"
-version = "0.2.7"
+version = "0.2.8"
 description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
 authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
 license = "GPL-3.0"
sparrow_parse-0.2.8/sparrow_parse/__init__.py ADDED

@@ -0,0 +1 @@
+__version__ = '0.2.8'

sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/__init__.cpython-310.pyc ADDED
sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/extractor_helper.cpython-310.pyc ADDED
sparrow_parse-0.2.8/sparrow_parse/extractor/__pycache__/html_extractor.cpython-310.pyc ADDED

Binary files; no text diff shown.
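Since `__init__.py` now exposes a module-level `__version__`, an installed copy can be sanity-checked at runtime. A trivial sketch, assuming the 0.2.8 package is installed:

```python
# Reads the module-level version attribute introduced in 0.2.8.
import sparrow_parse

print(sparrow_parse.__version__)  # expected: '0.2.8'
```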
{sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/extractor_helper.py RENAMED

@@ -3,9 +3,10 @@ from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 import re
 from io import StringIO
+from rich import print


-def merge_html_table_headers(html_table, column_keywords, debug=False):
+def merge_html_table_headers(html_table, column_keywords, similarity_threshold, debug=False):
     soup = BeautifulSoup(html_table, 'html.parser')

     # Find all thead elements
@@ -18,7 +19,7 @@ def merge_html_table_headers(html_table, column_keywords, debug=False):
         html_table = normalize_html_table(html_table, debug)
         html_table = fix_rowspan_elements(html_table)
         html_table = merge_rows_with_rowspan(html_table)
-        html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
+        html_table = detect_and_remove_junk_columns(html_table, column_keywords, similarity_threshold, debug)
     else:
         # If there is only one thead, return the original table
         return html_table
@@ -272,7 +273,7 @@ def merge_rows_with_rowspan(html):
     return str(new_table_soup.table)


-def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
+def detect_and_remove_junk_columns(html_table, target_columns, similarity_threshold_param, debug=False):
     html_table = clean_html_table_header_names(html_table)

     # Wrap the HTML string in a StringIO object
@@ -295,7 +296,7 @@ def detect_and_remove_junk_columns(html_table, target_columns, debug=False):

     # Identify junk columns based on similarity threshold
     junk_columns = []
-    similarity_threshold =
+    similarity_threshold = similarity_threshold_param

     for idx, col_embedding in enumerate(column_embeddings):
         similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
@@ -357,11 +358,11 @@ def clean_html_table_header_names(html_table: str) -> str:
     # Extract the headers and clean them
     headers = table.find_all("th")
     for th in headers:
-
-
-
-
-
+        if th.string:
+            # Clean the header
+            clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
+            # Keep it empty if the cleaned name is empty
+            th.string.replace_with(clean_header.strip() if clean_header.strip() else "")

     html_table = str(soup)
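The common thread in these hunks is that the junk-column similarity threshold, previously hard-coded inside `detect_and_remove_junk_columns`, is now supplied by the caller. A minimal sketch of the decision that threshold controls, with illustrative names rather than the package's exact internals (assumes `sentence-transformers` is installed):

```python
# Sketch: a table column counts as "junk" when its best cosine similarity
# against every target column name falls below the caller-supplied threshold.
from sentence_transformers import SentenceTransformer, util

def find_junk_columns(columns, target_columns, similarity_threshold):
    model = SentenceTransformer('all-mpnet-base-v2')
    column_embeddings = model.encode(columns, convert_to_tensor=True)
    target_embeddings = model.encode(target_columns, convert_to_tensor=True)

    junk = []
    for idx, col_embedding in enumerate(column_embeddings):
        # Cosine similarity between this column and every target column
        similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
        if similarities.max().item() < similarity_threshold:
            junk.append(columns[idx])
    return junk

# A lower threshold keeps more columns; a higher one prunes more aggressively.
print(find_junk_columns(['description', 'qty', 'page_footer'],
                        ['description', 'qty', 'net_price'], 0.5))
```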
{sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/html_extractor.py RENAMED

@@ -12,8 +12,8 @@ class HTMLExtractor(object):
     def __init__(self):
         pass

-    def read_data(self, target_columns, data,
-                  local=True, debug=False):
+    def read_data(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
+                  column_keywords=None, group_by_rows=True, update_targets=False, local=True, debug=False):
         answer = {}

         json_result, targets_unprocessed = [], []
@@ -22,7 +22,8 @@ class HTMLExtractor(object):
             if not target_columns:
                 break

-            json_result, targets_unprocessed = self.read_data_from_table(target_columns, table,
+            json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, similarity_threshold_junk,
+                                                                         similarity_threshold_column_id, column_keywords,
                                                                          group_by_rows, local, debug)
             answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)

@@ -33,9 +34,10 @@ class HTMLExtractor(object):

         return answer, targets_unprocessed

-    def read_data_from_table(self, target_columns, data,
+    def read_data_from_table(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
+                             column_keywords=None, group_by_rows=True, local=True, debug=False):
         data = self.invoke_pipeline_step(
-            lambda: merge_html_table_headers(data, column_keywords, debug),
+            lambda: merge_html_table_headers(data, column_keywords, similarity_threshold_junk, debug),
             "Merging HTML table headers...",
             local
         )
@@ -54,7 +56,7 @@ class HTMLExtractor(object):
         print(f"Target columns: {target_columns}")

         indices, targets, targets_unprocessed = self.invoke_pipeline_step(
-            lambda: self.calculate_similarity(columns, target_columns, debug),
+            lambda: self.calculate_similarity(columns, target_columns, similarity_threshold_column_id, debug),
             "Calculating cosine similarity between columns and target values...",
             local
         )
@@ -73,7 +75,7 @@ class HTMLExtractor(object):

         return json_result, targets_unprocessed

-    def calculate_similarity(self, columns, target_columns, debug):
+    def calculate_similarity(self, columns, target_columns, similarity_threshold_column_id, debug):
         model = SentenceTransformer('all-mpnet-base-v2')

         # Compute embeddings for columns and target values
@@ -93,7 +95,7 @@ class HTMLExtractor(object):
             most_similar_idx = similarities.argmax().item()
             most_similar_column = columns[most_similar_idx]
             similarity_score = similarities[most_similar_idx].item()
-            if similarity_score >
+            if similarity_score > similarity_threshold_column_id:
                 if most_similar_idx in most_similar_indices:
                     if similarity_score > most_similar_indices[most_similar_idx][1]:
                         targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
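`calculate_similarity` applies the second new knob, `similarity_threshold_column_id`: a target column is mapped to its best-matching table column only when the cosine score clears the threshold; otherwise the target lands in `targets_unprocessed`. A compact sketch of that gate (illustrative data, not the class's exact code); the updated debug driver in the next hunk exercises both thresholds:

```python
# Sketch of the column-identification gate controlled by
# similarity_threshold_column_id.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')
columns = ['Transaction Date', 'Description', 'Balance']
target = 'withdrawal'

# Cosine similarity of the target name against every table column name
similarities = util.pytorch_cos_sim(
    model.encode(target, convert_to_tensor=True),
    model.encode(columns, convert_to_tensor=True))[0]

most_similar_idx = similarities.argmax().item()
score = similarities[most_similar_idx].item()
if score > 0.3:  # similarity_threshold_column_id
    print(f"'{target}' -> '{columns[most_similar_idx]}' (score {score:.2f})")
else:
    print(f"'{target}' goes to targets_unprocessed (score {score:.2f})")
```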
@@ -222,27 +224,29 @@ if __name__ == "__main__":
     # to run for debugging, navigate to sparrow_parse and run the following command:
     # python -m extractor.html_extractor

-
-
-
-
-
-
+    with open('data/invoice_1_table.txt', 'r') as file:
+        file_content = file.read()
+
+    file_content = file_content.strip()[1:-1].strip()
+    data_list = re.split(r"',\s*'", file_content)
+    data_list = [item.strip(" '") for item in data_list]

     extractor = HTMLExtractor()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    answer, targets_unprocessed = extractor.read_data(
+        # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
+        ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
+         'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
+        data_list,
+        0.5,
+        0.3,
+        # None,
+        ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
+         'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
+        True,
+        True,
+        True,
+        True)
+
+    print(answer)
+    print(targets_unprocessed)
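The new debug driver calls `read_data` with nine positional arguments after `self`, which is easy to misread. Mapped onto the new signature, the same call with keywords would look like this (a readability sketch only, not a change to the package):

```python
# Same call as above, with the positional arguments labeled per the new
# read_data signature.
answer, targets_unprocessed = extractor.read_data(
    target_columns=['transaction_date', 'value_date', 'description', 'cheque',
                    'withdrawal', 'deposit', 'balance', 'deposits',
                    'account_number', 'od_limit', 'currency_balance',
                    'sgd_balance', 'maturity_date'],
    data=data_list,
    similarity_threshold_junk=0.5,        # junk-column pruning threshold
    similarity_threshold_column_id=0.3,   # column-identification threshold
    column_keywords=['deposits', 'account_number', 'od_limit',
                     'currency_balance', 'sgd_balance', 'transaction_date',
                     'value_date', 'description', 'cheque', 'withdrawal',
                     'deposit', 'balance', 'maturity_date'],
    group_by_rows=True,
    update_targets=True,
    local=True,
    debug=True)
```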
{sparrow_parse-0.2.7 → sparrow_parse-0.2.8}/sparrow_parse/extractor/unstructured_processor.py RENAMED

@@ -177,3 +177,11 @@ if __name__ == "__main__":
     # True,
     # True)

+    content, table_content = processor.extract_data(
+        '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
+        'hi_res',
+        'yolox',
+        ['tables', 'unstructured'],
+        True,
+        True)
+
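The `'hi_res'` and `'yolox'` arguments suggest the processor delegates to the `unstructured` library's high-resolution PDF partitioning with the YOLOX layout model. As a point of reference only (the wiring inside `UnstructuredProcessor.extract_data` is an assumption here), the underlying call would resemble:

```python
# Hypothetical equivalent of what extract_data drives under the hood;
# partition_pdf and these keyword arguments are real unstructured APIs.
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename='/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
    strategy='hi_res',             # layout-aware, model-assisted parsing
    hi_res_model_name='yolox',     # YOLOX layout-detection model
    infer_table_structure=True,    # keep table HTML in element metadata
)
```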
sparrow_parse-0.2.7/sparrow_parse/__init__.py DELETED

@@ -1 +0,0 @@
-__version__ = '0.2.7'

Files renamed without changes: README.md, sparrow_parse/__main__.py, sparrow_parse/data/invoice_1_table.txt, sparrow_parse/extractor/__init__.py, sparrow_parse/extractor/markdown_processor.py, sparrow_parse/temp.py.
|