sparrow-parse 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sparrow-parse"
3
- version = "0.2.7"
3
+ version = "0.2.9"
4
4
  description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
5
5
  authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
6
6
  license = "GPL-3.0"
@@ -0,0 +1 @@
1
+ __version__ = '0.2.9'
@@ -3,9 +3,10 @@ from sentence_transformers import SentenceTransformer, util
3
3
  import pandas as pd
4
4
  import re
5
5
  from io import StringIO
6
+ from rich import print
6
7
 
7
8
 
8
- def merge_html_table_headers(html_table, column_keywords, debug=False):
9
+ def merge_html_table_headers(html_table, column_keywords, similarity_threshold, debug=False):
9
10
  soup = BeautifulSoup(html_table, 'html.parser')
10
11
 
11
12
  # Find all thead elements
@@ -18,7 +19,7 @@ def merge_html_table_headers(html_table, column_keywords, debug=False):
18
19
  html_table = normalize_html_table(html_table, debug)
19
20
  html_table = fix_rowspan_elements(html_table)
20
21
  html_table = merge_rows_with_rowspan(html_table)
21
- html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
22
+ html_table = detect_and_remove_junk_columns(html_table, column_keywords, similarity_threshold, debug)
22
23
  else:
23
24
  # If there is only one thead, return the original table
24
25
  return html_table
@@ -272,7 +273,7 @@ def merge_rows_with_rowspan(html):
272
273
  return str(new_table_soup.table)
273
274
 
274
275
 
275
- def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
276
+ def detect_and_remove_junk_columns(html_table, target_columns, similarity_threshold_param, debug=False):
276
277
  html_table = clean_html_table_header_names(html_table)
277
278
 
278
279
  # Wrap the HTML string in a StringIO object
@@ -295,7 +296,7 @@ def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
295
296
 
296
297
  # Identify junk columns based on similarity threshold
297
298
  junk_columns = []
298
- similarity_threshold = 0.5 # Adjust this threshold as needed
299
+ similarity_threshold = similarity_threshold_param
299
300
 
300
301
  for idx, col_embedding in enumerate(column_embeddings):
301
302
  similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
@@ -357,11 +358,11 @@ def clean_html_table_header_names(html_table: str) -> str:
357
358
  # Extract the headers and clean them
358
359
  headers = table.find_all("th")
359
360
  for th in headers:
360
- clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
361
- # Check if the cleaned name is empty
362
- if not clean_header.strip():
363
- clean_header = "-"
364
- th.string.replace_with(clean_header)
361
+ if th.string:
362
+ # Clean the header
363
+ clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
364
+ # Keep it empty if the cleaned name is empty
365
+ th.string.replace_with(clean_header.strip() if clean_header.strip() else "")
365
366
 
366
367
  html_table = str(soup)
367
368
 
@@ -12,8 +12,8 @@ class HTMLExtractor(object):
12
12
  def __init__(self):
13
13
  pass
14
14
 
15
- def read_data(self, target_columns, data, column_keywords=None, group_by_rows=True, update_targets=False,
16
- local=True, debug=False):
15
+ def read_data(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
16
+ column_keywords=None, group_by_rows=True, update_targets=False, local=True, debug=False):
17
17
  answer = {}
18
18
 
19
19
  json_result, targets_unprocessed = [], []
@@ -22,7 +22,8 @@ class HTMLExtractor(object):
22
22
  if not target_columns:
23
23
  break
24
24
 
25
- json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, column_keywords,
25
+ json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, similarity_threshold_junk,
26
+ similarity_threshold_column_id, column_keywords,
26
27
  group_by_rows, local, debug)
27
28
  answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)
28
29
 
@@ -33,9 +34,10 @@ class HTMLExtractor(object):
33
34
 
34
35
  return answer, targets_unprocessed
35
36
 
36
- def read_data_from_table(self, target_columns, data, column_keywords=None, group_by_rows=True, local=True, debug=False):
37
+ def read_data_from_table(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
38
+ column_keywords=None, group_by_rows=True, local=True, debug=False):
37
39
  data = self.invoke_pipeline_step(
38
- lambda: merge_html_table_headers(data, column_keywords, debug),
40
+ lambda: merge_html_table_headers(data, column_keywords, similarity_threshold_junk, debug),
39
41
  "Merging HTML table headers...",
40
42
  local
41
43
  )
@@ -54,7 +56,7 @@ class HTMLExtractor(object):
54
56
  print(f"Target columns: {target_columns}")
55
57
 
56
58
  indices, targets, targets_unprocessed = self.invoke_pipeline_step(
57
- lambda: self.calculate_similarity(columns, target_columns, debug),
59
+ lambda: self.calculate_similarity(columns, target_columns, similarity_threshold_column_id, debug),
58
60
  "Calculating cosine similarity between columns and target values...",
59
61
  local
60
62
  )
@@ -73,7 +75,7 @@ class HTMLExtractor(object):
73
75
 
74
76
  return json_result, targets_unprocessed
75
77
 
76
- def calculate_similarity(self, columns, target_columns, debug):
78
+ def calculate_similarity(self, columns, target_columns, similarity_threshold_column_id, debug):
77
79
  model = SentenceTransformer('all-mpnet-base-v2')
78
80
 
79
81
  # Compute embeddings for columns and target values
@@ -93,7 +95,7 @@ class HTMLExtractor(object):
93
95
  most_similar_idx = similarities.argmax().item()
94
96
  most_similar_column = columns[most_similar_idx]
95
97
  similarity_score = similarities[most_similar_idx].item()
96
- if similarity_score > 0.3:
98
+ if similarity_score > similarity_threshold_column_id:
97
99
  if most_similar_idx in most_similar_indices:
98
100
  if similarity_score > most_similar_indices[most_similar_idx][1]:
99
101
  targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
@@ -232,13 +234,15 @@ if __name__ == "__main__":
232
234
  extractor = HTMLExtractor()
233
235
 
234
236
  # answer, targets_unprocessed = extractor.read_data(
235
- # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
236
- # # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
237
- # # 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
237
+ # # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
238
+ # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
239
+ # 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
238
240
  # data_list,
239
- # None,
240
- # # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
241
- # # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
241
+ # 0.5,
242
+ # 0.3,
243
+ # # None,
244
+ # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
245
+ # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
242
246
  # True,
243
247
  # True,
244
248
  # True,
@@ -1 +0,0 @@
1
- __version__ = '0.2.7'
File without changes