PyPI - sparrow-parse - Versions diffs - 0.2.7__tar.gz → 0.2.9__tar.gz - Mend

sparrow-parse 0.2.7 (tar.gz) → 0.2.9 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{sparrow_parse-0.2.7 → sparrow_parse-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.2.7
+Version: 0.2.9
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 License: GPL-3.0

{sparrow_parse-0.2.7 → sparrow_parse-0.2.9}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sparrow-parse"
-version = "0.2.7"
+version = "0.2.9"
 description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
 authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
 license = "GPL-3.0"

sparrow_parse-0.2.9/sparrow_parse/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = '0.2.9'

sparrow_parse-0.2.9/sparrow_parse/extractor/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file

sparrow_parse-0.2.9/sparrow_parse/extractor/__pycache__/extractor_helper.cpython-310.pyc ADDED Viewed

Binary file

sparrow_parse-0.2.9/sparrow_parse/extractor/__pycache__/html_extractor.cpython-310.pyc ADDED Viewed

Binary file

{sparrow_parse-0.2.7 → sparrow_parse-0.2.9}/sparrow_parse/extractor/extractor_helper.py RENAMED Viewed

@@ -3,9 +3,10 @@ from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 import re
 from io import StringIO
+from rich import print
-def merge_html_table_headers(html_table, column_keywords, debug=False):
+def merge_html_table_headers(html_table, column_keywords, similarity_threshold, debug=False):
     soup = BeautifulSoup(html_table, 'html.parser')
     # Find all thead elements
@@ -18,7 +19,7 @@ def merge_html_table_headers(html_table, column_keywords, debug=False):
         html_table = normalize_html_table(html_table, debug)
         html_table = fix_rowspan_elements(html_table)
         html_table = merge_rows_with_rowspan(html_table)
-        html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
+        html_table = detect_and_remove_junk_columns(html_table, column_keywords, similarity_threshold, debug)
     else:
         # If there is only one thead, return the original table
         return html_table
@@ -272,7 +273,7 @@ def merge_rows_with_rowspan(html):
     return str(new_table_soup.table)
-def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
+def detect_and_remove_junk_columns(html_table, target_columns, similarity_threshold_param, debug=False):
     html_table = clean_html_table_header_names(html_table)
     # Wrap the HTML string in a StringIO object
@@ -295,7 +296,7 @@ def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
     # Identify junk columns based on similarity threshold
     junk_columns = []
-    similarity_threshold = 0.5  # Adjust this threshold as needed
+    similarity_threshold = similarity_threshold_param
     for idx, col_embedding in enumerate(column_embeddings):
         similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
@@ -357,11 +358,11 @@ def clean_html_table_header_names(html_table: str) -> str:
     # Extract the headers and clean them
     headers = table.find_all("th")
     for th in headers:
-        clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
-        # Check if the cleaned name is empty
-        if not clean_header.strip():
-            clean_header = "-"
-        th.string.replace_with(clean_header)
+        if th.string:
+            # Clean the header
+            clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
+            # Keep it empty if the cleaned name is empty
+            th.string.replace_with(clean_header.strip() if clean_header.strip() else "")
     html_table = str(soup)

{sparrow_parse-0.2.7 → sparrow_parse-0.2.9}/sparrow_parse/extractor/html_extractor.py RENAMED Viewed

@@ -12,8 +12,8 @@ class HTMLExtractor(object):
     def __init__(self):
         pass
-    def read_data(self, target_columns, data, column_keywords=None, group_by_rows=True, update_targets=False,
-                  local=True, debug=False):
+    def read_data(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
+                  column_keywords=None, group_by_rows=True, update_targets=False, local=True, debug=False):
         answer = {}
         json_result, targets_unprocessed = [], []
@@ -22,7 +22,8 @@ class HTMLExtractor(object):
             if not target_columns:
                 break
-            json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, column_keywords,
+            json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, similarity_threshold_junk,
+                                                                         similarity_threshold_column_id, column_keywords,
                                                                          group_by_rows, local, debug)
             answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)
@@ -33,9 +34,10 @@ class HTMLExtractor(object):
         return answer, targets_unprocessed
-    def read_data_from_table(self, target_columns, data, column_keywords=None, group_by_rows=True, local=True, debug=False):
+    def read_data_from_table(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
+                             column_keywords=None, group_by_rows=True, local=True, debug=False):
         data = self.invoke_pipeline_step(
-            lambda: merge_html_table_headers(data, column_keywords, debug),
+            lambda: merge_html_table_headers(data, column_keywords, similarity_threshold_junk, debug),
             "Merging HTML table headers...",
             local
         )
@@ -54,7 +56,7 @@ class HTMLExtractor(object):
             print(f"Target columns: {target_columns}")
         indices, targets, targets_unprocessed = self.invoke_pipeline_step(
-            lambda: self.calculate_similarity(columns, target_columns, debug),
+            lambda: self.calculate_similarity(columns, target_columns, similarity_threshold_column_id, debug),
             "Calculating cosine similarity between columns and target values...",
             local
         )
@@ -73,7 +75,7 @@ class HTMLExtractor(object):
         return json_result, targets_unprocessed
-    def calculate_similarity(self, columns, target_columns, debug):
+    def calculate_similarity(self, columns, target_columns, similarity_threshold_column_id, debug):
         model = SentenceTransformer('all-mpnet-base-v2')
         # Compute embeddings for columns and target values
@@ -93,7 +95,7 @@ class HTMLExtractor(object):
             most_similar_idx = similarities.argmax().item()
             most_similar_column = columns[most_similar_idx]
             similarity_score = similarities[most_similar_idx].item()
-            if similarity_score > 0.3:
+            if similarity_score > similarity_threshold_column_id:
                 if most_similar_idx in most_similar_indices:
                     if similarity_score > most_similar_indices[most_similar_idx][1]:
                         targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
@@ -232,13 +234,15 @@ if __name__ == "__main__":
     extractor = HTMLExtractor()
     # answer, targets_unprocessed = extractor.read_data(
-    #     ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
-    #     # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
-    #     #  'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
+    #     # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
+    #     ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
+    #      'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
     #     data_list,
-    #     None,
-    #     # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
-    #     #  'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
+    #     0.5,
+    #     0.3,
+    #     # None,
+    #     ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
+    #      'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
     #     True,
     #     True,
     #     True,