sparrow-parse 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.2.6'
1
+ __version__ = '0.2.7'
@@ -273,6 +273,8 @@ def merge_rows_with_rowspan(html):
273
273
 
274
274
 
275
275
  def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
276
+ html_table = clean_html_table_header_names(html_table)
277
+
276
278
  # Wrap the HTML string in a StringIO object
277
279
  html_buffer = StringIO(html_table)
278
280
 
@@ -356,6 +358,9 @@ def clean_html_table_header_names(html_table: str) -> str:
356
358
  headers = table.find_all("th")
357
359
  for th in headers:
358
360
  clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
361
+ # Check if the cleaned name is empty
362
+ if not clean_header.strip():
363
+ clean_header = "-"
359
364
  th.string.replace_with(clean_header)
360
365
 
361
366
  html_table = str(soup)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
@@ -1,13 +1,13 @@
1
- sparrow_parse/__init__.py,sha256=G3YzB5xTyZTVDHgjxzwfUZHPQbqRjfJO_4NfpO-O5Lg,21
1
+ sparrow_parse/__init__.py,sha256=k14HNX-j7JGQmMcNd5V9YYE1rd2NN5QFLz4W-ttaFD0,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
3
  sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
4
4
  sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- sparrow_parse/extractor/extractor_helper.py,sha256=fvA7iuGdpW_WFc4jkALBzQbACqlE5x_K9ScW-E6RCoY,13357
5
+ sparrow_parse/extractor/extractor_helper.py,sha256=zia9a1duBENQM2T4yyt-TD4vhuz8ubyJ8aESQBXPli8,13530
6
6
  sparrow_parse/extractor/html_extractor.py,sha256=juPl01Ws83jN59u3ZVSq7BSUlOHjYcIddsM9-sqAHmc,9492
7
7
  sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
8
  sparrow_parse/extractor/unstructured_processor.py,sha256=z46aXacMvfW_wmsACs0LtamoMc19eogGd5fVVAj4vIo,6771
9
9
  sparrow_parse/temp.py,sha256=Hl1wPOEytXnfbUobU8BJgEswPsfncibbQdwrpSHtlOo,513
10
- sparrow_parse-0.2.6.dist-info/METADATA,sha256=_fbNIonxy8eYcb_UjRzoCCCyRGzlz6n6mf_VAixiv4c,5622
11
- sparrow_parse-0.2.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
12
- sparrow_parse-0.2.6.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
13
- sparrow_parse-0.2.6.dist-info/RECORD,,
10
+ sparrow_parse-0.2.7.dist-info/METADATA,sha256=QQHYCvcEa-l4bAt9ZIowAXKbvcUYyVTb59D0niT9Fgs,5622
11
+ sparrow_parse-0.2.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
12
+ sparrow_parse-0.2.7.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
13
+ sparrow_parse-0.2.7.dist-info/RECORD,,