sparrow-parse 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/PKG-INFO +1 -1
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/pyproject.toml +1 -1
- sparrow_parse-0.2.7/sparrow_parse/__init__.py +1 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/extractor_helper.py +5 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/html_extractor.py +1 -1
- sparrow_parse-0.2.5/sparrow_parse/__init__.py +0 -1
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/README.md +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/__main__.py +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/data/invoice_1_table.txt +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/__init__.py +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/markdown_processor.py +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/unstructured_processor.py +0 -0
- {sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/temp.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.2.5
|
3
|
+
Version: 0.2.7
|
4
4
|
Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
License: GPL-3.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "sparrow-parse"
|
3
|
-
version = "0.2.5"
|
3
|
+
version = "0.2.7"
|
4
4
|
description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
|
5
5
|
authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
|
6
6
|
license = "GPL-3.0"
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '0.2.7'
|
@@ -273,6 +273,8 @@ def merge_rows_with_rowspan(html):
|
|
273
273
|
|
274
274
|
|
275
275
|
def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
|
276
|
+
html_table = clean_html_table_header_names(html_table)
|
277
|
+
|
276
278
|
# Wrap the HTML string in a StringIO object
|
277
279
|
html_buffer = StringIO(html_table)
|
278
280
|
|
@@ -356,6 +358,9 @@ def clean_html_table_header_names(html_table: str) -> str:
|
|
356
358
|
headers = table.find_all("th")
|
357
359
|
for th in headers:
|
358
360
|
clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
|
361
|
+
# Check if the cleaned name is empty
|
362
|
+
if not clean_header.strip():
|
363
|
+
clean_header = "-"
|
359
364
|
th.string.replace_with(clean_header)
|
360
365
|
|
361
366
|
html_table = str(soup)
|
@@ -24,7 +24,7 @@ class HTMLExtractor(object):
|
|
24
24
|
|
25
25
|
json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, column_keywords,
|
26
26
|
group_by_rows, local, debug)
|
27
|
-
answer = self.add_answer_section(answer, "items" + str(i), json_result)
|
27
|
+
answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)
|
28
28
|
|
29
29
|
if update_targets:
|
30
30
|
target_columns = targets_unprocessed
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = '0.2.5'
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sparrow_parse-0.2.5 → sparrow_parse-0.2.7}/sparrow_parse/extractor/unstructured_processor.py
RENAMED
File without changes
|
File without changes
|