sparrow-parse 0.1.9__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.1.9
+ Version: 0.2.0
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  License: GPL-3.0
@@ -15,10 +15,13 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Topic :: Software Development
+ Requires-Dist: pymupdf4llm (==0.0.5)
  Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: sentence-transformers (==3.0.1)
  Requires-Dist: torch (==2.2.2)
- Requires-Dist: unstructured-inference (==0.7.29)
- Requires-Dist: unstructured[all-docs] (==0.13.6)
+ Requires-Dist: transformers (==4.41.2)
+ Requires-Dist: unstructured-inference (==0.7.33)
+ Requires-Dist: unstructured[all-docs] (==0.14.5)
  Project-URL: Repository, https://github.com/katanaml/sparrow
  Description-Content-Type: text/markdown

@@ -26,7 +29,7 @@ Description-Content-Type: text/markdown

  ## Description

- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing.
+ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.

  ## Install

@@ -34,22 +37,97 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
  pip install sparrow-parse
  ```

- ## Use
+ ## Pre-processing

- Import
+ ### Unstructured

  ```
- from sparrow_parse.pdf.pdf_processor import PDFProcessor
+ from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
+
+ processor = UnstructuredProcessor()
+
+ content, table_content = processor.extract_data(
+     file_path,    # file to process
+     strategy,     # data processing strategy supported by unstructured
+     model_name,   # model supported by unstructured
+     options,      # table extraction into HTML format
+     local,        # True if running from CLI, or False if running from FastAPI
+     debug)        # Debug
  ```

- Usage
+ Example:
+
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+ *strategy* - `hi_res`
+
+ *model_name* - `yolox`
+
+ *options* - `['tables', 'html']`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ### Markdown

  ```
- processor = PDFProcessor()
- result = processor.process_file(file_path, strategy, model_name)
+ from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
+
+ processor = MarkdownProcessor()
+
+ content, table_content = processor.extract_data(
+     file_path,    # file to process
+     options,      # table extraction into HTML format
+     local,        # True if running from CLI, or False if running from FastAPI
+     debug)        # Debug
  ```

- Build for development
+ Example:
+
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+ *options* - `['tables', 'markdown']`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ## Parsing and extraction
+
+ ```
+ from sparrow_parse.extractor.html_extractor import HTMLExtractor
+
+ extractor = HTMLExtractor()
+
+ answer, targets_unprocessed = extractor.read_data(
+     target_columns,    # list of table columns data to fetch
+     data,              # list of HTML tables
+     column_keywords,   # list of valid column names, can be empty. Useful to filter junk content
+     group_by_rows,     # JSON result grouping
+     update_targets,    # set to True if the page contains multiple tables with the same columns
+     local,             # True if running from CLI, or False if running from FastAPI
+     debug)             # Debug
+
+ ```
+
+ Example:
+
+ *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
+
+ *data* - `list of HTML tables`
+
+ *column_keywords* - `None`
+
+ *group_by_rows* - `True`
+
+ *update_targets* - `True`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ## Library build

  ```
  poetry build
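
Note: combining the README's Example values above, a concrete call to the new pre-processing API would look roughly like the sketch below. The invoice path is the README's own example value; the snippet assumes sparrow-parse 0.2.0 is installed and that arguments are passed positionally in the documented order. It illustrates the documented signature and is not additional code from the package.

```
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor

processor = UnstructuredProcessor()

# Example values from the README's "Unstructured" section
content, table_content = processor.extract_data(
    "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf",
    "hi_res",             # strategy supported by unstructured
    "yolox",              # model supported by unstructured
    ['tables', 'html'],   # extract tables into HTML format
    True,                 # local: running from CLI
    True)                 # debug
```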
@@ -0,0 +1,131 @@
+ # Sparrow Parse
+
+ ## Description
+
+ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
+
+ ## Install
+
+ ```
+ pip install sparrow-parse
+ ```
+
+ ## Pre-processing
+
+ ### Unstructured
+
+ ```
+ from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
+
+ processor = UnstructuredProcessor()
+
+ content, table_content = processor.extract_data(
+     file_path,    # file to process
+     strategy,     # data processing strategy supported by unstructured
+     model_name,   # model supported by unstructured
+     options,      # table extraction into HTML format
+     local,        # True if running from CLI, or False if running from FastAPI
+     debug)        # Debug
+ ```
+
+ Example:
+
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+ *strategy* - `hi_res`
+
+ *model_name* - `yolox`
+
+ *options* - `['tables', 'html']`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ### Markdown
+
+ ```
+ from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
+
+ processor = MarkdownProcessor()
+
+ content, table_content = processor.extract_data(
+     file_path,    # file to process
+     options,      # table extraction into HTML format
+     local,        # True if running from CLI, or False if running from FastAPI
+     debug)        # Debug
+ ```
+
+ Example:
+
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+ *options* - `['tables', 'markdown']`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ## Parsing and extraction
+
+ ```
+ from sparrow_parse.extractor.html_extractor import HTMLExtractor
+
+ extractor = HTMLExtractor()
+
+ answer, targets_unprocessed = extractor.read_data(
+     target_columns,    # list of table columns data to fetch
+     data,              # list of HTML tables
+     column_keywords,   # list of valid column names, can be empty. Useful to filter junk content
+     group_by_rows,     # JSON result grouping
+     update_targets,    # set to True if the page contains multiple tables with the same columns
+     local,             # True if running from CLI, or False if running from FastAPI
+     debug)             # Debug
+
+ ```
+
+ Example:
+
+ *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
+
+ *data* - `list of HTML tables`
+
+ *column_keywords* - `None`
+
+ *group_by_rows* - `True`
+
+ *update_targets* - `True`
+
+ *local* - `True`
+
+ *debug* - `True`
+
+ ## Library build
+
+ ```
+ poetry build
+ ```
+
+ Publish to PyPI
+
+ ```
+ poetry publish
+ ```
+
+ ## Commercial usage
+
+ Sparrow is available under the GPL 3.0 license, promoting freedom to use, modify, and distribute the software while ensuring any modifications remain open source under the same license. This aligns with our commitment to supporting the open-source community and fostering collaboration.
+
+ Additionally, we recognize the diverse needs of organizations, including small to medium-sized enterprises (SMEs). Therefore, Sparrow is also offered for free commercial use to organizations with gross revenue below $5 million USD in the past 12 months, enabling them to leverage Sparrow without the financial burden often associated with high-quality software solutions.
+
+ For businesses that exceed this revenue threshold or require usage terms not accommodated by the GPL 3.0 license—such as integrating Sparrow into proprietary software without the obligation to disclose source code modifications—we offer dual licensing options. Dual licensing allows Sparrow to be used under a separate proprietary license, offering greater flexibility for commercial applications and proprietary integrations. This model supports both the project's sustainability and the business's needs for confidentiality and customization.
+
+ If your organization is seeking to utilize Sparrow under a proprietary license, or if you are interested in custom workflows, consulting services, or dedicated support and maintenance options, please contact us at abaranovskis@redsamuraiconsulting.com. We're here to provide tailored solutions that meet your unique requirements, ensuring you can maximize the benefits of Sparrow for your projects and workflows.
+
+ ## Author
+
+ [Katana ML](https://katanaml.io), [Andrej Baranovskij](https://github.com/abaranovskis-redsamurai)
+
+ ## License
+
+ Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
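
Note: the "Parsing and extraction" section above can likewise be turned into a concrete call. A minimal sketch using the README's Example values; the one-row `data` table here is invented for illustration (in practice it would come from `UnstructuredProcessor.extract_data` or a fixture like the one further down in this diff).

```
from sparrow_parse.extractor.html_extractor import HTMLExtractor

extractor = HTMLExtractor()

# Hypothetical one-row input table; real input is a list of extracted HTML tables
data = ['<table><thead><th>description</th><th>qty</th><th>net_price</th></thead>'
        '<tr><td>Wine Glasses</td><td>5,00</td><td>12,00</td></tr></table>']

answer, targets_unprocessed = extractor.read_data(
    ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],  # target_columns
    data,   # list of HTML tables
    None,   # column_keywords: no extra filtering
    True,   # group_by_rows
    True,   # update_targets
    True,   # local: running from CLI
    True)   # debug
```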
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sparrow-parse"
- version = "0.1.9"
+ version = "0.2.0"
  description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
  authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
  license = "GPL-3.0"
@@ -22,9 +22,12 @@ include = [
  [tool.poetry.dependencies]
  python = ">=3.9,<3.12"
  torch = {version = "2.2.2", source = "pypi"}
- unstructured = {version = "0.13.6", extras = ["all-docs"]}
- unstructured-inference = "0.7.29"
+ unstructured = {version = "0.14.5", extras = ["all-docs"]}
+ unstructured-inference = "0.7.33"
  rich = "^13.7.1"
+ pymupdf4llm = "0.0.5"
+ transformers = "4.41.2"
+ sentence-transformers = "3.0.1"


  [tool.poetry.scripts]
@@ -33,4 +36,4 @@ sparrow-parse = 'sparrow_parse:main'

  [build-system]
  requires = ["poetry-core"]
- build-backend = "poetry.core.masonry.api"
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1 @@
+ __version__ = '0.2.0'
@@ -0,0 +1,9 @@
+ [
+ '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
+ Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
+ Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
+ Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
+ NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
+ '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
+ 19,28</td><td>$ 212,09</td></tr></table>'
+ ]
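
Note: the fixture above stores raw HTML tables as Python strings, with line breaks embedded inside cell text. `detect_and_remove_junk_columns` in the module that follows loads such strings with pandas as its first step; below is a minimal sketch of that step, using a trimmed-down table (with the header row wrapped in a `<tr>` for simplicity) rather than the full fixture.

```
from io import StringIO
import pandas as pd

# Trimmed-down version of the fixture's second table
table = ('<table><thead><tr><th>Net worth</th><th>VAT</th><th>Gross worth</th></tr></thead>'
         '<tr><td>192,81</td><td>19,28</td><td>212,09</td></tr></table>')

df = pd.read_html(StringIO(table))[0]
print(df.columns.tolist())  # ['Net worth', 'VAT', 'Gross worth']
```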
@@ -0,0 +1,368 @@
+ from bs4 import BeautifulSoup
+ from sentence_transformers import SentenceTransformer, util
+ import pandas as pd
+ import re
+ from io import StringIO
+
+
+ def merge_html_table_headers(html_table, column_keywords, debug=False):
+     soup = BeautifulSoup(html_table, 'html.parser')
+
+     # Find all thead elements
+     theads = soup.find_all('thead')
+
+     if len(theads) > 1 and column_keywords is not None:
+         html_table = update_table_header_colspan(html_table)
+         html_table = merge_table_header_thead(html_table)
+         html_table = merge_colspan_columns(html_table)
+         html_table = normalize_html_table(html_table, debug)
+         html_table = fix_rowspan_elements(html_table)
+         html_table = merge_rows_with_rowspan(html_table)
+         html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
+     else:
+         # If there is only one thead, return the original table
+         return html_table
+
+     return html_table
+
+
+ def update_table_header_colspan(html_table):
+     soup = BeautifulSoup(html_table, 'html.parser')
+     theads = soup.find_all('thead')
+
+     for thead in theads:
+         for th in thead.find_all('th'):
+             colspan = th.get('colspan')
+             if colspan and int(colspan) > 1:
+                 colspan_count = int(colspan)
+                 th['colspan'] = 1
+                 for _ in range(colspan_count - 1):
+                     new_th = soup.new_tag('th')
+                     th.insert_after(new_th)
+
+     return str(soup)
+
+
+ def merge_table_header_thead(html_table):
+     soup = BeautifulSoup(html_table, 'html.parser')
+     theads = soup.find_all('thead')
+
+     primary_thead = theads[0]
+     secondary_thead = theads[1]
+
+     primary_ths = primary_thead.find_all('th')
+     secondary_ths = secondary_thead.find_all('th')
+
+     for i, th in enumerate(primary_ths):
+         if i < len(secondary_ths):
+             primary_text = th.text.strip()
+             secondary_text = secondary_ths[i].text.strip()
+             if primary_text and secondary_text:
+                 th.string = (primary_text + ' ' + secondary_text).strip()
+             elif not primary_text and secondary_text:
+                 th.string = secondary_text
+             # Remove colspan and rowspan attributes
+             th.attrs.pop('colspan', None)
+             th.attrs.pop('rowspan', None)
+
+     secondary_thead.decompose()
+
+     return str(soup)
+
+
+ def merge_colspan_columns(html_table):
+     # Parse the HTML
+     soup = BeautifulSoup(html_table, 'html.parser')
+
+     # Process colspan attributes by adding empty <td> elements
+     for row in soup.find_all('tr'):
+         cols = []
+         for cell in row.find_all(['th', 'td']):
+             colspan = int(cell.get('colspan', 1))
+             # Add the cell and additional empty cells if colspan is greater than 1
+             cols.append(cell)
+             for _ in range(colspan - 1):
+                 new_td = soup.new_tag('td')
+                 cols.append(new_td)
+             # Remove the colspan attribute
+             if cell.has_attr('colspan'):
+                 del cell['colspan']
+
+         # Replace the row's children with the updated cells
+         row.clear()
+         row.extend(cols)
+
+     return str(soup)
+
+
+ def normalize_html_table(html, debug=False):
+     soup = BeautifulSoup(html, 'html.parser')
+
+     # Find the header row and count the number of cells
+     header = soup.find('thead').find_all(['th', 'td'])
+     header_cell_count = len(header)
+
+     if debug:
+         # Print the number of header cells
+         print(f"Number of cells in header: {header_cell_count}")
+
+     # Find all rows in the table body
+     rows = soup.find_all('tr')
+
+     for row in rows:
+         cells = row.find_all(['td', 'th'])
+         if len(cells) > header_cell_count:
+             extra_cells = len(cells) - header_cell_count
+             for cell in cells:
+                 if cell.text.strip() == '' and extra_cells > 0:
+                     cell.decompose()
+                     extra_cells -= 1
+         elif len(cells) < header_cell_count:
+             missing_cells = header_cell_count - len(cells)
+             for _ in range(missing_cells):
+                 new_cell = soup.new_tag('td')
+                 row.insert(0, new_cell)
+
+     return str(soup)
+
+
+ def fix_rowspan_elements(html_table):
+     # Parse the HTML table
+     soup = BeautifulSoup(html_table, 'html.parser')
+
+     # Find all table rows
+     rows = soup.find_all('tr')
+
+     # Dictionary to store rows with rowspan elements
+     rowspan_dict = {}
+
+     # Iterate over each row
+     for row_index, row in enumerate(rows):
+         # Find all cells in the row
+         cells = row.find_all(['td', 'th'])
+
+         # Iterate over each cell
+         for cell_index, cell in enumerate(cells):
+             # Check if the cell has a rowspan attribute
+             if cell.has_attr('rowspan'):
+                 # Store the rowspan value and cell position
+                 rowspan_value = int(cell['rowspan'])
+                 if row_index not in rowspan_dict:
+                     rowspan_dict[row_index] = []
+                 rowspan_dict[row_index].append((cell_index, rowspan_value))
+
+     # List to store the number of rows until the next rowspan row
+     rows_below_until_next_rowspan = []
+
+     # Get the sorted row indices that have rowspan elements
+     sorted_row_indices = sorted(rowspan_dict.keys())
+
+     # Calculate rows below each rowspan row until the next rowspan row
+     for i in range(len(sorted_row_indices)):
+         current_row = sorted_row_indices[i]
+         if i < len(sorted_row_indices) - 1:
+             next_row = sorted_row_indices[i + 1]
+             rows_below = next_row - current_row - 1
+         else:
+             rows_below = len(rows) - current_row - 1
+         rows_below_until_next_rowspan.append((current_row, rows_below))
+
+     # Detect rows where rowspan value is incorrect
+     rows_with_bad_rowspan = []
+     for row_index, rows_below in rows_below_until_next_rowspan:
+         if row_index in rowspan_dict:
+             for cell_index, rowspan_value in rowspan_dict[row_index]:
+                 if rowspan_value - 1 < rows_below:
+                     print(f"Row {row_index} has a large rowspan value: {rowspan_value}")
+                     rows_with_bad_rowspan.append(row_index)
+                     break
+
+     # Modify the HTML table to adjust the rowspan attributes
+     for row_index in rows_with_bad_rowspan:
+         if row_index in rowspan_dict:
+             for cell_index, rowspan_value in rowspan_dict[row_index]:
+                 # Find the cell with the rowspan attribute
+                 cell = rows[row_index].find_all(['td', 'th'])[cell_index]
+                 # Remove the rowspan attribute
+                 del cell['rowspan']
+                 # Find the next row and assign the rowspan value
+                 next_row_index = row_index + 1
+                 if next_row_index < len(rows):
+                     next_row_cells = rows[next_row_index].find_all(['td', 'th'])
+                     if len(next_row_cells) > cell_index:
+                         next_row_cell = next_row_cells[cell_index]
+                         next_row_cell['rowspan'] = rowspan_value
+                     else:
+                         # Create a new cell if it does not exist
+                         new_cell = soup.new_tag(cell.name)
+                         new_cell['rowspan'] = rowspan_value
+                         new_cell.string = cell.string
+                         rows[next_row_index].append(new_cell)
+
+     # Return the modified HTML table
+     return str(soup)
+
+
+ def merge_rows_with_rowspan(html):
+     # Parse the HTML table using BeautifulSoup
+     soup = BeautifulSoup(html, 'html.parser')
+
+     # Extract the header
+     thead = soup.find('thead')
+
+     # Find all rows
+     rows = soup.find_all('tr')
+
+     result = []
+     i = 0
+
+     while i < len(rows):
+         row = rows[i]
+         # Check if any td in the row has a rowspan attribute
+         for td in row.find_all('td'):
+             if td.has_attr('rowspan'):
+                 rowspan_value = int(td['rowspan'])
+                 result.append(row)
+
+                 skip_concatenation = False
+                 concatenation_pairs = []
+
+                 # Add rows below the current row based on the rowspan number
+                 for j in range(1, rowspan_value):
+                     if i + j < len(rows):
+                         below_row = rows[i + j]
+
+                         # Compare cells
+                         row_cells = row.find_all('td')
+                         below_row_cells = below_row.find_all('td')
+                         min_length = min(len(row_cells), len(below_row_cells))
+
+                         for k in range(min_length):
+                             if is_numeric(row_cells[k].get_text(strip=True)) and is_numeric(below_row_cells[k].get_text(strip=True)):
+                                 skip_concatenation = True
+                                 break
+                             else:
+                                 concatenation_pairs.append((row_cells[k], below_row_cells[k]))
+
+                         if skip_concatenation:
+                             result.append(below_row)
+
+                 if not skip_concatenation:
+                     for row_cell, below_row_cell in concatenation_pairs:
+                         concatenated_text = (row_cell.get_text(strip=True) + ' ' + below_row_cell.get_text(strip=True)).strip()
+                         row_cell.string = concatenated_text
+
+                 i += rowspan_value - 1  # Skip the rows that have been added
+                 break
+             else:
+                 result.append(row)
+                 break
+         i += 1
+
+     # Convert result list of rows back to an HTML table string
+     new_table_soup = BeautifulSoup(f'<table>{str(thead)}</table>', 'html.parser')
+     tbody = new_table_soup.new_tag('tbody')
+     new_table_soup.table.append(tbody)
+     for row in result:
+         for td in row.find_all('td'):
+             if td.has_attr('rowspan'):
+                 del td['rowspan']
+         tbody.append(row)
+
+     return str(new_table_soup.table)
+
+
+ def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
+     # Wrap the HTML string in a StringIO object
+     html_buffer = StringIO(html_table)
+
+     # Read the HTML table
+     df = pd.read_html(html_buffer)[0]
+
+     model = SentenceTransformer('all-mpnet-base-v2')
+
+     # Get the column names of the dataframe
+     column_names = df.columns.tolist()
+
+     # Calculate the similarity of each column name to the target column names
+     target_embeddings = model.encode(target_columns)
+     column_embeddings = model.encode(column_names)
+
+     # Initialize a dictionary to store the similarity scores
+     similarity_scores = {}
+
+     # Identify junk columns based on similarity threshold
+     junk_columns = []
+     similarity_threshold = 0.5  # Adjust this threshold as needed
+
+     for idx, col_embedding in enumerate(column_embeddings):
+         similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
+         max_similarity = max(similarities)
+         max_similarity_idx = similarities.argmax().item()  # Get the index of the max similarity
+         similarity_scores[column_names[idx]] = (
+             max_similarity.item(), target_columns[max_similarity_idx])  # Store similarity score and target column name
+         if max_similarity < similarity_threshold:
+             junk_columns.append(column_names[idx])
+
+     if debug:
+         # Print the similarity scores for debugging purposes
+         for column, (score, target_col) in similarity_scores.items():
+             print(f"Column: {column}, Similarity: {score:.4f}, Target Column: {target_col}")
+
+     # Handle junk columns by concatenating their values to the nearest column on the left
+     for junk_col in junk_columns:
+         junk_col_index = column_names.index(junk_col)
+         if junk_col_index > 0:
+             nearest_col = column_names[junk_col_index - 1]
+             df[nearest_col] = df.apply(
+                 lambda row: str(row[junk_col]) if pd.isna(row[nearest_col]) and pd.notna(row[junk_col])
+                 else (str(row[nearest_col]) + ' ' + str(row[junk_col])) if pd.notna(row[junk_col])
+                 else row[nearest_col],
+                 axis=1
+             )
+             df.drop(columns=[junk_col], inplace=True)
+
+     # Replace any remaining NaN values with empty strings
+     df = df.fillna('')
+
+     if debug:
+         print(f"Junk columns: {junk_columns}")
+         print(df.to_string())
+
+     # Convert the result into an HTML table
+     html_table = df.to_html(index=False)
+
+     if debug:
+         print(html_table)
+
+     return html_table
+
+
+ def clean_html_table_header_names(html_table: str) -> str:
+     """
+     Cleans the headers of an HTML table by removing junk characters and returns the updated HTML as a string.
+
+     Parameters:
+     html (str): The HTML content containing the table.
+
+     Returns:
+     str: The updated HTML table with cleaned headers.
+     """
+     # Parse the HTML table
+     soup = BeautifulSoup(html_table, "html.parser")
+     table = soup.find("table")
+
+     # Extract the headers and clean them
+     headers = table.find_all("th")
+     for th in headers:
+         clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
+         th.string.replace_with(clean_header)
+
+     html_table = str(soup)
+
+     return html_table
+
+
+ def is_numeric(value):
+     # Check if the value is numeric
+     return bool(re.match(r'^\d+(?:,\d{3})*(?:\.\d+)?$', value))
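
Note: to make the header-merging steps above concrete, here is a small, self-contained sketch (not part of the package) that runs the first two helpers on a table with a two-row header, the shape of input `merge_html_table_headers` is written to handle.

```
# Assumes the helper functions above are importable or defined in the same module
two_header_table = (
    '<table>'
    '<thead><th colspan="2">Amount</th><th>VAT</th></thead>'
    '<thead><th>Net</th><th>Gross</th><th></th></thead>'
    '<tr><td>192,81</td><td>212,09</td><td>10%</td></tr>'
    '</table>')

# Expand the colspan in the first header row, then fold both header rows into one
merged = merge_table_header_thead(update_table_header_colspan(two_header_table))
# Resulting header cells: "Amount Net", "Gross", "VAT"
```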