sparrow-parse 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
sparrow_parse/extractors/html_extractor.py
@@ -1,251 +0,0 @@
- from rich import print
- from sentence_transformers import SentenceTransformer, util
- from bs4 import BeautifulSoup
- import json
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from sparrow_parse.helpers.html_extractor_helper import merge_html_table_headers
- from sparrow_parse.helpers.html_extractor_helper import clean_html_table_header_names
-
-
- class HTMLExtractor(object):
-     def __init__(self):
-         pass
-
-     def read_data(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
-                   column_keywords=None, group_by_rows=True, update_targets=False, local=True, debug=False):
-         answer = {}
-
-         json_result, targets_unprocessed = [], []
-
-         for i, table in enumerate(data):
-             if not target_columns:
-                 break
-
-             json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, similarity_threshold_junk,
-                                                                          similarity_threshold_column_id, column_keywords,
-                                                                          group_by_rows, local, debug)
-             answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)
-
-             if update_targets:
-                 target_columns = targets_unprocessed
-
-         answer = self.format_json_output(answer)
-
-         return answer, targets_unprocessed
-
-     def read_data_from_table(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
-                              column_keywords=None, group_by_rows=True, local=True, debug=False):
-         data = self.invoke_pipeline_step(
-             lambda: merge_html_table_headers(data, column_keywords, similarity_threshold_junk, debug),
-             "Merging HTML table headers...",
-             local
-         )
-
-         data = self.invoke_pipeline_step(
-             lambda: clean_html_table_header_names(data),
-             "Cleaning HTML table headers...",
-             local
-         )
-
-         columns = self.get_table_column_names(data)
-
-         if debug:
-             print("\n")
-             print(f"Columns: {columns}")
-             print(f"Target columns: {target_columns}")
-
-         indices, targets, targets_unprocessed = self.invoke_pipeline_step(
-             lambda: self.calculate_similarity(columns, target_columns, similarity_threshold_column_id, debug),
-             "Calculating cosine similarity between columns and target values...",
-             local
-         )
-
-         if debug:
-             print(f"Unprocessed targets: {targets_unprocessed}")
-
-         # Extracting data
-         extracted_data = self.invoke_pipeline_step(
-             lambda: self.extract_columns_from_table(data, indices, targets, group_by_rows),
-             "Extracting data from the table...",
-             local
-         )
-
-         json_result = self.convert_to_json(extracted_data)
-
-         return json_result, targets_unprocessed
-
-     def calculate_similarity(self, columns, target_columns, similarity_threshold_column_id, debug):
-         model = SentenceTransformer('all-mpnet-base-v2')
-
-         # Compute embeddings for columns and target values
-         column_embeddings = model.encode(columns)
-         target_embeddings = model.encode(target_columns)
-
-         # Dictionary mapping each matched column index to its best (target, score) pair
-         most_similar_indices = {}
-         targets_unprocessed = []
-
-         # Calculate cosine similarity between each column and target value
-         similarity_scores = util.pytorch_cos_sim(column_embeddings, target_embeddings)
-
-         # Find the most similar column for each target value and provide the order ID
-         for idx, target in enumerate(target_columns):
-             similarities = similarity_scores[:, idx]
-             most_similar_idx = similarities.argmax().item()
-             most_similar_column = columns[most_similar_idx]
-             similarity_score = similarities[most_similar_idx].item()
-             if similarity_score > similarity_threshold_column_id:
-                 if most_similar_idx in most_similar_indices:
-                     if similarity_score > most_similar_indices[most_similar_idx][1]:
-                         targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
-                         most_similar_indices[most_similar_idx] = (target, similarity_score)
-                     else:
-                         targets_unprocessed.append(target)
-                 else:
-                     most_similar_indices[most_similar_idx] = (target, similarity_score)
-             else:
-                 targets_unprocessed.append(target)
-             if debug:
-                 print(
-                     f"The most similar column to '{target}' is '{most_similar_column}' with a similarity score of {similarity_score:.4f} and order ID {most_similar_idx}")
-
-         most_similar_indices = dict(sorted(most_similar_indices.items()))
-
-         indices = []
-         targets = []
-
-         for idx, (target, _) in most_similar_indices.items():
-             indices.append(idx)
-             targets.append(target)
-
-         if debug:
-             print()
-             for idx, (target, score) in most_similar_indices.items():
-                 print(f"Target: '{target}', Column: '{columns[idx]}', Column ID: {idx}, Score: {score:.4f}")
-             print()
-
-         return indices, targets, targets_unprocessed
-
-     def extract_columns_from_table(self, html_table, column_ids, target_columns, group_by_rows=False):
-         soup = BeautifulSoup(html_table, 'html.parser')
-         table = soup.find('table')
-
-         if group_by_rows:
-             # Initialize a list to store each row's data as a dictionary
-             extracted_data = []
-         else:
-             # Initialize the extracted data with custom column names
-             extracted_data = {target_columns[i]: [] for i in range(len(column_ids))}
-
-         # Extract row information
-         rows = table.find_all('tr')
-
-         for row in rows:
-             # Skip the header row
-             if row.find_all('th'):
-                 continue
-
-             cells = row.find_all('td')
-             if cells:  # Ensure the row contains data cells
-                 if group_by_rows:
-                     row_data = {}
-                     for idx, col_id in enumerate(column_ids):
-                         value = cells[col_id].text.strip() if col_id < len(cells) else ''
-                         value = value.replace('|', '').strip()
-                         row_data[target_columns[idx]] = value
-                     extracted_data.append(row_data)
-                 else:
-                     for idx, col_id in enumerate(column_ids):
-                         value = cells[col_id].text.strip() if col_id < len(cells) else ''
-                         value = value.replace('|', '').strip()
-                         extracted_data[target_columns[idx]].append(value)
-
-         return extracted_data
-
-     def convert_to_json(self, extracted_data):
-         return json.dumps(extracted_data, indent=4)
-
-     def get_table_column_names(self, html_table):
-         """
-         Extract column names from an HTML table.
-
-         Args:
-             html_table (str): The HTML content of the table.
-
-         Returns:
-             list: A list of column names.
-         """
-         # Parse the HTML content using BeautifulSoup with html.parser
-         soup = BeautifulSoup(html_table, 'html.parser')
-
-         # Find the <thead> tag
-         thead = soup.find('thead')
-
-         # Extract column names into a list
-         column_names = [th.get_text() for th in thead.find_all('th')]
-
-         return column_names
-
-     def invoke_pipeline_step(self, task_call, task_description, local):
-         if local:
-             with Progress(
-                 SpinnerColumn(),
-                 TextColumn("[progress.description]{task.description}"),
-                 transient=False,
-             ) as progress:
-                 progress.add_task(description=task_description, total=None)
-                 ret = task_call()
-         else:
-             print(task_description)
-             ret = task_call()
-
-         return ret
-
-     def add_answer_section(self, answer, section_name, answer_table):
-         if not isinstance(answer, dict):
-             raise ValueError("The answer should be a dictionary.")
-
-         # Parse answer_table if it is a JSON string
-         if isinstance(answer_table, str):
-             answer_table = json.loads(answer_table)
-
-         answer[section_name] = answer_table
-         return answer
-
-     def format_json_output(self, answer):
-         formatted_json = json.dumps(answer, indent=4)
-         formatted_json = formatted_json.replace('", "', '",\n"')
-         formatted_json = formatted_json.replace('}, {', '},\n{')
-         return formatted_json
-
-
- if __name__ == "__main__":
-     # to run for debugging, navigate above sparrow_parse and run the following command:
-     # python -m sparrow_parse.extractors.html_extractor
-
-     # with open('data/invoice_1_table.txt', 'r') as file:
-     #     file_content = file.read()
-     #
-     # file_content = file_content.strip()[1:-1].strip()
-     # data_list = re.split(r"',\s*'", file_content)
-     # data_list = [item.strip(" '") for item in data_list]
-
-     extractor = HTMLExtractor()
-
-     # answer, targets_unprocessed = extractor.read_data(
-     #     # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
-     #     ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
-     #      'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
-     #     data_list,
-     #     0.5,
-     #     0.3,
-     #     # None,
-     #     ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
-     #      'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
-     #     True,
-     #     False,
-     #     True,
-     #     True)
-     #
-     # print(answer)
-     # print(targets_unprocessed)
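
Note: the removed module above was the public entry point for HTML table extraction. A minimal usage sketch of HTMLExtractor.read_data, reconstructed from the commented-out __main__ block (the 0.5 / 0.3 thresholds mirror that block; the sample table content is an assumption for illustration):

    from sparrow_parse.extractors.html_extractor import HTMLExtractor

    # read_data expects a list of HTML table strings; this one-row table is hypothetical.
    html_table = (
        "<table><thead><tr><th>description</th><th>qty</th><th>net_price</th></tr></thead>"
        "<tbody><tr><td>Widget</td><td>2</td><td>9.99</td></tr></tbody></table>"
    )

    extractor = HTMLExtractor()
    answer, targets_unprocessed = extractor.read_data(
        ['description', 'qty', 'net_price'],  # target columns to match against table headers
        [html_table],                         # input tables; each yields an "itemsN" section
        0.5,                                  # similarity_threshold_junk (junk-column removal)
        0.3,                                  # similarity_threshold_column_id (column matching)
        column_keywords=None,
        group_by_rows=True,
        update_targets=False,
        local=True,
        debug=False,
    )
    print(answer)               # formatted JSON string keyed by "items1"
    print(targets_unprocessed)  # targets that matched no column above the threshold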
sparrow_parse/helpers/html_extractor_helper.py
@@ -1,374 +0,0 @@
- from bs4 import BeautifulSoup
- from sentence_transformers import SentenceTransformer, util
- import pandas as pd
- import re
- from io import StringIO
- from rich import print
-
-
- def merge_html_table_headers(html_table, column_keywords, similarity_threshold, debug=False):
-     soup = BeautifulSoup(html_table, 'html.parser')
-
-     # Find all thead elements
-     theads = soup.find_all('thead')
-
-     if len(theads) > 1 and column_keywords is not None:
-         html_table = update_table_header_colspan(html_table)
-         html_table = merge_table_header_thead(html_table)
-         html_table = merge_colspan_columns(html_table)
-         html_table = normalize_html_table(html_table, debug)
-         html_table = fix_rowspan_elements(html_table)
-         html_table = merge_rows_with_rowspan(html_table)
-         html_table = detect_and_remove_junk_columns(html_table, column_keywords, similarity_threshold, debug)
-     else:
-         # If there is only one thead (or no column keywords), return the original table
-         return html_table
-
-     return html_table
-
-
- def update_table_header_colspan(html_table):
-     soup = BeautifulSoup(html_table, 'html.parser')
-     theads = soup.find_all('thead')
-
-     for thead in theads:
-         for th in thead.find_all('th'):
-             colspan = th.get('colspan')
-             if colspan and int(colspan) > 1:
-                 colspan_count = int(colspan)
-                 th['colspan'] = 1
-                 for _ in range(colspan_count - 1):
-                     new_th = soup.new_tag('th')
-                     th.insert_after(new_th)
-
-     return str(soup)
-
-
- def merge_table_header_thead(html_table):
-     soup = BeautifulSoup(html_table, 'html.parser')
-     theads = soup.find_all('thead')
-
-     primary_thead = theads[0]
-     secondary_thead = theads[1]
-
-     primary_ths = primary_thead.find_all('th')
-     secondary_ths = secondary_thead.find_all('th')
-
-     for i, th in enumerate(primary_ths):
-         if i < len(secondary_ths):
-             primary_text = th.text.strip()
-             secondary_text = secondary_ths[i].text.strip()
-             if primary_text and secondary_text:
-                 th.string = (primary_text + ' ' + secondary_text).strip()
-             elif not primary_text and secondary_text:
-                 th.string = secondary_text
-             # Remove colspan and rowspan attributes
-             th.attrs.pop('colspan', None)
-             th.attrs.pop('rowspan', None)
-
-     secondary_thead.decompose()
-
-     return str(soup)
-
-
- def merge_colspan_columns(html_table):
-     # Parse the HTML
-     soup = BeautifulSoup(html_table, 'html.parser')
-
-     # Process colspan attributes by adding empty <td> elements
-     for row in soup.find_all('tr'):
-         cols = []
-         for cell in row.find_all(['th', 'td']):
-             colspan = int(cell.get('colspan', 1))
-             # Add the cell and additional empty cells if colspan is greater than 1
-             cols.append(cell)
-             for _ in range(colspan - 1):
-                 new_td = soup.new_tag('td')
-                 cols.append(new_td)
-             # Remove the colspan attribute
-             if cell.has_attr('colspan'):
-                 del cell['colspan']
-
-         # Replace the row's children with the updated cells
-         row.clear()
-         row.extend(cols)
-
-     return str(soup)
-
-
- def normalize_html_table(html, debug=False):
-     soup = BeautifulSoup(html, 'html.parser')
-
-     # Find the header row and count the number of cells
-     header = soup.find('thead').find_all(['th', 'td'])
-     header_cell_count = len(header)
-
-     if debug:
-         # Print the number of header cells
-         print(f"Number of cells in header: {header_cell_count}")
-
-     # Find all rows in the table body
-     rows = soup.find_all('tr')
-
-     for row in rows:
-         cells = row.find_all(['td', 'th'])
-         if len(cells) > header_cell_count:
-             extra_cells = len(cells) - header_cell_count
-             for cell in cells:
-                 if cell.text.strip() == '' and extra_cells > 0:
-                     cell.decompose()
-                     extra_cells -= 1
-         elif len(cells) < header_cell_count:
-             missing_cells = header_cell_count - len(cells)
-             for _ in range(missing_cells):
-                 new_cell = soup.new_tag('td')
-                 row.insert(0, new_cell)
-
-     return str(soup)
-
-
- def fix_rowspan_elements(html_table):
-     # Parse the HTML table
-     soup = BeautifulSoup(html_table, 'html.parser')
-
-     # Find all table rows
-     rows = soup.find_all('tr')
-
-     # Dictionary to store rows with rowspan elements
-     rowspan_dict = {}
-
-     # Iterate over each row
-     for row_index, row in enumerate(rows):
-         # Find all cells in the row
-         cells = row.find_all(['td', 'th'])
-
-         # Iterate over each cell
-         for cell_index, cell in enumerate(cells):
-             # Check if the cell has a rowspan attribute
-             if cell.has_attr('rowspan'):
-                 # Store the rowspan value and cell position
-                 rowspan_value = int(cell['rowspan'])
-                 if row_index not in rowspan_dict:
-                     rowspan_dict[row_index] = []
-                 rowspan_dict[row_index].append((cell_index, rowspan_value))
-
-     # List to store the number of rows until the next rowspan row
-     rows_below_until_next_rowspan = []
-
-     # Get the sorted row indices that have rowspan elements
-     sorted_row_indices = sorted(rowspan_dict.keys())
-
-     # Calculate rows below each rowspan row until the next rowspan row
-     for i in range(len(sorted_row_indices)):
-         current_row = sorted_row_indices[i]
-         if i < len(sorted_row_indices) - 1:
-             next_row = sorted_row_indices[i + 1]
-             rows_below = next_row - current_row - 1
-         else:
-             rows_below = len(rows) - current_row - 1
-         rows_below_until_next_rowspan.append((current_row, rows_below))
-
-     # Detect rows where rowspan value is incorrect
-     rows_with_bad_rowspan = []
-     for row_index, rows_below in rows_below_until_next_rowspan:
-         if row_index in rowspan_dict:
-             for cell_index, rowspan_value in rowspan_dict[row_index]:
-                 if rowspan_value - 1 < rows_below:
-                     print(f"Row {row_index} has a large rowspan value: {rowspan_value}")
-                     rows_with_bad_rowspan.append(row_index)
-                     break
-
-     # Modify the HTML table to adjust the rowspan attributes
-     for row_index in rows_with_bad_rowspan:
-         if row_index in rowspan_dict:
-             for cell_index, rowspan_value in rowspan_dict[row_index]:
-                 # Find the cell with the rowspan attribute
-                 cell = rows[row_index].find_all(['td', 'th'])[cell_index]
-                 # Remove the rowspan attribute
-                 del cell['rowspan']
-                 # Find the next row and assign the rowspan value
-                 next_row_index = row_index + 1
-                 if next_row_index < len(rows):
-                     next_row_cells = rows[next_row_index].find_all(['td', 'th'])
-                     if len(next_row_cells) > cell_index:
-                         next_row_cell = next_row_cells[cell_index]
-                         next_row_cell['rowspan'] = rowspan_value
-                     else:
-                         # Create a new cell if it does not exist
-                         new_cell = soup.new_tag(cell.name)
-                         new_cell['rowspan'] = rowspan_value
-                         new_cell.string = cell.string
-                         rows[next_row_index].append(new_cell)
-
-     # Return the modified HTML table
-     return str(soup)
-
-
- def merge_rows_with_rowspan(html):
-     # Parse the HTML table using BeautifulSoup
-     soup = BeautifulSoup(html, 'html.parser')
-
-     # Extract the header
-     thead = soup.find('thead')
-
-     # Find all rows
-     rows = soup.find_all('tr')
-
-     result = []
-     i = 0
-
-     while i < len(rows):
-         row = rows[i]
-         # Check if any td in the row has a rowspan attribute
-         for td in row.find_all('td'):
-             if td.has_attr('rowspan'):
-                 rowspan_value = int(td['rowspan'])
-                 result.append(row)
-
-                 skip_concatenation = False
-                 concatenation_pairs = []
-
-                 # Add rows below the current row based on the rowspan number
-                 for j in range(1, rowspan_value):
-                     if i + j < len(rows):
-                         below_row = rows[i + j]
-
-                         # Compare cells
-                         row_cells = row.find_all('td')
-                         below_row_cells = below_row.find_all('td')
-                         min_length = min(len(row_cells), len(below_row_cells))
-
-                         for k in range(min_length):
-                             if is_numeric(row_cells[k].get_text(strip=True)) and is_numeric(below_row_cells[k].get_text(strip=True)):
-                                 skip_concatenation = True
-                                 break
-                             else:
-                                 concatenation_pairs.append((row_cells[k], below_row_cells[k]))
-
-                         if skip_concatenation:
-                             result.append(below_row)
-
-                 if not skip_concatenation:
-                     for row_cell, below_row_cell in concatenation_pairs:
-                         concatenated_text = (row_cell.get_text(strip=True) + ' ' + below_row_cell.get_text(strip=True)).strip()
-                         row_cell.string = concatenated_text
-
-                 i += rowspan_value - 1  # Skip the rows that have been added
-                 break
-             else:
-                 result.append(row)
-                 break
-         i += 1
-
-     # Convert result list of rows back to an HTML table string
-     new_table_soup = BeautifulSoup(f'<table>{str(thead)}</table>', 'html.parser')
-     tbody = new_table_soup.new_tag('tbody')
-     new_table_soup.table.append(tbody)
-     for row in result:
-         for td in row.find_all('td'):
-             if td.has_attr('rowspan'):
-                 del td['rowspan']
-         tbody.append(row)
-
-     return str(new_table_soup.table)
-
-
- def detect_and_remove_junk_columns(html_table, target_columns, similarity_threshold_param, debug=False):
-     html_table = clean_html_table_header_names(html_table)
-
-     # Wrap the HTML string in a StringIO object
-     html_buffer = StringIO(html_table)
-
-     # Read the HTML table
-     df = pd.read_html(html_buffer)[0]
-
-     model = SentenceTransformer('all-mpnet-base-v2')
-
-     # Get the column names of the dataframe
-     column_names = df.columns.tolist()
-
-     # Calculate the similarity of each column name to the target column names
-     target_embeddings = model.encode(target_columns)
-     column_embeddings = model.encode(column_names)
-
-     # Initialize a dictionary to store the similarity scores
-     similarity_scores = {}
-
-     # Identify junk columns based on similarity threshold
-     junk_columns = []
-     similarity_threshold = similarity_threshold_param
-
-     for idx, col_embedding in enumerate(column_embeddings):
-         similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
-         max_similarity = max(similarities)
-         max_similarity_idx = similarities.argmax().item()  # Get the index of the max similarity
-         similarity_scores[column_names[idx]] = (
-             max_similarity.item(), target_columns[max_similarity_idx])  # Store similarity score and target column name
-         if max_similarity < similarity_threshold:
-             junk_columns.append(column_names[idx])
-
-     if debug:
-         # Print the similarity scores for debugging purposes
-         for column, (score, target_col) in similarity_scores.items():
-             print(f"Column: {column}, Similarity: {score:.4f}, Target Column: {target_col}")
-
-     # Handle junk columns by concatenating their values to the nearest column on the left
-     for junk_col in junk_columns:
-         junk_col_index = column_names.index(junk_col)
-         if junk_col_index > 0:
-             nearest_col = column_names[junk_col_index - 1]
-             df[nearest_col] = df.apply(
-                 lambda row: str(row[junk_col]) if pd.isna(row[nearest_col]) and pd.notna(row[junk_col])
-                 else (str(row[nearest_col]) + ' ' + str(row[junk_col])) if pd.notna(row[junk_col])
-                 else row[nearest_col],
-                 axis=1
-             )
-             df.drop(columns=[junk_col], inplace=True)
-
-     # Replace any remaining NaN values with empty strings
-     df = df.fillna('')
-
-     if debug:
-         print(f"Junk columns: {junk_columns}")
-         print(df.to_string())
-
-     # Convert the result into an HTML table
-     html_table = df.to_html(index=False)
-
-     if debug:
-         print(html_table)
-
-     return html_table
-
-
- def clean_html_table_header_names(html_table: str) -> str:
-     """
-     Cleans the headers of an HTML table by removing junk characters and returns the updated HTML as a string.
-
-     Parameters:
-         html_table (str): The HTML content containing the table.
-
-     Returns:
-         str: The updated HTML table with cleaned headers.
-     """
-     # Parse the HTML table
-     soup = BeautifulSoup(html_table, "html.parser")
-     table = soup.find("table")
-
-     # Extract the headers and clean them
-     headers = table.find_all("th")
-     for th in headers:
-         if th.string:
-             # Clean the header
-             clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
-             # Keep it empty if the cleaned name is empty
-             th.string.replace_with(clean_header.strip() if clean_header.strip() else "")
-
-     html_table = str(soup)
-
-     return html_table
-
-
- def is_numeric(value):
-     # Check if the value is numeric
-     return bool(re.match(r'^\d+(?:,\d{3})*(?:\.\d+)?$', value))
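
For completeness, a minimal sketch of the removed helper pipeline's entry point, merge_html_table_headers, which only rewrites tables that carry more than one thead. The sample table, keyword list, and 0.3 threshold below are assumptions for illustration; note that detect_and_remove_junk_columns loads the all-mpnet-base-v2 SentenceTransformer, so the first run downloads that model:

    from sparrow_parse.helpers.html_extractor_helper import (
        clean_html_table_header_names,
        merge_html_table_headers,
    )

    # Two stacked header rows: "Net" and "Price" should merge into "Net Price".
    html = (
        "<table>"
        "<thead><tr><th>Item</th><th>Net</th></tr></thead>"
        "<thead><tr><th></th><th>Price</th></tr></thead>"
        "<tbody><tr><td>Widget</td><td>9.99</td></tr></tbody>"
        "</table>"
    )

    merged = merge_html_table_headers(html, ['item', 'net_price'], 0.3)
    print(clean_html_table_header_names(merged))  # merged single-thead table, junk characters stripped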