sparrow-parse 0.1.8-py3-none-any.whl → 0.1.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.1.8'
1
+ __version__ = '0.1.10'
sparrow_parse/data/invoice_1_table.txt ADDED
@@ -0,0 +1,9 @@
1
+ [
2
+ '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
3
+ Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
4
+ Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
5
+ Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
6
+ NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
7
+ '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
8
+ 19,28</td><td>$ 212,09</td></tr></table>'
9
+ ]
sparrow_parse/extractor/extractor_helper.py ADDED
@@ -0,0 +1,368 @@
1
+ from bs4 import BeautifulSoup
2
+ from sentence_transformers import SentenceTransformer, util
3
+ import pandas as pd
4
+ import re
5
+ from io import StringIO
6
+
7
+
8
+ def merge_html_table_headers(html_table, column_keywords, debug=False):
9
+ soup = BeautifulSoup(html_table, 'html.parser')
10
+
11
+ # Find all thead elements
12
+ theads = soup.find_all('thead')
13
+
14
+ if len(theads) > 1 and column_keywords is not None:
15
+ html_table = update_table_header_colspan(html_table)
16
+ html_table = merge_table_header_thead(html_table)
17
+ html_table = merge_colspan_columns(html_table)
18
+ html_table = normalize_html_table(html_table, debug)
19
+ html_table = fix_rowspan_elements(html_table)
20
+ html_table = merge_rows_with_rowspan(html_table)
21
+ html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
22
+ else:
23
+ # If there is only one thead or no column keywords are given, return the original table
24
+ return html_table
25
+
26
+ return html_table
27
+
28
+
29
+ def update_table_header_colspan(html_table):
30
+ soup = BeautifulSoup(html_table, 'html.parser')
31
+ theads = soup.find_all('thead')
32
+
33
+ for thead in theads:
34
+ for th in thead.find_all('th'):
35
+ colspan = th.get('colspan')
36
+ if colspan and int(colspan) > 1:
37
+ colspan_count = int(colspan)
38
+ th['colspan'] = 1
39
+ for _ in range(colspan_count - 1):
40
+ new_th = soup.new_tag('th')
41
+ th.insert_after(new_th)
42
+
43
+ return str(soup)
44
+
45
+
46
+ def merge_table_header_thead(html_table):
47
+ soup = BeautifulSoup(html_table, 'html.parser')
48
+ theads = soup.find_all('thead')
49
+
50
+ primary_thead = theads[0]
51
+ secondary_thead = theads[1]
52
+
53
+ primary_ths = primary_thead.find_all('th')
54
+ secondary_ths = secondary_thead.find_all('th')
55
+
56
+ for i, th in enumerate(primary_ths):
57
+ if i < len(secondary_ths):
58
+ primary_text = th.text.strip()
59
+ secondary_text = secondary_ths[i].text.strip()
60
+ if primary_text and secondary_text:
61
+ th.string = (primary_text + ' ' + secondary_text).strip()
62
+ elif not primary_text and secondary_text:
63
+ th.string = secondary_text
64
+ # Remove colspan and rowspan attributes
65
+ th.attrs.pop('colspan', None)
66
+ th.attrs.pop('rowspan', None)
67
+
68
+ secondary_thead.decompose()
69
+
70
+ return str(soup)
71
+
72
+
73
+ def merge_colspan_columns(html_table):
74
+ # Parse the HTML
75
+ soup = BeautifulSoup(html_table, 'html.parser')
76
+
77
+ # Process colspan attributes by adding empty <td> elements
78
+ for row in soup.find_all('tr'):
79
+ cols = []
80
+ for cell in row.find_all(['th', 'td']):
81
+ colspan = int(cell.get('colspan', 1))
82
+ # Add the cell and additional empty cells if colspan is greater than 1
83
+ cols.append(cell)
84
+ for _ in range(colspan - 1):
85
+ new_td = soup.new_tag('td')
86
+ cols.append(new_td)
87
+ # Remove the colspan attribute
88
+ if cell.has_attr('colspan'):
89
+ del cell['colspan']
90
+
91
+ # Replace the row's children with the updated cells
92
+ row.clear()
93
+ row.extend(cols)
94
+
95
+ return str(soup)
96
+
97
+
98
+ def normalize_html_table(html, debug=False):
99
+ soup = BeautifulSoup(html, 'html.parser')
100
+
101
+ # Find the header row and count the number of cells
102
+ header = soup.find('thead').find_all(['th', 'td'])
103
+ header_cell_count = len(header)
104
+
105
+ if debug:
106
+ # Print the number of header cells
107
+ print(f"Number of cells in header: {header_cell_count}")
108
+
109
+ # Find all rows in the table body
110
+ rows = soup.find_all('tr')
111
+
112
+ for row in rows:
113
+ cells = row.find_all(['td', 'th'])
114
+ if len(cells) > header_cell_count:
115
+ extra_cells = len(cells) - header_cell_count
116
+ for cell in cells:
117
+ if cell.text.strip() == '' and extra_cells > 0:
118
+ cell.decompose()
119
+ extra_cells -= 1
120
+ elif len(cells) < header_cell_count:
121
+ missing_cells = header_cell_count - len(cells)
122
+ for _ in range(missing_cells):
123
+ new_cell = soup.new_tag('td')
124
+ row.insert(0, new_cell)
125
+
126
+ return str(soup)
127
+
128
+
129
+ def fix_rowspan_elements(html_table):
130
+ # Parse the HTML table
131
+ soup = BeautifulSoup(html_table, 'html.parser')
132
+
133
+ # Find all table rows
134
+ rows = soup.find_all('tr')
135
+
136
+ # Dictionary to store rows with rowspan elements
137
+ rowspan_dict = {}
138
+
139
+ # Iterate over each row
140
+ for row_index, row in enumerate(rows):
141
+ # Find all cells in the row
142
+ cells = row.find_all(['td', 'th'])
143
+
144
+ # Iterate over each cell
145
+ for cell_index, cell in enumerate(cells):
146
+ # Check if the cell has a rowspan attribute
147
+ if cell.has_attr('rowspan'):
148
+ # Store the rowspan value and cell position
149
+ rowspan_value = int(cell['rowspan'])
150
+ if row_index not in rowspan_dict:
151
+ rowspan_dict[row_index] = []
152
+ rowspan_dict[row_index].append((cell_index, rowspan_value))
153
+
154
+ # List to store the number of rows until the next rowspan row
155
+ rows_below_until_next_rowspan = []
156
+
157
+ # Get the sorted row indices that have rowspan elements
158
+ sorted_row_indices = sorted(rowspan_dict.keys())
159
+
160
+ # Calculate rows below each rowspan row until the next rowspan row
161
+ for i in range(len(sorted_row_indices)):
162
+ current_row = sorted_row_indices[i]
163
+ if i < len(sorted_row_indices) - 1:
164
+ next_row = sorted_row_indices[i + 1]
165
+ rows_below = next_row - current_row - 1
166
+ else:
167
+ rows_below = len(rows) - current_row - 1
168
+ rows_below_until_next_rowspan.append((current_row, rows_below))
169
+
170
+ # Detect rows where rowspan value is incorrect
171
+ rows_with_bad_rowspan = []
172
+ for row_index, rows_below in rows_below_until_next_rowspan:
173
+ if row_index in rowspan_dict:
174
+ for cell_index, rowspan_value in rowspan_dict[row_index]:
175
+ if rowspan_value - 1 < rows_below:
176
+ print(f"Row {row_index} has a large rowspan value: {rowspan_value}")
177
+ rows_with_bad_rowspan.append(row_index)
178
+ break
179
+
180
+ # Modify the HTML table to adjust the rowspan attributes
181
+ for row_index in rows_with_bad_rowspan:
182
+ if row_index in rowspan_dict:
183
+ for cell_index, rowspan_value in rowspan_dict[row_index]:
184
+ # Find the cell with the rowspan attribute
185
+ cell = rows[row_index].find_all(['td', 'th'])[cell_index]
186
+ # Remove the rowspan attribute
187
+ del cell['rowspan']
188
+ # Find the next row and assign the rowspan value
189
+ next_row_index = row_index + 1
190
+ if next_row_index < len(rows):
191
+ next_row_cells = rows[next_row_index].find_all(['td', 'th'])
192
+ if len(next_row_cells) > cell_index:
193
+ next_row_cell = next_row_cells[cell_index]
194
+ next_row_cell['rowspan'] = rowspan_value
195
+ else:
196
+ # Create a new cell if it does not exist
197
+ new_cell = soup.new_tag(cell.name)
198
+ new_cell['rowspan'] = rowspan_value
199
+ new_cell.string = cell.string
200
+ rows[next_row_index].append(new_cell)
201
+
202
+ # Return the modified HTML table
203
+ return str(soup)
204
+
205
+
206
+ def merge_rows_with_rowspan(html):
207
+ # Parse the HTML table using BeautifulSoup
208
+ soup = BeautifulSoup(html, 'html.parser')
209
+
210
+ # Extract the header
211
+ thead = soup.find('thead')
212
+
213
+ # Find all rows
214
+ rows = soup.find_all('tr')
215
+
216
+ result = []
217
+ i = 0
218
+
219
+ while i < len(rows):
220
+ row = rows[i]
221
+ # Check if any td in the row has a rowspan attribute
222
+ for td in row.find_all('td'):
223
+ if td.has_attr('rowspan'):
224
+ rowspan_value = int(td['rowspan'])
225
+ result.append(row)
226
+
227
+ skip_concatenation = False
228
+ concatenation_pairs = []
229
+
230
+ # Add rows below the current row based on the rowspan number
231
+ for j in range(1, rowspan_value):
232
+ if i + j < len(rows):
233
+ below_row = rows[i + j]
234
+
235
+ # Compare cells
236
+ row_cells = row.find_all('td')
237
+ below_row_cells = below_row.find_all('td')
238
+ min_length = min(len(row_cells), len(below_row_cells))
239
+
240
+ for k in range(min_length):
241
+ if is_numeric(row_cells[k].get_text(strip=True)) and is_numeric(below_row_cells[k].get_text(strip=True)):
242
+ skip_concatenation = True
243
+ break
244
+ else:
245
+ concatenation_pairs.append((row_cells[k], below_row_cells[k]))
246
+
247
+ if skip_concatenation:
248
+ result.append(below_row)
249
+
250
+ if not skip_concatenation:
251
+ for row_cell, below_row_cell in concatenation_pairs:
252
+ concatenated_text = (row_cell.get_text(strip=True) + ' ' + below_row_cell.get_text(strip=True)).strip()
253
+ row_cell.string = concatenated_text
254
+
255
+ i += rowspan_value - 1 # Skip the rows that have been added
256
+ break
257
+ else:
258
+ result.append(row)
259
+ break
260
+ i += 1
261
+
262
+ # Convert result list of rows back to an HTML table string
263
+ new_table_soup = BeautifulSoup(f'<table>{str(thead)}</table>', 'html.parser')
264
+ tbody = new_table_soup.new_tag('tbody')
265
+ new_table_soup.table.append(tbody)
266
+ for row in result:
267
+ for td in row.find_all('td'):
268
+ if td.has_attr('rowspan'):
269
+ del td['rowspan']
270
+ tbody.append(row)
271
+
272
+ return str(new_table_soup.table)
273
+
274
+
275
+ def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
276
+ # Wrap the HTML string in a StringIO object
277
+ html_buffer = StringIO(html_table)
278
+
279
+ # Read the HTML table
280
+ df = pd.read_html(html_buffer)[0]
281
+
282
+ model = SentenceTransformer('all-mpnet-base-v2')
283
+
284
+ # Get the column names of the dataframe
285
+ column_names = df.columns.tolist()
286
+
287
+ # Calculate the similarity of each column name to the target column names
288
+ target_embeddings = model.encode(target_columns)
289
+ column_embeddings = model.encode(column_names)
290
+
291
+ # Initialize a dictionary to store the similarity scores
292
+ similarity_scores = {}
293
+
294
+ # Identify junk columns based on similarity threshold
295
+ junk_columns = []
296
+ similarity_threshold = 0.5 # Adjust this threshold as needed
297
+
298
+ for idx, col_embedding in enumerate(column_embeddings):
299
+ similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
300
+ max_similarity = max(similarities)
301
+ max_similarity_idx = similarities.argmax().item() # Get the index of the max similarity
302
+ similarity_scores[column_names[idx]] = (
303
+ max_similarity.item(), target_columns[max_similarity_idx]) # Store similarity score and target column name
304
+ if max_similarity < similarity_threshold:
305
+ junk_columns.append(column_names[idx])
306
+
307
+ if debug:
308
+ # Print the similarity scores for debugging purposes
309
+ for column, (score, target_col) in similarity_scores.items():
310
+ print(f"Column: {column}, Similarity: {score:.4f}, Target Column: {target_col}")
311
+
312
+ # Handle junk columns by concatenating their values to the nearest column on the left
313
+ for junk_col in junk_columns:
314
+ junk_col_index = column_names.index(junk_col)
315
+ if junk_col_index > 0:
316
+ nearest_col = column_names[junk_col_index - 1]
317
+ df[nearest_col] = df.apply(
318
+ lambda row: str(row[junk_col]) if pd.isna(row[nearest_col]) and pd.notna(row[junk_col])
319
+ else (str(row[nearest_col]) + ' ' + str(row[junk_col])) if pd.notna(row[junk_col])
320
+ else row[nearest_col],
321
+ axis=1
322
+ )
323
+ df.drop(columns=[junk_col], inplace=True)
324
+
325
+ # Replace any remaining NaN values with empty strings
326
+ df = df.fillna('')
327
+
328
+ if debug:
329
+ print(f"Junk columns: {junk_columns}")
330
+ print(df.to_string())
331
+
332
+ # Convert the result into an HTML table
333
+ html_table = df.to_html(index=False)
334
+
335
+ if debug:
336
+ print(html_table)
337
+
338
+ return html_table
339
+
340
+
341
+ def clean_html_table_header_names(html_table: str) -> str:
342
+ """
343
+ Cleans the headers of an HTML table by removing junk characters and returns the updated HTML as a string.
344
+
345
+ Parameters:
346
+ html_table (str): The HTML content containing the table.
347
+
348
+ Returns:
349
+ str: The updated HTML table with cleaned headers.
350
+ """
351
+ # Parse the HTML table
352
+ soup = BeautifulSoup(html_table, "html.parser")
353
+ table = soup.find("table")
354
+
355
+ # Extract the headers and clean them
356
+ headers = table.find_all("th")
357
+ for th in headers:
358
+ clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
359
+ th.string.replace_with(clean_header)
360
+
361
+ html_table = str(soup)
362
+
363
+ return html_table
364
+
365
+
366
+ def is_numeric(value):
367
+ # Check if the value is numeric
368
+ return bool(re.match(r'^\d+(?:,\d{3})*(?:\.\d+)?$', value))
sparrow_parse/extractor/html_extractor.py ADDED
@@ -0,0 +1,247 @@
1
+ from rich import print
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from bs4 import BeautifulSoup
4
+ import json
5
+ from rich.progress import Progress, SpinnerColumn, TextColumn
6
+ from extractor_helper import merge_html_table_headers
7
+ from extractor_helper import clean_html_table_header_names
8
+ import re
9
+
10
+
11
+ class HTMLExtractor(object):
12
+ def __init__(self):
13
+ pass
14
+
15
+ def read_data(self, target_columns, data, column_keywords=None, group_by_rows=True, update_targets=False,
16
+ local=True, debug=False):
17
+ answer = {}
18
+
19
+ json_result, targets_unprocessed = [], []
20
+
21
+ i = 0
22
+ for table in data:
23
+ if not target_columns:
24
+ break
25
+
26
+ i += 1
27
+ json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, column_keywords,
28
+ group_by_rows, local, debug)
29
+ answer = self.add_answer_section(answer, "items" + str(i), json_result)
30
+
31
+ if update_targets:
32
+ target_columns = targets_unprocessed
33
+
34
+ answer = self.format_json_output(answer)
35
+
36
+ return answer, targets_unprocessed
37
+
38
+ def read_data_from_table(self, target_columns, data, column_keywords=None, group_by_rows=True, local=True, debug=False):
39
+ data = self.invoke_pipeline_step(
40
+ lambda: merge_html_table_headers(data, column_keywords, debug),
41
+ "Merging HTML table headers...",
42
+ local
43
+ )
44
+
45
+ data = self.invoke_pipeline_step(
46
+ lambda: clean_html_table_header_names(data),
47
+ "Cleaning HTML table headers...",
48
+ local
49
+ )
50
+
51
+ columns = self.get_table_column_names(data)
52
+
53
+ if debug:
54
+ print("\n")
55
+ print(f"Columns: {columns}")
56
+ print(f"Target columns: {target_columns}")
57
+
58
+ indices, targets, targets_unprocessed = self.invoke_pipeline_step(
59
+ lambda: self.calculate_similarity(columns, target_columns, debug),
60
+ "Calculating cosine similarity between columns and target values...",
61
+ local
62
+ )
63
+
64
+ if debug:
65
+ print(f"Unprocessed targets: {targets_unprocessed}")
66
+
67
+ # Extracting data
68
+ extracted_data = self.invoke_pipeline_step(
69
+ lambda: self.extract_columns_from_table(data, indices, targets, group_by_rows),
70
+ "Extracting data from the table...",
71
+ local
72
+ )
73
+
74
+ json_result = self.convert_to_json(extracted_data)
75
+
76
+ return json_result, targets_unprocessed
77
+
78
+ def calculate_similarity(self, columns, target_columns, debug):
79
+ model = SentenceTransformer('all-mpnet-base-v2')
80
+
81
+ # Compute embeddings for columns and target values
82
+ column_embeddings = model.encode(columns)
83
+ target_embeddings = model.encode(target_columns)
84
+
85
+ # List to store indices of the most similar columns
86
+ most_similar_indices = {}
87
+ targets_unprocessed = []
88
+
89
+ # Calculate cosine similarity between each column and target value
90
+ similarity_scores = util.pytorch_cos_sim(column_embeddings, target_embeddings)
91
+
92
+ # Find the most similar column for each target value and provide the order ID
93
+ for idx, target in enumerate(target_columns):
94
+ similarities = similarity_scores[:, idx]
95
+ most_similar_idx = similarities.argmax().item()
96
+ most_similar_column = columns[most_similar_idx]
97
+ similarity_score = similarities[most_similar_idx].item()
98
+ if similarity_score > 0.3:
99
+ if most_similar_idx in most_similar_indices:
100
+ if similarity_score > most_similar_indices[most_similar_idx][1]:
101
+ targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
102
+ most_similar_indices[most_similar_idx] = (target, similarity_score)
103
+ else:
104
+ targets_unprocessed.append(target)
105
+ else:
106
+ most_similar_indices[most_similar_idx] = (target, similarity_score)
107
+ else:
108
+ targets_unprocessed.append(target)
109
+ if debug:
110
+ print(
111
+ f"The most similar column to '{target}' is '{most_similar_column}' with a similarity score of {similarity_score:.4f} and order ID {most_similar_idx}")
112
+
113
+ most_similar_indices = dict(sorted(most_similar_indices.items()))
114
+
115
+ indices = []
116
+ targets = []
117
+
118
+ for idx, (target, _) in most_similar_indices.items():
119
+ indices.append(idx)
120
+ targets.append(target)
121
+
122
+ if debug:
123
+ print()
124
+ for idx, (target, score) in most_similar_indices.items():
125
+ print(f"Target: '{target}', Column: '{columns[idx]}', Column ID: {idx}, Score: {score:.4f}")
126
+ print()
127
+
128
+ return indices, targets, targets_unprocessed
129
+
130
+ def extract_columns_from_table(self, html_table, column_ids, target_columns, group_by_rows=False):
131
+ soup = BeautifulSoup(html_table, 'html.parser')
132
+ table = soup.find('table')
133
+
134
+ if group_by_rows:
135
+ # Initialize a list to store each row's data as a dictionary
136
+ extracted_data = []
137
+ else:
138
+ # Initialize the extracted data with custom column names
139
+ extracted_data = {target_columns[i]: [] for i in range(len(column_ids))}
140
+
141
+ # Extract row information
142
+ rows = table.find_all('tr')
143
+
144
+ for row in rows:
145
+ # Skip the header row
146
+ if row.find_all('th'):
147
+ continue
148
+
149
+ cells = row.find_all('td')
150
+ if cells: # Ensure the row contains data cells
151
+ if group_by_rows:
152
+ row_data = {}
153
+ for idx, col_id in enumerate(column_ids):
154
+ value = cells[col_id].text.strip() if col_id < len(cells) else ''
155
+ value = value.replace('|', '').strip()
156
+ row_data[target_columns[idx]] = value
157
+ extracted_data.append(row_data)
158
+ else:
159
+ for idx, col_id in enumerate(column_ids):
160
+ value = cells[col_id].text.strip() if col_id < len(cells) else ''
161
+ value = value.replace('|', '').strip()
162
+ extracted_data[target_columns[idx]].append(value)
163
+
164
+ return extracted_data
165
+
166
+ def convert_to_json(self, extracted_data):
167
+ return json.dumps(extracted_data, indent=4)
168
+
169
+ def get_table_column_names(self, html_table):
170
+ """
171
+ Extract column names from an HTML table.
172
+
173
+ Args:
174
+ html_table (str): The HTML content of the table.
175
+
176
+ Returns:
177
+ list: A list of column names.
178
+ """
179
+ # Parse the HTML content using BeautifulSoup with html.parser
180
+ soup = BeautifulSoup(html_table, 'html.parser')
181
+
182
+ # Find the <thead> tag
183
+ thead = soup.find('thead')
184
+
185
+ # Extract column names into a list
186
+ column_names = [th.get_text() for th in thead.find_all('th')]
187
+
188
+ return column_names
189
+
190
+ def invoke_pipeline_step(self, task_call, task_description, local):
191
+ if local:
192
+ with Progress(
193
+ SpinnerColumn(),
194
+ TextColumn("[progress.description]{task.description}"),
195
+ transient=False,
196
+ ) as progress:
197
+ progress.add_task(description=task_description, total=None)
198
+ ret = task_call()
199
+ else:
200
+ print(task_description)
201
+ ret = task_call()
202
+
203
+ return ret
204
+
205
+ def add_answer_section(self, answer, section_name, answer_table):
206
+ if not isinstance(answer, dict):
207
+ raise ValueError("The answer should be a dictionary.")
208
+
209
+ # Parse answer_table if it is a JSON string
210
+ if isinstance(answer_table, str):
211
+ answer_table = json.loads(answer_table)
212
+
213
+ answer[section_name] = answer_table
214
+ return answer
215
+
216
+ def format_json_output(self, answer):
217
+ formatted_json = json.dumps(answer, indent=4)
218
+ formatted_json = formatted_json.replace('", "', '",\n"')
219
+ formatted_json = formatted_json.replace('}, {', '},\n{')
220
+ return formatted_json
221
+
222
+
223
+ if __name__ == "__main__":
224
+ # with open('../data/invoice_1_table.txt', 'r') as file:
225
+ # file_content = file.read()
226
+ #
227
+ # file_content = file_content.strip()[1:-1].strip()
228
+ # data_list = re.split(r"',\s*'", file_content)
229
+ # data_list = [item.strip(" '") for item in data_list]
230
+
231
+ extractor = HTMLExtractor()
232
+
233
+ # answer, targets_unprocessed = extractor.read_data(
234
+ # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
235
+ # # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
236
+ # # 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
237
+ # data_list,
238
+ # None,
239
+ # # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
240
+ # # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
241
+ # True,
242
+ # True,
243
+ # True,
244
+ # True)
245
+ #
246
+ # print(answer)
247
+ # print(targets_unprocessed)
sparrow_parse/extractor/markdown_processor.py ADDED
@@ -0,0 +1,145 @@
1
+ import pymupdf4llm
2
+ import pandas as pd
3
+ import re
4
+ from rich.progress import Progress, SpinnerColumn, TextColumn
5
+ from rich import print
6
+ from bs4 import BeautifulSoup
7
+
8
+
9
+ class MarkdownProcessor(object):
10
+ def __init__(self):
11
+ pass
12
+
13
+ def extract_data(self, file_path, options, local=True, debug=False):
14
+ markdown_text = self.invoke_pipeline_step(
15
+ lambda: pymupdf4llm.to_markdown(file_path),
16
+ "Extracting markdown text from the document...",
17
+ local
18
+ )
19
+
20
+ content, table_content = self.invoke_pipeline_step(
21
+ lambda: self.load_text_data(markdown_text, options),
22
+ "Loading text data...",
23
+ local
24
+ )
25
+
26
+ if debug:
27
+ print("Data extracted from the document:")
28
+ print(content)
29
+ print("\n")
30
+ print("Table content extracted from the document:")
31
+ if table_content:
32
+ print(len(table_content))
33
+ print(table_content)
34
+
35
+ return content, table_content
36
+
37
+ def load_text_data(self, markdown_text, options):
38
+ content, table_content = None, None
39
+
40
+ if options is None:
41
+ content = markdown_text
42
+
43
+ if options and "tables" in options and "markdown" in options:
44
+ content = self.extract_form_data(markdown_text)
45
+ table_content = self.extract_tables(markdown_text)
46
+
47
+ return content, table_content
48
+
49
+ def extract_form_data(self, markdown_text):
50
+ return markdown_text
51
+
52
+ def extract_tables(self, markdown_text):
53
+ # Regular expression to match markdown tables
54
+ table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)
55
+
56
+ # Find all tables in the markdown text
57
+ tables = table_pattern.findall(markdown_text)
58
+
59
+ html_tables = []
60
+ for table_text in tables:
61
+ # Split the table into lines
62
+ lines = table_text.strip().split('\n')
63
+
64
+ # Extract headers and rows
65
+ headers = [self.clean_column_name(header.strip()) for header in lines[0].split('|') if header]
66
+ rows = []
67
+ for line in lines[2:]: # Skip header and separator lines
68
+ row = [cell.strip() for cell in line.split('|') if cell]
69
+ rows.append(row)
70
+
71
+ # Convert to Pandas DataFrame
72
+ df = pd.DataFrame(rows, columns=headers)
73
+
74
+ # Convert DataFrame to HTML and append to the list
75
+ html_table = df.to_html(index=False)
76
+ if self.table_has_header(html_table):
77
+ html_tables.append(html_table)
78
+
79
+ return html_tables
80
+
81
+ def clean_column_name(self, name):
82
+ """
83
+ Cleans the column name by removing spaces if the name is a single word with spaces between letters.
84
+
85
+ Args:
86
+ name (str): The column name to clean.
87
+
88
+ Returns:
89
+ str: The cleaned column name.
90
+ """
91
+ # Check if the name contains only letters and spaces
92
+ if all(char.isalpha() or char.isspace() for char in name):
93
+ # Check if it is a single word with spaces between letters
94
+ parts = name.split()
95
+ if len(parts) > 1 and all(len(part) == 1 for part in parts):
96
+ return ''.join(parts)
97
+ return name
98
+
99
+ def invoke_pipeline_step(self, task_call, task_description, local):
100
+ if local:
101
+ with Progress(
102
+ SpinnerColumn(),
103
+ TextColumn("[progress.description]{task.description}"),
104
+ transient=False,
105
+ ) as progress:
106
+ progress.add_task(description=task_description, total=None)
107
+ ret = task_call()
108
+ else:
109
+ print(task_description)
110
+ ret = task_call()
111
+
112
+ return ret
113
+
114
+ def table_has_header(self, table_html):
115
+ soup = BeautifulSoup(table_html, 'html.parser')
116
+ table = soup.find('table')
117
+
118
+ # Check if the table contains a <thead> tag
119
+ if table.find('thead'):
120
+ return True
121
+
122
+ # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
123
+ if table.find_all('th'):
124
+ return True
125
+
126
+ return False
127
+
128
+
129
+ if __name__ == "__main__":
130
+ processor = MarkdownProcessor()
131
+ # content, table_content = processor.extract_data(
132
+ # '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
133
+ # ['tables', 'markdown'],
134
+ # True,
135
+ # True)
136
+ content, table_content = processor.extract_data(
137
+ '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
138
+ ['tables', 'markdown'],
139
+ True,
140
+ True)
141
+ # content, table_content = processor.extract_data(
142
+ # '/Users/andrejb/Documents/work/epik/bankstatement/POSB_2_1.pdf',
143
+ # ['tables', 'markdown'],
144
+ # True,
145
+ # True)
sparrow_parse/extractor/unstructured_processor.py ADDED
@@ -0,0 +1,179 @@
1
+ import tempfile
2
+ import os
3
+ from unstructured.partition.pdf import partition_pdf
4
+ from unstructured.partition.image import partition_image
5
+ import json
6
+ from unstructured.staging.base import elements_to_json
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn
8
+ from rich import print
9
+ from bs4 import BeautifulSoup
10
+
11
+
12
+ class UnstructuredProcessor(object):
13
+ def __init__(self):
14
+ pass
15
+
16
+ def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
17
+ # Extracts the elements from the PDF
18
+ elements = self.invoke_pipeline_step(
19
+ lambda: self.process_file(file_path, strategy, model_name),
20
+ "Extracting elements from the document...",
21
+ local
22
+ )
23
+
24
+ if debug:
25
+ new_extension = 'json' # You can change this to any extension you want
26
+ new_file_path = self.change_file_extension(file_path, new_extension)
27
+
28
+ content, table_content = self.invoke_pipeline_step(
29
+ lambda: self.load_text_data(elements, new_file_path, options),
30
+ "Loading text data...",
31
+ local
32
+ )
33
+ else:
34
+ with tempfile.TemporaryDirectory() as temp_dir:
35
+ temp_file_path = os.path.join(temp_dir, "file_data.json")
36
+
37
+ content, table_content = self.invoke_pipeline_step(
38
+ lambda: self.load_text_data(elements, temp_file_path, options),
39
+ "Loading text data...",
40
+ local
41
+ )
42
+
43
+ if debug:
44
+ print("Data extracted from the document:")
45
+ print(content)
46
+ print("\n")
47
+ print("Table content extracted from the document:")
48
+ if table_content:
49
+ print(len(table_content))
50
+ print(table_content)
51
+
52
+ return content, table_content
53
+
54
+ def process_file(self, file_path, strategy, model_name):
55
+ elements = None
56
+
57
+ if file_path.lower().endswith('.pdf'):
58
+ elements = partition_pdf(
59
+ filename=file_path,
60
+ strategy=strategy,
61
+ infer_table_structure=True,
62
+ hi_res_model_name=model_name,
63
+ languages=['en']
64
+ )
65
+ elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
66
+ elements = partition_image(
67
+ filename=file_path,
68
+ strategy=strategy,
69
+ infer_table_structure=True,
70
+ hi_res_model_name=model_name,
71
+ languages=['en']
72
+ )
73
+
74
+ return elements
75
+
76
+ def change_file_extension(self, file_path, new_extension, suffix=None):
77
+ # Check if the new extension starts with a dot and add one if not
78
+ if not new_extension.startswith('.'):
79
+ new_extension = '.' + new_extension
80
+
81
+ # Split the file path into two parts: the base (everything before the last dot) and the extension
82
+ # If there's no dot in the filename, it'll just return the original filename without an extension
83
+ base = file_path.rsplit('.', 1)[0]
84
+
85
+ # Concatenate the base with the new extension
86
+ if suffix is None:
87
+ new_file_path = base + new_extension
88
+ else:
89
+ new_file_path = base + "_" + suffix + new_extension
90
+
91
+ return new_file_path
92
+
93
+ def load_text_data(self, elements, file_path, options):
94
+ elements_to_json(elements, filename=file_path)
95
+
96
+ content, table_content = None, None
97
+
98
+ if options is None:
99
+ content = self.process_json_file(file_path)
100
+
101
+ if options and "tables" in options and "html" in options:
102
+ content = self.process_json_file(file_path, "form")
103
+
104
+ table_content = self.process_json_file(file_path, "table")
105
+
106
+ return content, table_content
107
+
108
+ def process_json_file(self, file_path, option=None):
109
+ # Read the JSON file
110
+ with open(file_path, 'r') as file:
111
+ data = json.load(file)
112
+
113
+ # Iterate over the JSON data and extract required elements
114
+ extracted_elements = []
115
+ for entry in data:
116
+ if entry["type"] == "Table" and (option is None or option == "table" or option == "form"):
117
+ table_data = entry["metadata"]["text_as_html"]
118
+ if option == "table" and self.table_has_header(table_data):
119
+ extracted_elements.append(table_data)
120
+ if option is None or option == "form":
121
+ extracted_elements.append(table_data)
122
+ elif entry["type"] == "Title" and (option is None or option == "form"):
123
+ extracted_elements.append(entry["text"])
124
+ elif entry["type"] == "NarrativeText" and (option is None or option == "form"):
125
+ extracted_elements.append(entry["text"])
126
+ elif entry["type"] == "UncategorizedText" and (option is None or option == "form"):
127
+ extracted_elements.append(entry["text"])
128
+ elif entry["type"] == "ListItem" and (option is None or option == "form"):
129
+ extracted_elements.append(entry["text"])
130
+ elif entry["type"] == "Image" and (option is None or option == "form"):
131
+ extracted_elements.append(entry["text"])
132
+
133
+ if option is None or option == "form":
134
+ # Convert list to single string with two new lines between each element
135
+ extracted_data = "\n\n".join(extracted_elements)
136
+ return extracted_data
137
+
138
+ return extracted_elements
139
+
140
+ def invoke_pipeline_step(self, task_call, task_description, local):
141
+ if local:
142
+ with Progress(
143
+ SpinnerColumn(),
144
+ TextColumn("[progress.description]{task.description}"),
145
+ transient=False,
146
+ ) as progress:
147
+ progress.add_task(description=task_description, total=None)
148
+ ret = task_call()
149
+ else:
150
+ print(task_description)
151
+ ret = task_call()
152
+
153
+ return ret
154
+
155
+ def table_has_header(self, table_html):
156
+ soup = BeautifulSoup(table_html, 'html.parser')
157
+ table = soup.find('table')
158
+
159
+ # Check if the table contains a <thead> tag
160
+ if table.find('thead'):
161
+ return True
162
+
163
+ # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
164
+ if table.find_all('th'):
165
+ return True
166
+
167
+ return False
168
+
169
+
170
+ if __name__ == "__main__":
171
+ processor = UnstructuredProcessor()
172
+ # content, table_content = processor.extract_data(
173
+ # '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
174
+ # 'hi_res',
175
+ # 'yolox',
176
+ # ['tables', 'html'],
177
+ # True,
178
+ # True)
179
+
sparrow_parse/temp.py ADDED
@@ -0,0 +1,16 @@
1
+ # content, table_content = processor.extract_data(
2
+ # '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
3
+ # 'hi_res',
4
+ # 'yolox',
5
+ # # 'detectron2_onnx',
6
+ # ['tables', 'html'],
7
+ # True,
8
+ # True)
9
+
10
+ # content, table_content = processor.extract_data(
11
+ # '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
12
+ # 'hi_res',
13
+ # 'yolox',
14
+ # ['tables', 'html'],
15
+ # True,
16
+ # True)
sparrow_parse-0.1.8.dist-info/METADATA → sparrow_parse-0.1.10.dist-info/METADATA RENAMED
@@ -1,21 +1,27 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.1.8
3
+ Version: 0.1.10
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
7
7
  Keywords: llm,rag,vision
8
8
  Author: Andrej Baranovskij
9
9
  Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.10,<4.0
10
+ Requires-Python: >=3.9,<3.12
11
11
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
12
12
  Classifier: Operating System :: OS Independent
13
13
  Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
14
15
  Classifier: Programming Language :: Python :: 3.10
15
16
  Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Topic :: Software Development
18
- Requires-Dist: requests (>=2.31.0,<3.0.0)
18
+ Requires-Dist: pymupdf4llm (==0.0.5)
19
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
20
+ Requires-Dist: sentence-transformers (==3.0.1)
21
+ Requires-Dist: torch (==2.2.2)
22
+ Requires-Dist: transformers (==4.41.2)
23
+ Requires-Dist: unstructured-inference (==0.7.33)
24
+ Requires-Dist: unstructured[all-docs] (==0.14.5)
19
25
  Project-URL: Repository, https://github.com/katanaml/sparrow
20
26
  Description-Content-Type: text/markdown
21
27
 
@@ -36,20 +42,26 @@ pip install sparrow-parse
36
42
  Import
37
43
 
38
44
  ```
39
- from sparrow_parse.pdf.pdf_processor import PDFProcessor
45
+ from sparrow_parse.extractor.file_processor import FileProcessor
40
46
  ```
41
47
 
42
48
  Usage
43
49
 
44
50
  ```
45
- processor = PDFProcessor()
46
- result = processor.process_file(file_path, strategy, model_name)
51
+ processor = FileProcessor()
52
+ content = processor.extract_data(file_path, strategy, model_name, options, local, debug)
47
53
  ```
48
54
 
49
- Build for development
55
+ ## Library build
50
56
 
51
57
  ```
58
+ poetry build
59
+ ```
60
+
61
+ Publish to PyPI
52
62
 
63
+ ```
64
+ poetry publish
53
65
  ```
54
66
 
55
67
  ## Commercial usage
sparrow_parse-0.1.10.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
1
+ sparrow_parse/__init__.py,sha256=GG5_e12GvnOxWm5sUjputYu0F3PdW-4ThVZhf-KKQHY,22
2
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
+ sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
4
+ sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ sparrow_parse/extractor/extractor_helper.py,sha256=fvA7iuGdpW_WFc4jkALBzQbACqlE5x_K9ScW-E6RCoY,13357
6
+ sparrow_parse/extractor/html_extractor.py,sha256=XGmaceTuny185iGiLVikEZ8ksLex6mRvQhw-McB84CU,9381
7
+ sparrow_parse/extractor/markdown_processor.py,sha256=YnrDYvmBygF-TC3_BlzEMWI6w_PWxvQWEw2zG37AjGw,4894
8
+ sparrow_parse/extractor/unstructured_processor.py,sha256=7eFIZ6VeEkIslcJyibyix1qFPHGtIwnvNxdRItGz7V8,6755
9
+ sparrow_parse/temp.py,sha256=zY77otqPZYvHqoyo12jxKIzr2eTMWI1h1wZnv4N0r3s,497
10
+ sparrow_parse-0.1.10.dist-info/METADATA,sha256=xqQeYGc6OKhBuFG8IgiV9k704ZePAw4tUhuRaifZTNc,3580
11
+ sparrow_parse-0.1.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
12
+ sparrow_parse-0.1.10.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
13
+ sparrow_parse-0.1.10.dist-info/RECORD,,
sparrow_parse/pdf/pdf_processor.py DELETED
@@ -1,7 +0,0 @@
1
- class PDFProcessor(object):
2
- def __init__(self):
3
- pass
4
-
5
- def process_file(self, content):
6
- print("Processing file...")
7
- return "OK"
sparrow_parse-0.1.8.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
1
- sparrow_parse/__init__.py,sha256=zemvJ5zjFE6SQT2xmkxc-ZYwNkUTCEX7mz3Epb2qztE,21
2
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/pdf/pdf_processor.py,sha256=hyvOQX_IydRA3z7gQs_g-Ut1hvVHRRxj1_2i-G09-ow,159
5
- sparrow_parse-0.1.8.dist-info/METADATA,sha256=QTAeFIi-KwyBvSbBrB8wS5WCld3gQ3XfAll4wS4x7Yc,3250
6
- sparrow_parse-0.1.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
- sparrow_parse-0.1.8.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
8
- sparrow_parse-0.1.8.dist-info/RECORD,,
File without changes