sparrow-parse 0.1.9__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/data/invoice_1_table.txt +9 -0
- sparrow_parse/extractor/extractor_helper.py +368 -0
- sparrow_parse/extractor/html_extractor.py +247 -0
- sparrow_parse/extractor/markdown_processor.py +137 -0
- sparrow_parse/extractor/unstructured_processor.py +179 -0
- sparrow_parse/temp.py +28 -0
- {sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/METADATA +89 -11
- sparrow_parse-0.2.0.dist-info/RECORD +13 -0
- sparrow_parse/extractor/file_processor.py +0 -143
- sparrow_parse-0.1.9.dist-info/RECORD +0 -8
- {sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/entry_points.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.1.9'
+__version__ = '0.2.0'
sparrow_parse/data/invoice_1_table.txt
ADDED
@@ -0,0 +1,9 @@
+[
+'<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
+Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
+Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
+Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
+NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
+'<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
+19,28</td><td>$ 212,09</td></tr></table>'
+]
sparrow_parse/extractor/extractor_helper.py
ADDED
@@ -0,0 +1,368 @@
+from bs4 import BeautifulSoup
+from sentence_transformers import SentenceTransformer, util
+import pandas as pd
+import re
+from io import StringIO
+
+
+def merge_html_table_headers(html_table, column_keywords, debug=False):
+    soup = BeautifulSoup(html_table, 'html.parser')
+
+    # Find all thead elements
+    theads = soup.find_all('thead')
+
+    if len(theads) > 1 and column_keywords is not None:
+        html_table = update_table_header_colspan(html_table)
+        html_table = merge_table_header_thead(html_table)
+        html_table = merge_colspan_columns(html_table)
+        html_table = normalize_html_table(html_table, debug)
+        html_table = fix_rowspan_elements(html_table)
+        html_table = merge_rows_with_rowspan(html_table)
+        html_table = detect_and_remove_junk_columns(html_table, column_keywords, debug)
+    else:
+        # If there is only one thead, return the original table
+        return html_table
+
+    return html_table
+
+
+def update_table_header_colspan(html_table):
+    soup = BeautifulSoup(html_table, 'html.parser')
+    theads = soup.find_all('thead')
+
+    for thead in theads:
+        for th in thead.find_all('th'):
+            colspan = th.get('colspan')
+            if colspan and int(colspan) > 1:
+                colspan_count = int(colspan)
+                th['colspan'] = 1
+                for _ in range(colspan_count - 1):
+                    new_th = soup.new_tag('th')
+                    th.insert_after(new_th)
+
+    return str(soup)
+
+
+def merge_table_header_thead(html_table):
+    soup = BeautifulSoup(html_table, 'html.parser')
+    theads = soup.find_all('thead')
+
+    primary_thead = theads[0]
+    secondary_thead = theads[1]
+
+    primary_ths = primary_thead.find_all('th')
+    secondary_ths = secondary_thead.find_all('th')
+
+    for i, th in enumerate(primary_ths):
+        if i < len(secondary_ths):
+            primary_text = th.text.strip()
+            secondary_text = secondary_ths[i].text.strip()
+            if primary_text and secondary_text:
+                th.string = (primary_text + ' ' + secondary_text).strip()
+            elif not primary_text and secondary_text:
+                th.string = secondary_text
+            # Remove colspan and rowspan attributes
+            th.attrs.pop('colspan', None)
+            th.attrs.pop('rowspan', None)
+
+    secondary_thead.decompose()
+
+    return str(soup)
+
+
+def merge_colspan_columns(html_table):
+    # Parse the HTML
+    soup = BeautifulSoup(html_table, 'html.parser')
+
+    # Process colspan attributes by adding empty <td> elements
+    for row in soup.find_all('tr'):
+        cols = []
+        for cell in row.find_all(['th', 'td']):
+            colspan = int(cell.get('colspan', 1))
+            # Add the cell and additional empty cells if colspan is greater than 1
+            cols.append(cell)
+            for _ in range(colspan - 1):
+                new_td = soup.new_tag('td')
+                cols.append(new_td)
+            # Remove the colspan attribute
+            if cell.has_attr('colspan'):
+                del cell['colspan']
+
+        # Replace the row's children with the updated cells
+        row.clear()
+        row.extend(cols)
+
+    return str(soup)
+
+
+def normalize_html_table(html, debug=False):
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Find the header row and count the number of cells
+    header = soup.find('thead').find_all(['th', 'td'])
+    header_cell_count = len(header)
+
+    if debug:
+        # Print the number of header cells
+        print(f"Number of cells in header: {header_cell_count}")
+
+    # Find all rows in the table body
+    rows = soup.find_all('tr')
+
+    for row in rows:
+        cells = row.find_all(['td', 'th'])
+        if len(cells) > header_cell_count:
+            extra_cells = len(cells) - header_cell_count
+            for cell in cells:
+                if cell.text.strip() == '' and extra_cells > 0:
+                    cell.decompose()
+                    extra_cells -= 1
+        elif len(cells) < header_cell_count:
+            missing_cells = header_cell_count - len(cells)
+            for _ in range(missing_cells):
+                new_cell = soup.new_tag('td')
+                row.insert(0, new_cell)
+
+    return str(soup)
+
+
+def fix_rowspan_elements(html_table):
+    # Parse the HTML table
+    soup = BeautifulSoup(html_table, 'html.parser')
+
+    # Find all table rows
+    rows = soup.find_all('tr')
+
+    # Dictionary to store rows with rowspan elements
+    rowspan_dict = {}
+
+    # Iterate over each row
+    for row_index, row in enumerate(rows):
+        # Find all cells in the row
+        cells = row.find_all(['td', 'th'])
+
+        # Iterate over each cell
+        for cell_index, cell in enumerate(cells):
+            # Check if the cell has a rowspan attribute
+            if cell.has_attr('rowspan'):
+                # Store the rowspan value and cell position
+                rowspan_value = int(cell['rowspan'])
+                if row_index not in rowspan_dict:
+                    rowspan_dict[row_index] = []
+                rowspan_dict[row_index].append((cell_index, rowspan_value))
+
+    # List to store the number of rows until the next rowspan row
+    rows_below_until_next_rowspan = []
+
+    # Get the sorted row indices that have rowspan elements
+    sorted_row_indices = sorted(rowspan_dict.keys())
+
+    # Calculate rows below each rowspan row until the next rowspan row
+    for i in range(len(sorted_row_indices)):
+        current_row = sorted_row_indices[i]
+        if i < len(sorted_row_indices) - 1:
+            next_row = sorted_row_indices[i + 1]
+            rows_below = next_row - current_row - 1
+        else:
+            rows_below = len(rows) - current_row - 1
+        rows_below_until_next_rowspan.append((current_row, rows_below))
+
+    # Detect rows where rowspan value is incorrect
+    rows_with_bad_rowspan = []
+    for row_index, rows_below in rows_below_until_next_rowspan:
+        if row_index in rowspan_dict:
+            for cell_index, rowspan_value in rowspan_dict[row_index]:
+                if rowspan_value - 1 < rows_below:
+                    print(f"Row {row_index} has a large rowspan value: {rowspan_value}")
+                    rows_with_bad_rowspan.append(row_index)
+                    break
+
+    # Modify the HTML table to adjust the rowspan attributes
+    for row_index in rows_with_bad_rowspan:
+        if row_index in rowspan_dict:
+            for cell_index, rowspan_value in rowspan_dict[row_index]:
+                # Find the cell with the rowspan attribute
+                cell = rows[row_index].find_all(['td', 'th'])[cell_index]
+                # Remove the rowspan attribute
+                del cell['rowspan']
+                # Find the next row and assign the rowspan value
+                next_row_index = row_index + 1
+                if next_row_index < len(rows):
+                    next_row_cells = rows[next_row_index].find_all(['td', 'th'])
+                    if len(next_row_cells) > cell_index:
+                        next_row_cell = next_row_cells[cell_index]
+                        next_row_cell['rowspan'] = rowspan_value
+                    else:
+                        # Create a new cell if it does not exist
+                        new_cell = soup.new_tag(cell.name)
+                        new_cell['rowspan'] = rowspan_value
+                        new_cell.string = cell.string
+                        rows[next_row_index].append(new_cell)
+
+    # Return the modified HTML table
+    return str(soup)
+
+
+def merge_rows_with_rowspan(html):
+    # Parse the HTML table using BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Extract the header
+    thead = soup.find('thead')
+
+    # Find all rows
+    rows = soup.find_all('tr')
+
+    result = []
+    i = 0
+
+    while i < len(rows):
+        row = rows[i]
+        # Check if any td in the row has a rowspan attribute
+        for td in row.find_all('td'):
+            if td.has_attr('rowspan'):
+                rowspan_value = int(td['rowspan'])
+                result.append(row)
+
+                skip_concatenation = False
+                concatenation_pairs = []
+
+                # Add rows below the current row based on the rowspan number
+                for j in range(1, rowspan_value):
+                    if i + j < len(rows):
+                        below_row = rows[i + j]
+
+                        # Compare cells
+                        row_cells = row.find_all('td')
+                        below_row_cells = below_row.find_all('td')
+                        min_length = min(len(row_cells), len(below_row_cells))
+
+                        for k in range(min_length):
+                            if is_numeric(row_cells[k].get_text(strip=True)) and is_numeric(below_row_cells[k].get_text(strip=True)):
+                                skip_concatenation = True
+                                break
+                            else:
+                                concatenation_pairs.append((row_cells[k], below_row_cells[k]))
+
+                        if skip_concatenation:
+                            result.append(below_row)
+
+                if not skip_concatenation:
+                    for row_cell, below_row_cell in concatenation_pairs:
+                        concatenated_text = (row_cell.get_text(strip=True) + ' ' + below_row_cell.get_text(strip=True)).strip()
+                        row_cell.string = concatenated_text
+
+                i += rowspan_value - 1  # Skip the rows that have been added
+                break
+            else:
+                result.append(row)
+                break
+        i += 1
+
+    # Convert result list of rows back to an HTML table string
+    new_table_soup = BeautifulSoup(f'<table>{str(thead)}</table>', 'html.parser')
+    tbody = new_table_soup.new_tag('tbody')
+    new_table_soup.table.append(tbody)
+    for row in result:
+        for td in row.find_all('td'):
+            if td.has_attr('rowspan'):
+                del td['rowspan']
+        tbody.append(row)
+
+    return str(new_table_soup.table)
+
+
+def detect_and_remove_junk_columns(html_table, target_columns, debug=False):
+    # Wrap the HTML string in a StringIO object
+    html_buffer = StringIO(html_table)
+
+    # Read the HTML table
+    df = pd.read_html(html_buffer)[0]
+
+    model = SentenceTransformer('all-mpnet-base-v2')
+
+    # Get the column names of the dataframe
+    column_names = df.columns.tolist()
+
+    # Calculate the similarity of each column name to the target column names
+    target_embeddings = model.encode(target_columns)
+    column_embeddings = model.encode(column_names)
+
+    # Initialize a dictionary to store the similarity scores
+    similarity_scores = {}
+
+    # Identify junk columns based on similarity threshold
+    junk_columns = []
+    similarity_threshold = 0.5  # Adjust this threshold as needed
+
+    for idx, col_embedding in enumerate(column_embeddings):
+        similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
+        max_similarity = max(similarities)
+        max_similarity_idx = similarities.argmax().item()  # Get the index of the max similarity
+        similarity_scores[column_names[idx]] = (
+            max_similarity.item(), target_columns[max_similarity_idx])  # Store similarity score and target column name
+        if max_similarity < similarity_threshold:
+            junk_columns.append(column_names[idx])
+
+    if debug:
+        # Print the similarity scores for debugging purposes
+        for column, (score, target_col) in similarity_scores.items():
+            print(f"Column: {column}, Similarity: {score:.4f}, Target Column: {target_col}")
+
+    # Handle junk columns by concatenating their values to the nearest column on the left
+    for junk_col in junk_columns:
+        junk_col_index = column_names.index(junk_col)
+        if junk_col_index > 0:
+            nearest_col = column_names[junk_col_index - 1]
+            df[nearest_col] = df.apply(
+                lambda row: str(row[junk_col]) if pd.isna(row[nearest_col]) and pd.notna(row[junk_col])
+                else (str(row[nearest_col]) + ' ' + str(row[junk_col])) if pd.notna(row[junk_col])
+                else row[nearest_col],
+                axis=1
+            )
+            df.drop(columns=[junk_col], inplace=True)
+
+    # Replace any remaining NaN values with empty strings
+    df = df.fillna('')
+
+    if debug:
+        print(f"Junk columns: {junk_columns}")
+        print(df.to_string())
+
+    # Convert the result into an HTML table
+    html_table = df.to_html(index=False)
+
+    if debug:
+        print(html_table)
+
+    return html_table
+
+
+def clean_html_table_header_names(html_table: str) -> str:
+    """
+    Cleans the headers of an HTML table by removing junk characters and returns the updated HTML as a string.
+
+    Parameters:
+        html_table (str): The HTML content containing the table.
+
+    Returns:
+        str: The updated HTML table with cleaned headers.
+    """
+    # Parse the HTML table
+    soup = BeautifulSoup(html_table, "html.parser")
+    table = soup.find("table")
+
+    # Extract the headers and clean them
+    headers = table.find_all("th")
+    for th in headers:
+        clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
+        th.string.replace_with(clean_header)
+
+    html_table = str(soup)
+
+    return html_table
+
+
+def is_numeric(value):
+    # Check if the value is numeric
+    return bool(re.match(r'^\d+(?:,\d{3})*(?:\.\d+)?$', value))
sparrow_parse/extractor/html_extractor.py
ADDED
@@ -0,0 +1,247 @@
+from rich import print
+from sentence_transformers import SentenceTransformer, util
+from bs4 import BeautifulSoup
+import json
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from extractor_helper import merge_html_table_headers
+from extractor_helper import clean_html_table_header_names
+import re
+
+
+class HTMLExtractor(object):
+    def __init__(self):
+        pass
+
+    def read_data(self, target_columns, data, column_keywords=None, group_by_rows=True, update_targets=False,
+                  local=True, debug=False):
+        answer = {}
+
+        json_result, targets_unprocessed = [], []
+
+        i = 0
+        for table in data:
+            if not target_columns:
+                break
+
+            i += 1
+            json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, column_keywords,
+                                                                          group_by_rows, local, debug)
+            answer = self.add_answer_section(answer, "items" + str(i), json_result)
+
+            if update_targets:
+                target_columns = targets_unprocessed
+
+        answer = self.format_json_output(answer)
+
+        return answer, targets_unprocessed
+
+    def read_data_from_table(self, target_columns, data, column_keywords=None, group_by_rows=True, local=True, debug=False):
+        data = self.invoke_pipeline_step(
+            lambda: merge_html_table_headers(data, column_keywords, debug),
+            "Merging HTML table headers...",
+            local
+        )
+
+        data = self.invoke_pipeline_step(
+            lambda: clean_html_table_header_names(data),
+            "Cleaning HTML table headers...",
+            local
+        )
+
+        columns = self.get_table_column_names(data)
+
+        if debug:
+            print("\n")
+            print(f"Columns: {columns}")
+            print(f"Target columns: {target_columns}")
+
+        indices, targets, targets_unprocessed = self.invoke_pipeline_step(
+            lambda: self.calculate_similarity(columns, target_columns, debug),
+            "Calculating cosine similarity between columns and target values...",
+            local
+        )
+
+        if debug:
+            print(f"Unprocessed targets: {targets_unprocessed}")
+
+        # Extracting data
+        extracted_data = self.invoke_pipeline_step(
+            lambda: self.extract_columns_from_table(data, indices, targets, group_by_rows),
+            "Extracting data from the table...",
+            local
+        )
+
+        json_result = self.convert_to_json(extracted_data)
+
+        return json_result, targets_unprocessed
+
+    def calculate_similarity(self, columns, target_columns, debug):
+        model = SentenceTransformer('all-mpnet-base-v2')
+
+        # Compute embeddings for columns and target values
+        column_embeddings = model.encode(columns)
+        target_embeddings = model.encode(target_columns)
+
+        # List to store indices of the most similar columns
+        most_similar_indices = {}
+        targets_unprocessed = []
+
+        # Calculate cosine similarity between each column and target value
+        similarity_scores = util.pytorch_cos_sim(column_embeddings, target_embeddings)
+
+        # Find the most similar column for each target value and provide the order ID
+        for idx, target in enumerate(target_columns):
+            similarities = similarity_scores[:, idx]
+            most_similar_idx = similarities.argmax().item()
+            most_similar_column = columns[most_similar_idx]
+            similarity_score = similarities[most_similar_idx].item()
+            if similarity_score > 0.3:
+                if most_similar_idx in most_similar_indices:
+                    if similarity_score > most_similar_indices[most_similar_idx][1]:
+                        targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
+                        most_similar_indices[most_similar_idx] = (target, similarity_score)
+                    else:
+                        targets_unprocessed.append(target)
+                else:
+                    most_similar_indices[most_similar_idx] = (target, similarity_score)
+            else:
+                targets_unprocessed.append(target)
+            if debug:
+                print(
+                    f"The most similar column to '{target}' is '{most_similar_column}' with a similarity score of {similarity_score:.4f} and order ID {most_similar_idx}")
+
+        most_similar_indices = dict(sorted(most_similar_indices.items()))
+
+        indices = []
+        targets = []
+
+        for idx, (target, _) in most_similar_indices.items():
+            indices.append(idx)
+            targets.append(target)
+
+        if debug:
+            print()
+            for idx, (target, score) in most_similar_indices.items():
+                print(f"Target: '{target}', Column: '{columns[idx]}', Column ID: {idx}, Score: {score:.4f}")
+            print()
+
+        return indices, targets, targets_unprocessed
+
+    def extract_columns_from_table(self, html_table, column_ids, target_columns, group_by_rows=False):
+        soup = BeautifulSoup(html_table, 'html.parser')
+        table = soup.find('table')
+
+        if group_by_rows:
+            # Initialize a list to store each row's data as a dictionary
+            extracted_data = []
+        else:
+            # Initialize the extracted data with custom column names
+            extracted_data = {target_columns[i]: [] for i in range(len(column_ids))}
+
+        # Extract row information
+        rows = table.find_all('tr')
+
+        for row in rows:
+            # Skip the header row
+            if row.find_all('th'):
+                continue
+
+            cells = row.find_all('td')
+            if cells:  # Ensure the row contains data cells
+                if group_by_rows:
+                    row_data = {}
+                    for idx, col_id in enumerate(column_ids):
+                        value = cells[col_id].text.strip() if col_id < len(cells) else ''
+                        value = value.replace('|', '').strip()
+                        row_data[target_columns[idx]] = value
+                    extracted_data.append(row_data)
+                else:
+                    for idx, col_id in enumerate(column_ids):
+                        value = cells[col_id].text.strip() if col_id < len(cells) else ''
+                        value = value.replace('|', '').strip()
+                        extracted_data[target_columns[idx]].append(value)
+
+        return extracted_data
+
+    def convert_to_json(self, extracted_data):
+        return json.dumps(extracted_data, indent=4)
+
+    def get_table_column_names(self, html_table):
+        """
+        Extract column names from an HTML table.
+
+        Args:
+            html_table (str): The HTML content of the table.
+
+        Returns:
+            list: A list of column names.
+        """
+        # Parse the HTML content using BeautifulSoup with html.parser
+        soup = BeautifulSoup(html_table, 'html.parser')
+
+        # Find the <thead> tag
+        thead = soup.find('thead')
+
+        # Extract column names into a list
+        column_names = [th.get_text() for th in thead.find_all('th')]
+
+        return column_names
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+    def add_answer_section(self, answer, section_name, answer_table):
+        if not isinstance(answer, dict):
+            raise ValueError("The answer should be a dictionary.")
+
+        # Parse answer_table if it is a JSON string
+        if isinstance(answer_table, str):
+            answer_table = json.loads(answer_table)
+
+        answer[section_name] = answer_table
+        return answer
+
+    def format_json_output(self, answer):
+        formatted_json = json.dumps(answer, indent=4)
+        formatted_json = formatted_json.replace('", "', '",\n"')
+        formatted_json = formatted_json.replace('}, {', '},\n{')
+        return formatted_json
+
+
+if __name__ == "__main__":
+    # with open('../data/invoice_1_table.txt', 'r') as file:
+    #     file_content = file.read()
+    #
+    # file_content = file_content.strip()[1:-1].strip()
+    # data_list = re.split(r"',\s*'", file_content)
+    # data_list = [item.strip(" '") for item in data_list]
+
+    extractor = HTMLExtractor()
+
+    # answer, targets_unprocessed = extractor.read_data(
+    #     ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
+    #     # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
+    #     #  'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
+    #     data_list,
+    #     None,
+    #     # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
+    #     #  'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
+    #     True,
+    #     True,
+    #     True,
+    #     True)
+    #
+    # print(answer)
+    # print(targets_unprocessed)
sparrow_parse/extractor/markdown_processor.py
ADDED
@@ -0,0 +1,137 @@
+import pymupdf4llm
+import pandas as pd
+import re
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich import print
+from bs4 import BeautifulSoup
+
+
+class MarkdownProcessor(object):
+    def __init__(self):
+        pass
+
+    def extract_data(self, file_path, options, local=True, debug=False):
+        markdown_text = self.invoke_pipeline_step(
+            lambda: pymupdf4llm.to_markdown(file_path),
+            "Extracting markdown text from the document...",
+            local
+        )
+
+        content, table_content = self.invoke_pipeline_step(
+            lambda: self.load_text_data(markdown_text, options),
+            "Loading text data...",
+            local
+        )
+
+        if debug:
+            print("Data extracted from the document:")
+            print(content)
+            print("\n")
+            print("Table content extracted from the document:")
+            if table_content:
+                print(len(table_content))
+                print(table_content)
+
+        return content, table_content
+
+    def load_text_data(self, markdown_text, options):
+        content, table_content = None, None
+
+        if options is None:
+            content = markdown_text
+
+        if options and "tables" in options and "markdown" in options:
+            content = self.extract_form_data(markdown_text)
+            table_content = self.extract_tables(markdown_text)
+
+        return content, table_content
+
+    def extract_form_data(self, markdown_text):
+        return markdown_text
+
+    def extract_tables(self, markdown_text):
+        # Regular expression to match markdown tables
+        table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)
+
+        # Find all tables in the markdown text
+        tables = table_pattern.findall(markdown_text)
+
+        html_tables = []
+        for table_text in tables:
+            # Split the table into lines
+            lines = table_text.strip().split('\n')
+
+            # Extract headers and rows
+            headers = [self.clean_column_name(header.strip()) for header in lines[0].split('|') if header]
+            rows = []
+            for line in lines[2:]:  # Skip header and separator lines
+                row = [cell.strip() for cell in line.split('|') if cell]
+                rows.append(row)
+
+            # Convert to Pandas DataFrame
+            df = pd.DataFrame(rows, columns=headers)
+
+            # Convert DataFrame to HTML and append to the list
+            html_table = df.to_html(index=False)
+            if self.table_has_header(html_table):
+                html_tables.append(html_table)
+
+        return html_tables
+
+    def clean_column_name(self, name):
+        """
+        Cleans the column name by removing spaces if the name is a single word with spaces between letters.
+
+        Args:
+            name (str): The column name to clean.
+
+        Returns:
+            str: The cleaned column name.
+        """
+        # Check if the name contains only letters and spaces
+        if all(char.isalpha() or char.isspace() for char in name):
+            # Check if it is a single word with spaces between letters
+            parts = name.split()
+            if len(parts) > 1 and all(len(part) == 1 for part in parts):
+                return ''.join(parts)
+        return name
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+    def table_has_header(self, table_html):
+        soup = BeautifulSoup(table_html, 'html.parser')
+        table = soup.find('table')
+
+        # Check if the table contains a <thead> tag
+        if table.find('thead'):
+            return True
+
+        # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
+        if table.find_all('th'):
+            return True
+
+        return False
+
+
+if __name__ == "__main__":
+    processor = MarkdownProcessor()
+
+    # content, table_content = processor.extract_data(
+    #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
+    #     ['tables', 'markdown'],
+    #     True,
+    #     True)
+
sparrow_parse/extractor/unstructured_processor.py
ADDED
@@ -0,0 +1,179 @@
+import tempfile
+import os
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.image import partition_image
+import json
+from unstructured.staging.base import elements_to_json
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich import print
+from bs4 import BeautifulSoup
+
+
+class UnstructuredProcessor(object):
+    def __init__(self):
+        pass
+
+    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
+        # Extracts the elements from the PDF
+        elements = self.invoke_pipeline_step(
+            lambda: self.process_file(file_path, strategy, model_name),
+            "Extracting elements from the document...",
+            local
+        )
+
+        if debug:
+            new_extension = 'json'  # You can change this to any extension you want
+            new_file_path = self.change_file_extension(file_path, new_extension)
+
+            content, table_content = self.invoke_pipeline_step(
+                lambda: self.load_text_data(elements, new_file_path, options),
+                "Loading text data...",
+                local
+            )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = os.path.join(temp_dir, "file_data.json")
+
+                content, table_content = self.invoke_pipeline_step(
+                    lambda: self.load_text_data(elements, temp_file_path, options),
+                    "Loading text data...",
+                    local
+                )
+
+        if debug:
+            print("Data extracted from the document:")
+            print(content)
+            print("\n")
+            print("Table content extracted from the document:")
+            if table_content:
+                print(len(table_content))
+                print(table_content)
+
+        return content, table_content
+
+    def process_file(self, file_path, strategy, model_name):
+        elements = None
+
+        if file_path.lower().endswith('.pdf'):
+            elements = partition_pdf(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                hi_res_model_name=model_name,
+                languages=['en']
+            )
+        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
+            elements = partition_image(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                hi_res_model_name=model_name,
+                languages=['en']
+            )
+
+        return elements
+
+    def change_file_extension(self, file_path, new_extension, suffix=None):
+        # Check if the new extension starts with a dot and add one if not
+        if not new_extension.startswith('.'):
+            new_extension = '.' + new_extension
+
+        # Split the file path into two parts: the base (everything before the last dot) and the extension
+        # If there's no dot in the filename, it'll just return the original filename without an extension
+        base = file_path.rsplit('.', 1)[0]
+
+        # Concatenate the base with the new extension
+        if suffix is None:
+            new_file_path = base + new_extension
+        else:
+            new_file_path = base + "_" + suffix + new_extension
+
+        return new_file_path
+
+    def load_text_data(self, elements, file_path, options):
+        elements_to_json(elements, filename=file_path)
+
+        content, table_content = None, None
+
+        if options is None:
+            content = self.process_json_file(file_path)
+
+        if options and "tables" in options and "html" in options:
+            content = self.process_json_file(file_path, "form")
+
+            table_content = self.process_json_file(file_path, "table")
+
+        return content, table_content
+
+    def process_json_file(self, file_path, option=None):
+        # Read the JSON file
+        with open(file_path, 'r') as file:
+            data = json.load(file)
+
+        # Iterate over the JSON data and extract required elements
+        extracted_elements = []
+        for entry in data:
+            if entry["type"] == "Table" and (option is None or option == "table" or option == "form"):
+                table_data = entry["metadata"]["text_as_html"]
+                if option == "table" and self.table_has_header(table_data):
+                    extracted_elements.append(table_data)
+                if option is None or option == "form":
+                    extracted_elements.append(table_data)
+            elif entry["type"] == "Title" and (option is None or option == "form"):
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "NarrativeText" and (option is None or option == "form"):
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "UncategorizedText" and (option is None or option == "form"):
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "ListItem" and (option is None or option == "form"):
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "Image" and (option is None or option == "form"):
+                extracted_elements.append(entry["text"])
+
+        if option is None or option == "form":
+            # Convert list to single string with two new lines between each element
+            extracted_data = "\n\n".join(extracted_elements)
+            return extracted_data
+
+        return extracted_elements
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+    def table_has_header(self, table_html):
+        soup = BeautifulSoup(table_html, 'html.parser')
+        table = soup.find('table')
+
+        # Check if the table contains a <thead> tag
+        if table.find('thead'):
+            return True
+
+        # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
+        if table.find_all('th'):
+            return True
+
+        return False
+
+
+if __name__ == "__main__":
+    processor = UnstructuredProcessor()
+    # content, table_content = processor.extract_data(
+    #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
+    #     'hi_res',
+    #     'yolox',
+    #     ['tables', 'html'],
+    #     True,
+    #     True)
+
sparrow_parse/temp.py
ADDED
@@ -0,0 +1,28 @@
+# content, table_content = processor.extract_data(
+#     '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
+#     'hi_res',
+#     'yolox',
+#     # 'detectron2_onnx',
+#     ['tables', 'html'],
+#     True,
+#     True)
+
+# content, table_content = processor.extract_data(
+#     '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
+#     'hi_res',
+#     'yolox',
+#     ['tables', 'html'],
+#     True,
+#     True)
+
+
+# content, table_content = processor.extract_data(
+#     '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
+#     ['tables', 'markdown'],
+#     True,
+#     True)
+# content, table_content = processor.extract_data(
+#     '/Users/andrejb/Documents/work/epik/bankstatement/POSB_2_1.pdf',
+#     ['tables', 'markdown'],
+#     True,
+#     True)
{sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.1.9
+Version: 0.2.0
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 License: GPL-3.0
@@ -15,10 +15,13 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Software Development
+Requires-Dist: pymupdf4llm (==0.0.5)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: sentence-transformers (==3.0.1)
 Requires-Dist: torch (==2.2.2)
-Requires-Dist:
-Requires-Dist: unstructured
+Requires-Dist: transformers (==4.41.2)
+Requires-Dist: unstructured-inference (==0.7.33)
+Requires-Dist: unstructured[all-docs] (==0.14.5)
 Project-URL: Repository, https://github.com/katanaml/sparrow
 Description-Content-Type: text/markdown
 
@@ -26,7 +29,7 @@ Description-Content-Type: text/markdown
 
 ## Description
 
-This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing.
+This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
 
 ## Install
 
@@ -34,22 +37,97 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
 pip install sparrow-parse
 ```
 
-##
+## Pre-processing
 
-
+### Unstructured
 
 ```
-from sparrow_parse.
+from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
+
+processor = UnstructuredProcessor()
+
+content, table_content = processor.extract_data(
+    file_path,  # file to process
+    strategy,  # data processing strategy supported by unstructured
+    model_name,  # model supported by unstructured
+    options,  # table extraction into HTML format
+    local,  # True if running from CLI, or False if running from FastAPI
+    debug)  # Debug
 ```
 
-
+Example:
+
+*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+*strategy* - `hi_res`
+
+*model_name* - `yolox`
+
+*options* - `['tables', 'html']`
+
+*local* - `True`
+
+*debug* - `True`
+
+### Markdown
 
 ```
-
-
+from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
+
+processor = MarkdownProcessor()
+
+content, table_content = processor.extract_data(
+    file_path,  # file to process
+    options,  # table extraction into HTML format
+    local,  # True if running from CLI, or False if running from FastAPI
+    debug)  # Debug
 ```
 
-
+Example:
+
+*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+
+*options* - `['tables', 'markdown']`
+
+*local* - `True`
+
+*debug* - `True`
+
+## Parsing and extraction
+
+```
+from sparrow_parse.extractor.html_extractor import HTMLExtractor
+
+extractor = HTMLExtractor()
+
+answer, targets_unprocessed = extractor.read_data(
+    target_columns,  # list of table columns data to fetch
+    data,  # list of HTML tables
+    column_keywords,  # list of valid column names, can be empty. Useful to filter junk content
+    group_by_rows,  # JSON result grouping
+    update_targets,  # Set to true, if page contains multiple tables with the same columns
+    local,  # True if running from CLI, or False if running from FastAPI
+    debug)  # Debug
+
+```
+
+Example:
+
+*target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
+
+*data* - `list of HTML tables`
+
+*column_keywords* - `None`
+
+*group_by_rows* - `True`
+
+*update_targets* - `True`
+
+*local* - `True`
+
+*debug* - `True`
+
+## Library build
 
 ```
 poetry build
sparrow_parse-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+sparrow_parse/__init__.py,sha256=clN3TnyYyt5T_wUJLVBmutS2kYLLBET4JFB2QXnRm2Q,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
+sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractor/extractor_helper.py,sha256=fvA7iuGdpW_WFc4jkALBzQbACqlE5x_K9ScW-E6RCoY,13357
+sparrow_parse/extractor/html_extractor.py,sha256=XGmaceTuny185iGiLVikEZ8ksLex6mRvQhw-McB84CU,9381
+sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
+sparrow_parse/extractor/unstructured_processor.py,sha256=7eFIZ6VeEkIslcJyibyix1qFPHGtIwnvNxdRItGz7V8,6755
+sparrow_parse/temp.py,sha256=xiRDPkv_fsM9xCcW29TU0LushgYjMaN9_Cwur6RvY1A,859
+sparrow_parse-0.2.0.dist-info/METADATA,sha256=jH_xt6142QixaARXk5EMp01133YRg8tPvKzAfsjQVGE,5582
+sparrow_parse-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sparrow_parse-0.2.0.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
+sparrow_parse-0.2.0.dist-info/RECORD,,
sparrow_parse/extractor/file_processor.py
REMOVED
@@ -1,143 +0,0 @@
-import tempfile
-import os
-from unstructured.partition.pdf import partition_pdf
-from unstructured.partition.image import partition_image
-import json
-from unstructured.staging.base import elements_to_json
-from rich.progress import Progress, SpinnerColumn, TextColumn
-
-
-class FileProcessor(object):
-    def __init__(self):
-        pass
-
-    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
-        # check if string options contains word table
-        extract_tables = False
-        if options is not None and "tables" in options:
-            extract_tables = True
-
-        # Extracts the elements from the PDF
-        elements = self.invoke_pipeline_step(
-            lambda: self.process_file(file_path, strategy, model_name),
-            "Extracting elements from the document...",
-            local
-        )
-
-        if debug:
-            new_extension = 'json'  # You can change this to any extension you want
-            new_file_path = self.change_file_extension(file_path, new_extension)
-
-            content = self.invoke_pipeline_step(
-                lambda: self.load_text_data(elements, new_file_path, extract_tables),
-                "Loading text data...",
-                local
-            )
-        else:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                temp_file_path = os.path.join(temp_dir, "file_data.json")
-
-                content = self.invoke_pipeline_step(
-                    lambda: self.load_text_data(elements, temp_file_path, extract_tables),
-                    "Loading text data...",
-                    local
-                )
-
-        return content
-
-    def process_file(self, file_path, strategy, model_name):
-        elements = None
-
-        if file_path.lower().endswith('.pdf'):
-            elements = partition_pdf(
-                filename=file_path,
-                strategy=strategy,
-                infer_table_structure=True,
-                model_name=model_name
-            )
-        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
-            elements = partition_image(
-                filename=file_path,
-                strategy=strategy,
-                infer_table_structure=True,
-                model_name=model_name
-            )
-
-        return elements
-
-    def change_file_extension(self, file_path, new_extension):
-        # Check if the new extension starts with a dot and add one if not
-        if not new_extension.startswith('.'):
-            new_extension = '.' + new_extension
-
-        # Split the file path into two parts: the base (everything before the last dot) and the extension
-        # If there's no dot in the filename, it'll just return the original filename without an extension
-        base = file_path.rsplit('.', 1)[0]
-
-        # Concatenate the base with the new extension
-        new_file_path = base + new_extension
-
-        return new_file_path
-
-    def load_text_data(self, elements, file_path, extract_tables):
-        elements_to_json(elements, filename=file_path)
-        text_file = self.process_json_file(file_path, extract_tables)
-
-        with open(text_file, 'r') as file:
-            content = file.read()
-
-        return content
-
-    def process_json_file(self, input_data, extract_tables):
-        # Read the JSON file
-        with open(input_data, 'r') as file:
-            data = json.load(file)
-
-        # Iterate over the JSON data and extract required table elements
-        extracted_elements = []
-        for entry in data:
-            if entry["type"] == "Table":
-                extracted_elements.append(entry["metadata"]["text_as_html"])
-            elif entry["type"] == "Title" and extract_tables is False:
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "NarrativeText" and extract_tables is False:
-                extracted_elements.append(entry["text"])
-            elif entry["type"] == "UncategorizedText" and extract_tables is False:
-                extracted_elements.append(entry["text"])
-
-        # Write the extracted elements to the output file
-        new_extension = 'txt'  # You can change this to any extension you want
-        new_file_path = self.change_file_extension(input_data, new_extension)
-        with open(new_file_path, 'w') as output_file:
-            for element in extracted_elements:
-                output_file.write(element + "\n\n")  # Adding two newlines for separation
-
-        return new_file_path
-
-    def invoke_pipeline_step(self, task_call, task_description, local):
-        if local:
-            with Progress(
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                transient=False,
-            ) as progress:
-                progress.add_task(description=task_description, total=None)
-                ret = task_call()
-        else:
-            print(task_description)
-            ret = task_call()
-
-        return ret
-
-
-# if __name__ == "__main__":
-#     processor = FileProcessor()
-#     content = processor.extract_data('/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
-#                                      'hi_res',
-#                                      'yolox',
-#                                      'tables',
-#                                      False,
-#                                      True)
-#     processor.extract_data("/Users/andrejb/Documents/work/lifung/lemming_test/C16E150001_SUPINV.pdf")
-#     processor.extract_data("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_single.pdf")
-#     print(content)
sparrow_parse-0.1.9.dist-info/RECORD
REMOVED
@@ -1,8 +0,0 @@
-sparrow_parse/__init__.py,sha256=m1D6fscvvsMhq5HVNKw7kP5M8AqEzQm1ekrn_nLQF1M,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractor/file_processor.py,sha256=OrDxFJVEhy_4pCBxknehAM7fxgSlWgUJ0jeTEegHRxo,5621
-sparrow_parse-0.1.9.dist-info/METADATA,sha256=wK7uOpPqsC1iwZs_d5Hl1KV4DIbw8NnfPU6MBTyF_kA,3428
-sparrow_parse-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sparrow_parse-0.1.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
-sparrow_parse-0.1.9.dist-info/RECORD,,
{sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/WHEEL
File without changes

{sparrow_parse-0.1.9.dist-info → sparrow_parse-0.2.0.dist-info}/entry_points.txt
File without changes