sparrow-parse 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
- __version__ = '0.3.3'
+ __version__ = '0.3.5'
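The only change in `sparrow_parse/__init__.py` is the version bump, so the installed release can be confirmed by reading this attribute. A minimal sketch, assuming the package is importable in the current environment:

```
# Confirm which sparrow-parse release is installed by reading the bumped attribute.
import sparrow_parse

print(sparrow_parse.__version__)  # expected: '0.3.5' after upgrading
```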
@@ -1,14 +1,14 @@
  Metadata-Version: 2.1
  Name: sparrow-parse
- Version: 0.3.3
- Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
+ Version: 0.3.5
+ Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Author: Andrej Baranovskij
  Author-email: andrejus.baranovskis@gmail.com
  License: UNKNOWN
  Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
  Project-URL: Repository, https://github.com/katanaml/sparrow
- Keywords: llm,rag,vision
+ Keywords: llm,vllm,ocr,vision
  Platform: UNKNOWN
  Classifier: Operating System :: OS Independent
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -16,23 +16,20 @@ Classifier: Topic :: Software Development
  Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
- Requires-Dist: torch ==2.2.2
- Requires-Dist: unstructured[all-docs] ==0.14.5
- Requires-Dist: unstructured-inference ==0.7.33
  Requires-Dist: rich
- Requires-Dist: pymupdf4llm ==0.0.9
- Requires-Dist: transformers ==4.41.2
- Requires-Dist: sentence-transformers ==3.0.1
- Requires-Dist: numpy ==1.26.4
- Requires-Dist: pypdf ==4.3.0
- Requires-Dist: easyocr ==1.7.1
+ Requires-Dist: transformers==4.41.2
+ Requires-Dist: sentence-transformers==3.0.1
+ Requires-Dist: numpy==1.26.4
+ Requires-Dist: pypdf==4.3.0
+ Requires-Dist: easyocr==1.7.1
  Requires-Dist: gradio-client
+ Requires-Dist: pdf2image

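The dependency set shrinks noticeably in 0.3.5: torch, unstructured, unstructured-inference and pymupdf4llm are dropped, and pdf2image is added. A hedged sketch, using only the standard library, for listing the declared requirements of whichever version is installed:

```
# Print the version and Requires-Dist entries of the installed sparrow-parse distribution.
from importlib.metadata import requires, version

print(version("sparrow-parse"))
for req in requires("sparrow-parse") or []:
    print(req)
```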
  # Sparrow Parse

  ## Description

- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information. The library relies on Visual LLM functionality and Table Transformers, and is part of Sparrow. Check the main [README](https://github.com/katanaml/sparrow).

  ## Install

@@ -40,97 +37,42 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
  pip install sparrow-parse
  ```

- ## Pre-processing
-
- ### Unstructured
-
- ```
- from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
-
- processor = UnstructuredProcessor()
-
- content, table_content = processor.extract_data(
- file_path, # file to process
- strategy, # data processing strategy supported by unstructured
- model_name, # model supported by unstructured
- options, # table extraction into HTML format
- local, # True if running from CLI, or False if running from FastAPI
- debug) # Debug
- ```
-
- Example:
-
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
-
- *strategy* - `hi_res`
-
- *model_name* - `yolox`
-
- *options* - `['tables', 'unstructured']`
-
- *local* - `True`
-
- *debug* - `True`
-
- ### Markdown
-
- ```
- from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
-
- processor = MarkdownProcessor()
-
- content, table_content = processor.extract_data(
- file_path, # file to process
- options, # table extraction into HTML format
- local, # True if running from CLI, or False if running from FastAPI
- debug) # Debug
- ```
-
- Example:
-
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
-
- *options* - `['tables', 'markdown']`
-
- *local* - `True`
-
- *debug* - `True`
-
  ## Parsing and extraction

- ```
- from sparrow_parse.extractor.html_extractor import HTMLExtractor
-
- extractor = HTMLExtractor()
-
- answer, targets_unprocessed = extractor.read_data(
- target_columns, # list of table columns data to fetch
- data, # list of HTML tables
- column_keywords, # list of valid column names, can be empty. Useful to filter junk content
- group_by_rows, # JSON result grouping
- update_targets, # Set to true, if page contains multiple tables with the same columns
- local, # True if running from CLI, or False if running from FastAPI
- debug) # Debug
+ ### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra

  ```
+ import os
+ from sparrow_parse.vllm.inference_factory import InferenceFactory
+ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

- Example:
-
- *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
-
- *data* - `list of HTML tables`
+ extractor = VLLMExtractor()

- *column_keywords* - `None`
+ # export HF_TOKEN="hf_"
+ config = {
+     "method": "huggingface",  # Could be 'huggingface' or 'local_gpu'
+     "hf_space": "katanaml/sparrow-qwen2-vl-7b",
+     "hf_token": os.getenv('HF_TOKEN'),
+     # Additional fields for local GPU inference
+     # "device": "cuda", "model_path": "model.pth"
+ }

- *group_by_rows* - `True`
+ # Use the factory to get the correct instance
+ factory = InferenceFactory(config)
+ model_inference_instance = factory.get_inference_instance()

- *update_targets* - `True`
+ input_data = [
+     {
+         "image": "/data/bonds_table.png",
+         "text_input": "retrieve all data. return response in JSON format"
+     }
+ ]

- *local* - `True`
-
- *debug* - `True`
+ # Now you can run inference without knowing which implementation is used
+ result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
+ print("Inference Result:", result)
+ ```

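The config comments in the new README also mention a `local_gpu` method with `device` and `model_path` fields. A minimal sketch of that variant, assuming the same factory and extractor API shown above; the model path is a placeholder taken from the README comment, not a file shipped with the package:

```
from sparrow_parse.vllm.inference_factory import InferenceFactory
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor

# 'local_gpu' bypasses the Hugging Face Space and runs inference on a local CUDA device.
config = {
    "method": "local_gpu",
    "device": "cuda",           # field named in the README comment
    "model_path": "model.pth",  # placeholder path from the README comment
}

factory = InferenceFactory(config)
model_inference_instance = factory.get_inference_instance()

extractor = VLLMExtractor()
input_data = [
    {
        "image": "/data/bonds_table.png",
        "text_input": "retrieve all data. return response in JSON format"
    }
]
result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
print(result)
```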
- ## PDF optimization
+ ## PDF pre-processing

  ```
  from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
@@ -145,7 +87,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,

  Example:

- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+ *file_path* - `/data/invoice_1.pdf`

  *output_directory* - set to a non-`None` value for debugging purposes only

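Putting this section together, a hedged usage sketch. The import path follows the RECORD below (`sparrow_parse/helpers/pdf_optimizer.py`) rather than the `extractor` path shown in the README excerpt, and the boolean third argument mirrors the call in the deleted `temp.py` further down; both are assumptions, not confirmed API:

```
import shutil
from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer  # assumption: module path per RECORD

pdf_optimizer = PDFOptimizer()

# Split a multi-page PDF into per-page files; returns the page count, the output file paths
# and the temporary directory holding them (per the hunk header above).
num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(
    "/data/invoice_1.pdf",  # file_path
    None,                   # output_directory - set only for debugging
    False,                  # assumption: boolean flag as in the removed temp.py example
)

print(num_pages, output_files)
shutil.rmtree(temp_dir, ignore_errors=True)  # clean up, as in the removed temp.py example
```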
@@ -1,23 +1,18 @@
- sparrow_parse/__init__.py,sha256=JDRpXqOC0txw4_CqkfpSl89CczeXGgyjX4XSSLChyQg,21
+ sparrow_parse/__init__.py,sha256=e9arv8KorBrIZFQXAlN4DOQTh91btae1iR36M_3Wafk,21
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
- sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
  sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/extractors/html_extractor.py,sha256=qe9Oz7J-GiIE8G1kIDMOeh96xe6P59Gyh5SjgV3v2c8,9977
  sparrow_parse/extractors/vllm_extractor.py,sha256=Qwmf-SW4z_UstiiynX5TkyovlkokVhLuzcbUVZ16TXM,1540
  sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/helpers/html_extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
  sparrow_parse/helpers/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
  sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/processors/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
  sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
- sparrow_parse/processors/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
  sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sparrow_parse/vllm/huggingface_inference.py,sha256=Q2Ju65LDzbO-8RWW7cXzrR-pbZ1zKuPVODlKOTWKg_E,1114
  sparrow_parse/vllm/inference_base.py,sha256=W0N2khehGdF1XHzZACG3I1UZaydHMk6BZgWNvaJD4Ck,197
  sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
  sparrow_parse/vllm/local_gpu_inference.py,sha256=I_uWYiFAQhRrykOKbVz69NzftDxuemDKtAye4kWhtnU,617
- sparrow_parse-0.3.3.dist-info/METADATA,sha256=qFl4MsoV6lF_OqgtcfBqDRpTHX8MUJh0jeGgNr77o8w,6482
- sparrow_parse-0.3.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- sparrow_parse-0.3.3.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
- sparrow_parse-0.3.3.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
- sparrow_parse-0.3.3.dist-info/RECORD,,
+ sparrow_parse-0.3.5.dist-info/METADATA,sha256=4i_-BJalUQFFUZoo919pfr51ZqvU1Jfq-mEFkHf0gWU,5342
+ sparrow_parse-0.3.5.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+ sparrow_parse-0.3.5.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+ sparrow_parse-0.3.5.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+ sparrow_parse-0.3.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: bdist_wheel (0.44.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

sparrow_parse/extractors/html_extractor.py DELETED
@@ -1,251 +0,0 @@
1
- from rich import print
2
- from sentence_transformers import SentenceTransformer, util
3
- from bs4 import BeautifulSoup
4
- import json
5
- from rich.progress import Progress, SpinnerColumn, TextColumn
6
- from sparrow_parse.helpers.html_extractor_helper import merge_html_table_headers
7
- from sparrow_parse.helpers.html_extractor_helper import clean_html_table_header_names
8
-
9
-
10
- class HTMLExtractor(object):
11
- def __init__(self):
12
- pass
13
-
14
- def read_data(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
15
- column_keywords=None, group_by_rows=True, update_targets=False, local=True, debug=False):
16
- answer = {}
17
-
18
- json_result, targets_unprocessed = [], []
19
-
20
- for i, table in enumerate(data):
21
- if not target_columns:
22
- break
23
-
24
- json_result, targets_unprocessed = self.read_data_from_table(target_columns, table, similarity_threshold_junk,
25
- similarity_threshold_column_id, column_keywords,
26
- group_by_rows, local, debug)
27
- answer = self.add_answer_section(answer, "items" + str(i + 1), json_result)
28
-
29
- if update_targets:
30
- target_columns = targets_unprocessed
31
-
32
- answer = self.format_json_output(answer)
33
-
34
- return answer, targets_unprocessed
35
-
36
- def read_data_from_table(self, target_columns, data, similarity_threshold_junk, similarity_threshold_column_id,
37
- column_keywords=None, group_by_rows=True, local=True, debug=False):
38
- data = self.invoke_pipeline_step(
39
- lambda: merge_html_table_headers(data, column_keywords, similarity_threshold_junk, debug),
40
- "Merging HTML table headers...",
41
- local
42
- )
43
-
44
- data = self.invoke_pipeline_step(
45
- lambda: clean_html_table_header_names(data),
46
- "Cleaning HTML table headers...",
47
- local
48
- )
49
-
50
- columns = self.get_table_column_names(data)
51
-
52
- if debug:
53
- print("\n")
54
- print(f"Columns: {columns}")
55
- print(f"Target columns: {target_columns}")
56
-
57
- indices, targets, targets_unprocessed = self.invoke_pipeline_step(
58
- lambda: self.calculate_similarity(columns, target_columns, similarity_threshold_column_id, debug),
59
- "Calculating cosine similarity between columns and target values...",
60
- local
61
- )
62
-
63
- if debug:
64
- print(f"Unprocessed targets: {targets_unprocessed}")
65
-
66
- # Extracting data
67
- extracted_data = self.invoke_pipeline_step(
68
- lambda: self.extract_columns_from_table(data, indices, targets, group_by_rows),
69
- "Extracting data from the table...",
70
- local
71
- )
72
-
73
- json_result = self.convert_to_json(extracted_data)
74
-
75
- return json_result, targets_unprocessed
76
-
77
- def calculate_similarity(self, columns, target_columns, similarity_threshold_column_id, debug):
78
- model = SentenceTransformer('all-mpnet-base-v2')
79
-
80
- # Compute embeddings for columns and target values
81
- column_embeddings = model.encode(columns)
82
- target_embeddings = model.encode(target_columns)
83
-
84
- # List to store indices of the most similar columns
85
- most_similar_indices = {}
86
- targets_unprocessed = []
87
-
88
- # Calculate cosine similarity between each column and target value
89
- similarity_scores = util.pytorch_cos_sim(column_embeddings, target_embeddings)
90
-
91
- # Find the most similar column for each target value and provide the order ID
92
- for idx, target in enumerate(target_columns):
93
- similarities = similarity_scores[:, idx]
94
- most_similar_idx = similarities.argmax().item()
95
- most_similar_column = columns[most_similar_idx]
96
- similarity_score = similarities[most_similar_idx].item()
97
- if similarity_score > similarity_threshold_column_id:
98
- if most_similar_idx in most_similar_indices:
99
- if similarity_score > most_similar_indices[most_similar_idx][1]:
100
- targets_unprocessed.append(most_similar_indices[most_similar_idx][0])
101
- most_similar_indices[most_similar_idx] = (target, similarity_score)
102
- else:
103
- targets_unprocessed.append(target)
104
- else:
105
- most_similar_indices[most_similar_idx] = (target, similarity_score)
106
- else:
107
- targets_unprocessed.append(target)
108
- if debug:
109
- print(
110
- f"The most similar column to '{target}' is '{most_similar_column}' with a similarity score of {similarity_score:.4f} and order ID {most_similar_idx}")
111
-
112
- most_similar_indices = dict(sorted(most_similar_indices.items()))
113
-
114
- indices = []
115
- targets = []
116
-
117
- for idx, (target, _) in most_similar_indices.items():
118
- indices.append(idx)
119
- targets.append(target)
120
-
121
- if debug:
122
- print()
123
- for idx, (target, score) in most_similar_indices.items():
124
- print(f"Target: '{target}', Column: '{columns[idx]}', Column ID: {idx}, Score: {score:.4f}")
125
- print()
126
-
127
- return indices, targets, targets_unprocessed
128
-
129
- def extract_columns_from_table(self, html_table, column_ids, target_columns, group_by_rows=False):
130
- soup = BeautifulSoup(html_table, 'html.parser')
131
- table = soup.find('table')
132
-
133
- if group_by_rows:
134
- # Initialize a list to store each row's data as a dictionary
135
- extracted_data = []
136
- else:
137
- # Initialize the extracted data with custom column names
138
- extracted_data = {target_columns[i]: [] for i in range(len(column_ids))}
139
-
140
- # Extract row information
141
- rows = table.find_all('tr')
142
-
143
- for row in rows:
144
- # Skip the header row
145
- if row.find_all('th'):
146
- continue
147
-
148
- cells = row.find_all('td')
149
- if cells: # Ensure the row contains data cells
150
- if group_by_rows:
151
- row_data = {}
152
- for idx, col_id in enumerate(column_ids):
153
- value = cells[col_id].text.strip() if col_id < len(cells) else ''
154
- value = value.replace('|', '').strip()
155
- row_data[target_columns[idx]] = value
156
- extracted_data.append(row_data)
157
- else:
158
- for idx, col_id in enumerate(column_ids):
159
- value = cells[col_id].text.strip() if col_id < len(cells) else ''
160
- value = value.replace('|', '').strip()
161
- extracted_data[target_columns[idx]].append(value)
162
-
163
- return extracted_data
164
-
165
- def convert_to_json(self, extracted_data):
166
- return json.dumps(extracted_data, indent=4)
167
-
168
- def get_table_column_names(self, html_table):
169
- """
170
- Extract column names from an HTML table.
171
-
172
- Args:
173
- html_table (str): The HTML content of the table.
174
-
175
- Returns:
176
- list: A list of column names.
177
- """
178
- # Parse the HTML content using BeautifulSoup with html.parser
179
- soup = BeautifulSoup(html_table, 'html.parser')
180
-
181
- # Find the <thead> tag
182
- thead = soup.find('thead')
183
-
184
- # Extract column names into a list
185
- column_names = [th.get_text() for th in thead.find_all('th')]
186
-
187
- return column_names
188
-
189
- def invoke_pipeline_step(self, task_call, task_description, local):
190
- if local:
191
- with Progress(
192
- SpinnerColumn(),
193
- TextColumn("[progress.description]{task.description}"),
194
- transient=False,
195
- ) as progress:
196
- progress.add_task(description=task_description, total=None)
197
- ret = task_call()
198
- else:
199
- print(task_description)
200
- ret = task_call()
201
-
202
- return ret
203
-
204
- def add_answer_section(self, answer, section_name, answer_table):
205
- if not isinstance(answer, dict):
206
- raise ValueError("The answer should be a dictionary.")
207
-
208
- # Parse answer_table if it is a JSON string
209
- if isinstance(answer_table, str):
210
- answer_table = json.loads(answer_table)
211
-
212
- answer[section_name] = answer_table
213
- return answer
214
-
215
- def format_json_output(self, answer):
216
- formatted_json = json.dumps(answer, indent=4)
217
- formatted_json = formatted_json.replace('", "', '",\n"')
218
- formatted_json = formatted_json.replace('}, {', '},\n{')
219
- return formatted_json
220
-
221
-
222
- if __name__ == "__main__":
223
- # to run for debugging, navigate above sparrow_parse and run the following command:
224
- # python -m sparrow_parse.extractors.html_extractor
225
-
226
- # with open('data/invoice_1_table.txt', 'r') as file:
227
- # file_content = file.read()
228
- #
229
- # file_content = file_content.strip()[1:-1].strip()
230
- # data_list = re.split(r"',\s*'", file_content)
231
- # data_list = [item.strip(" '") for item in data_list]
232
-
233
- extractor = HTMLExtractor()
234
-
235
- # answer, targets_unprocessed = extractors.read_data(
236
- # # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
237
- # ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
238
- # 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
239
- # data_list,
240
- # 0.5,
241
- # 0.3,
242
- # # None,
243
- # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
244
- # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
245
- # True,
246
- # False,
247
- # True,
248
- # True)
249
- #
250
- # print(answer)
251
- # print(targets_unprocessed)
sparrow_parse/helpers/html_extractor_helper.py DELETED
@@ -1,374 +0,0 @@
1
- from bs4 import BeautifulSoup
2
- from sentence_transformers import SentenceTransformer, util
3
- import pandas as pd
4
- import re
5
- from io import StringIO
6
- from rich import print
7
-
8
-
9
- def merge_html_table_headers(html_table, column_keywords, similarity_threshold, debug=False):
10
- soup = BeautifulSoup(html_table, 'html.parser')
11
-
12
- # Find all thead elements
13
- theads = soup.find_all('thead')
14
-
15
- if len(theads) > 1 and column_keywords is not None:
16
- html_table = update_table_header_colspan(html_table)
17
- html_table = merge_table_header_thead(html_table)
18
- html_table = merge_colspan_columns(html_table)
19
- html_table = normalize_html_table(html_table, debug)
20
- html_table = fix_rowspan_elements(html_table)
21
- html_table = merge_rows_with_rowspan(html_table)
22
- html_table = detect_and_remove_junk_columns(html_table, column_keywords, similarity_threshold, debug)
23
- else:
24
- # If there is only one thead, return the original table
25
- return html_table
26
-
27
- return html_table
28
-
29
-
30
- def update_table_header_colspan(html_table):
31
- soup = BeautifulSoup(html_table, 'html.parser')
32
- theads = soup.find_all('thead')
33
-
34
- for thead in theads:
35
- for th in thead.find_all('th'):
36
- colspan = th.get('colspan')
37
- if colspan and int(colspan) > 1:
38
- colspan_count = int(colspan)
39
- th['colspan'] = 1
40
- for _ in range(colspan_count - 1):
41
- new_th = soup.new_tag('th')
42
- th.insert_after(new_th)
43
-
44
- return str(soup)
45
-
46
-
47
- def merge_table_header_thead(html_table):
48
- soup = BeautifulSoup(html_table, 'html.parser')
49
- theads = soup.find_all('thead')
50
-
51
- primary_thead = theads[0]
52
- secondary_thead = theads[1]
53
-
54
- primary_ths = primary_thead.find_all('th')
55
- secondary_ths = secondary_thead.find_all('th')
56
-
57
- for i, th in enumerate(primary_ths):
58
- if i < len(secondary_ths):
59
- primary_text = th.text.strip()
60
- secondary_text = secondary_ths[i].text.strip()
61
- if primary_text and secondary_text:
62
- th.string = (primary_text + ' ' + secondary_text).strip()
63
- elif not primary_text and secondary_text:
64
- th.string = secondary_text
65
- # Remove colspan and rowspan attributes
66
- th.attrs.pop('colspan', None)
67
- th.attrs.pop('rowspan', None)
68
-
69
- secondary_thead.decompose()
70
-
71
- return str(soup)
72
-
73
-
74
- def merge_colspan_columns(html_table):
75
- # Parse the HTML
76
- soup = BeautifulSoup(html_table, 'html.parser')
77
-
78
- # Process colspan attributes by adding empty <td> elements
79
- for row in soup.find_all('tr'):
80
- cols = []
81
- for cell in row.find_all(['th', 'td']):
82
- colspan = int(cell.get('colspan', 1))
83
- # Add the cell and additional empty cells if colspan is greater than 1
84
- cols.append(cell)
85
- for _ in range(colspan - 1):
86
- new_td = soup.new_tag('td')
87
- cols.append(new_td)
88
- # Remove the colspan attribute
89
- if cell.has_attr('colspan'):
90
- del cell['colspan']
91
-
92
- # Replace the row's children with the updated cells
93
- row.clear()
94
- row.extend(cols)
95
-
96
- return str(soup)
97
-
98
-
99
- def normalize_html_table(html, debug = False):
100
- soup = BeautifulSoup(html, 'html.parser')
101
-
102
- # Find the header row and count the number of cells
103
- header = soup.find('thead').find_all(['th', 'td'])
104
- header_cell_count = len(header)
105
-
106
- if debug:
107
- # Print the number of header cells
108
- print(f"Number of cells in header: {header_cell_count}")
109
-
110
- # Find all rows in the table body
111
- rows = soup.find_all('tr')
112
-
113
- for row in rows:
114
- cells = row.find_all(['td', 'th'])
115
- if len(cells) > header_cell_count:
116
- extra_cells = len(cells) - header_cell_count
117
- for cell in cells:
118
- if cell.text.strip() == '' and extra_cells > 0:
119
- cell.decompose()
120
- extra_cells -= 1
121
- elif len(cells) < header_cell_count:
122
- missing_cells = header_cell_count - len(cells)
123
- for _ in range(missing_cells):
124
- new_cell = soup.new_tag('td')
125
- row.insert(0, new_cell)
126
-
127
- return str(soup)
128
-
129
-
130
- def fix_rowspan_elements(html_table):
131
- # Parse the HTML table
132
- soup = BeautifulSoup(html_table, 'html.parser')
133
-
134
- # Find all table rows
135
- rows = soup.find_all('tr')
136
-
137
- # Dictionary to store rows with rowspan elements
138
- rowspan_dict = {}
139
-
140
- # Iterate over each row
141
- for row_index, row in enumerate(rows):
142
- # Find all cells in the row
143
- cells = row.find_all(['td', 'th'])
144
-
145
- # Iterate over each cell
146
- for cell_index, cell in enumerate(cells):
147
- # Check if the cell has a rowspan attribute
148
- if cell.has_attr('rowspan'):
149
- # Store the rowspan value and cell position
150
- rowspan_value = int(cell['rowspan'])
151
- if row_index not in rowspan_dict:
152
- rowspan_dict[row_index] = []
153
- rowspan_dict[row_index].append((cell_index, rowspan_value))
154
-
155
- # List to store the number of rows until the next rowspan row
156
- rows_below_until_next_rowspan = []
157
-
158
- # Get the sorted row indices that have rowspan elements
159
- sorted_row_indices = sorted(rowspan_dict.keys())
160
-
161
- # Calculate rows below each rowspan row until the next rowspan row
162
- for i in range(len(sorted_row_indices)):
163
- current_row = sorted_row_indices[i]
164
- if i < len(sorted_row_indices) - 1:
165
- next_row = sorted_row_indices[i + 1]
166
- rows_below = next_row - current_row - 1
167
- else:
168
- rows_below = len(rows) - current_row - 1
169
- rows_below_until_next_rowspan.append((current_row, rows_below))
170
-
171
- # Detect rows where rowspan value is incorrect
172
- rows_with_bad_rowspan = []
173
- for row_index, rows_below in rows_below_until_next_rowspan:
174
- if row_index in rowspan_dict:
175
- for cell_index, rowspan_value in rowspan_dict[row_index]:
176
- if rowspan_value - 1 < rows_below:
177
- print(f"Row {row_index} has a large rowspan value: {rowspan_value}")
178
- rows_with_bad_rowspan.append(row_index)
179
- break
180
-
181
- # Modify the HTML table to adjust the rowspan attributes
182
- for row_index in rows_with_bad_rowspan:
183
- if row_index in rowspan_dict:
184
- for cell_index, rowspan_value in rowspan_dict[row_index]:
185
- # Find the cell with the rowspan attribute
186
- cell = rows[row_index].find_all(['td', 'th'])[cell_index]
187
- # Remove the rowspan attribute
188
- del cell['rowspan']
189
- # Find the next row and assign the rowspan value
190
- next_row_index = row_index + 1
191
- if next_row_index < len(rows):
192
- next_row_cells = rows[next_row_index].find_all(['td', 'th'])
193
- if len(next_row_cells) > cell_index:
194
- next_row_cell = next_row_cells[cell_index]
195
- next_row_cell['rowspan'] = rowspan_value
196
- else:
197
- # Create a new cell if it does not exist
198
- new_cell = soup.new_tag(cell.name)
199
- new_cell['rowspan'] = rowspan_value
200
- new_cell.string = cell.string
201
- rows[next_row_index].append(new_cell)
202
-
203
- # Return the modified HTML table
204
- return str(soup)
205
-
206
-
207
- def merge_rows_with_rowspan(html):
208
- # Parse the HTML table using BeautifulSoup
209
- soup = BeautifulSoup(html, 'html.parser')
210
-
211
- # Extract the header
212
- thead = soup.find('thead')
213
-
214
- # Find all rows
215
- rows = soup.find_all('tr')
216
-
217
- result = []
218
- i = 0
219
-
220
- while i < len(rows):
221
- row = rows[i]
222
- # Check if any td in the row has a rowspan attribute
223
- for td in row.find_all('td'):
224
- if td.has_attr('rowspan'):
225
- rowspan_value = int(td['rowspan'])
226
- result.append(row)
227
-
228
- skip_concatenation = False
229
- concatenation_pairs = []
230
-
231
- # Add rows below the current row based on the rowspan number
232
- for j in range(1, rowspan_value):
233
- if i + j < len(rows):
234
- below_row = rows[i + j]
235
-
236
- # Compare cells
237
- row_cells = row.find_all('td')
238
- below_row_cells = below_row.find_all('td')
239
- min_length = min(len(row_cells), len(below_row_cells))
240
-
241
- for k in range(min_length):
242
- if is_numeric(row_cells[k].get_text(strip=True)) and is_numeric(below_row_cells[k].get_text(strip=True)):
243
- skip_concatenation = True
244
- break
245
- else:
246
- concatenation_pairs.append((row_cells[k], below_row_cells[k]))
247
-
248
- if skip_concatenation:
249
- result.append(below_row)
250
-
251
- if not skip_concatenation:
252
- for row_cell, below_row_cell in concatenation_pairs:
253
- concatenated_text = (row_cell.get_text(strip=True) + ' ' + below_row_cell.get_text(strip=True)).strip()
254
- row_cell.string = concatenated_text
255
-
256
- i += rowspan_value - 1 # Skip the rows that have been added
257
- break
258
- else:
259
- result.append(row)
260
- break
261
- i += 1
262
-
263
- # Convert result list of rows back to an HTML table string
264
- new_table_soup = BeautifulSoup(f'<table>{str(thead)}</table>', 'html.parser')
265
- tbody = new_table_soup.new_tag('tbody')
266
- new_table_soup.table.append(tbody)
267
- for row in result:
268
- for td in row.find_all('td'):
269
- if td.has_attr('rowspan'):
270
- del td['rowspan']
271
- tbody.append(row)
272
-
273
- return str(new_table_soup.table)
274
-
275
-
276
- def detect_and_remove_junk_columns(html_table, target_columns, similarity_threshold_param, debug=False):
277
- html_table = clean_html_table_header_names(html_table)
278
-
279
- # Wrap the HTML string in a StringIO object
280
- html_buffer = StringIO(html_table)
281
-
282
- # Read the HTML table
283
- df = pd.read_html(html_buffer)[0]
284
-
285
- model = SentenceTransformer('all-mpnet-base-v2')
286
-
287
- # Get the column names of the dataframe
288
- column_names = df.columns.tolist()
289
-
290
- # Calculate the similarity of each column name to the target column names
291
- target_embeddings = model.encode(target_columns)
292
- column_embeddings = model.encode(column_names)
293
-
294
- # Initialize a dictionary to store the similarity scores
295
- similarity_scores = {}
296
-
297
- # Identify junk columns based on similarity threshold
298
- junk_columns = []
299
- similarity_threshold = similarity_threshold_param
300
-
301
- for idx, col_embedding in enumerate(column_embeddings):
302
- similarities = util.pytorch_cos_sim(col_embedding, target_embeddings)[0]
303
- max_similarity = max(similarities)
304
- max_similarity_idx = similarities.argmax().item() # Get the index of the max similarity
305
- similarity_scores[column_names[idx]] = (
306
- max_similarity.item(), target_columns[max_similarity_idx]) # Store similarity score and target column name
307
- if max_similarity < similarity_threshold:
308
- junk_columns.append(column_names[idx])
309
-
310
- if debug:
311
- # Print the similarity scores for debugging purposes
312
- for column, (score, target_col) in similarity_scores.items():
313
- print(f"Column: {column}, Similarity: {score:.4f}, Target Column: {target_col}")
314
-
315
- # Handle junk columns by concatenating their values to the nearest column on the left
316
- for junk_col in junk_columns:
317
- junk_col_index = column_names.index(junk_col)
318
- if junk_col_index > 0:
319
- nearest_col = column_names[junk_col_index - 1]
320
- df[nearest_col] = df.apply(
321
- lambda row: str(row[junk_col]) if pd.isna(row[nearest_col]) and pd.notna(row[junk_col])
322
- else (str(row[nearest_col]) + ' ' + str(row[junk_col])) if pd.notna(row[junk_col])
323
- else row[nearest_col],
324
- axis=1
325
- )
326
- df.drop(columns=[junk_col], inplace=True)
327
-
328
- # Replace any remaining NaN values with empty strings
329
- df = df.fillna('')
330
-
331
- if debug:
332
- print(f"Junk columns: {junk_columns}")
333
- print(df.to_string())
334
-
335
- # Convert the result into an HTML table
336
- html_table = df.to_html(index=False)
337
-
338
- if debug:
339
- print(html_table)
340
-
341
- return html_table
342
-
343
-
344
- def clean_html_table_header_names(html_table: str) -> str:
345
- """
346
- Cleans the headers of an HTML table by removing junk characters and returns the updated HTML as a string.
347
-
348
- Parameters:
349
- html (str): The HTML content containing the table.
350
-
351
- Returns:
352
- str: The updated HTML table with cleaned headers.
353
- """
354
- # Parse the HTML table
355
- soup = BeautifulSoup(html_table, "html.parser")
356
- table = soup.find("table")
357
-
358
- # Extract the headers and clean them
359
- headers = table.find_all("th")
360
- for th in headers:
361
- if th.string:
362
- # Clean the header
363
- clean_header = re.sub(r"[^a-zA-Z0-9\s]", "", th.get_text())
364
- # Keep it empty if the cleaned name is empty
365
- th.string.replace_with(clean_header.strip() if clean_header.strip() else "")
366
-
367
- html_table = str(soup)
368
-
369
- return html_table
370
-
371
-
372
- def is_numeric(value):
373
- # Check if the value is numeric
374
- return bool(re.match(r'^\d+(?:,\d{3})*(?:\.\d+)?$', value))
sparrow_parse/processors/markdown_processor.py DELETED
@@ -1,137 +0,0 @@
1
- import pymupdf4llm
2
- import pandas as pd
3
- import re
4
- from rich.progress import Progress, SpinnerColumn, TextColumn
5
- from rich import print
6
- from bs4 import BeautifulSoup
7
-
8
-
9
- class MarkdownProcessor(object):
10
- def __init__(self):
11
- pass
12
-
13
- def extract_data(self, file_path, options, local=True, debug=False):
14
- markdown_text = self.invoke_pipeline_step(
15
- lambda: pymupdf4llm.to_markdown(file_path),
16
- "Extracting markdown text from the document...",
17
- local
18
- )
19
-
20
- content, table_content = self.invoke_pipeline_step(
21
- lambda: self.load_text_data(markdown_text, options),
22
- "Loading text data...",
23
- local
24
- )
25
-
26
- if debug:
27
- print("Data extracted from the document:")
28
- print(content)
29
- print("\n")
30
- print("Table content extracted from the document:")
31
- if table_content:
32
- print(len(table_content))
33
- print(table_content)
34
-
35
- return content, table_content
36
-
37
- def load_text_data(self, markdown_text, options):
38
- content, table_content = None, None
39
-
40
- if options is None:
41
- content = markdown_text
42
-
43
- if options and "tables" in options and "markdown" in options:
44
- content = self.extract_form_data(markdown_text)
45
- table_content = self.extract_tables(markdown_text)
46
-
47
- return content, table_content
48
-
49
- def extract_form_data(self, markdown_text):
50
- return markdown_text
51
-
52
- def extract_tables(self, markdown_text):
53
- # Regular expression to match markdown tables
54
- table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)
55
-
56
- # Find all tables in the markdown text
57
- tables = table_pattern.findall(markdown_text)
58
-
59
- html_tables = []
60
- for table_text in tables:
61
- # Split the table into lines
62
- lines = table_text.strip().split('\n')
63
-
64
- # Extract headers and rows
65
- headers = [self.clean_column_name(header.strip()) for header in lines[0].split('|') if header]
66
- rows = []
67
- for line in lines[2:]: # Skip header and separator lines
68
- row = [cell.strip() for cell in line.split('|') if cell]
69
- rows.append(row)
70
-
71
- # Convert to Pandas DataFrame
72
- df = pd.DataFrame(rows, columns=headers)
73
-
74
- # Convert DataFrame to HTML and append to the list
75
- html_table = df.to_html(index=False)
76
- if self.table_has_header(html_table):
77
- html_tables.append(html_table)
78
-
79
- return html_tables
80
-
81
- def clean_column_name(self, name):
82
- """
83
- Cleans the column name by removing spaces if the name is a single word with spaces between letters.
84
-
85
- Args:
86
- name (str): The column name to clean.
87
-
88
- Returns:
89
- str: The cleaned column name.
90
- """
91
- # Check if the name contains only letters and spaces
92
- if all(char.isalpha() or char.isspace() for char in name):
93
- # Check if it is a single word with spaces between letters
94
- parts = name.split()
95
- if len(parts) > 1 and all(len(part) == 1 for part in parts):
96
- return ''.join(parts)
97
- return name
98
-
99
- def invoke_pipeline_step(self, task_call, task_description, local):
100
- if local:
101
- with Progress(
102
- SpinnerColumn(),
103
- TextColumn("[progress.description]{task.description}"),
104
- transient=False,
105
- ) as progress:
106
- progress.add_task(description=task_description, total=None)
107
- ret = task_call()
108
- else:
109
- print(task_description)
110
- ret = task_call()
111
-
112
- return ret
113
-
114
- def table_has_header(self, table_html):
115
- soup = BeautifulSoup(table_html, 'html.parser')
116
- table = soup.find('table')
117
-
118
- # Check if the table contains a <thead> tag
119
- if table.find('thead'):
120
- return True
121
-
122
- # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
123
- if table.find_all('th'):
124
- return True
125
-
126
- return False
127
-
128
-
129
- if __name__ == "__main__":
130
- processor = MarkdownProcessor()
131
-
132
- # content, table_content = processor.extract_data(
133
- # '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
134
- # ['tables', 'markdown'],
135
- # True,
136
- # True)
137
-
sparrow_parse/processors/unstructured_processor.py DELETED
@@ -1,178 +0,0 @@
1
- import tempfile
2
- import os
3
- from unstructured.partition.pdf import partition_pdf
4
- from unstructured.partition.image import partition_image
5
- import json
6
- from unstructured.staging.base import elements_to_json
7
- from rich.progress import Progress, SpinnerColumn, TextColumn
8
- from rich import print
9
- from bs4 import BeautifulSoup
10
-
11
-
12
- class UnstructuredProcessor(object):
13
- def __init__(self):
14
- pass
15
-
16
- def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
17
- # Extracts the elements from the PDF
18
- elements = self.invoke_pipeline_step(
19
- lambda: self.process_file(file_path, strategy, model_name),
20
- "Extracting elements from the document...",
21
- local
22
- )
23
-
24
- if debug:
25
- new_extension = 'json' # You can change this to any extension you want
26
- new_file_path = self.change_file_extension(file_path, new_extension)
27
-
28
- content, table_content = self.invoke_pipeline_step(
29
- lambda: self.load_text_data(elements, new_file_path, options),
30
- "Loading text data...",
31
- local
32
- )
33
- else:
34
- with tempfile.TemporaryDirectory() as temp_dir:
35
- temp_file_path = os.path.join(temp_dir, "file_data.json")
36
-
37
- content, table_content = self.invoke_pipeline_step(
38
- lambda: self.load_text_data(elements, temp_file_path, options),
39
- "Loading text data...",
40
- local
41
- )
42
-
43
- if debug:
44
- print("Data extracted from the document:")
45
- print(content)
46
- print("\n")
47
- print("Table content extracted from the document:")
48
- if table_content:
49
- print(len(table_content))
50
- print(table_content)
51
-
52
- return content, table_content
53
-
54
- def process_file(self, file_path, strategy, model_name):
55
- elements = None
56
-
57
- if file_path.lower().endswith('.pdf'):
58
- elements = partition_pdf(
59
- filename=file_path,
60
- strategy=strategy,
61
- infer_table_structure=True,
62
- hi_res_model_name=model_name,
63
- languages=['en']
64
- )
65
- elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
66
- elements = partition_image(
67
- filename=file_path,
68
- strategy=strategy,
69
- infer_table_structure=True,
70
- hi_res_model_name=model_name,
71
- languages=['en']
72
- )
73
-
74
- return elements
75
-
76
- def change_file_extension(self, file_path, new_extension, suffix=None):
77
- # Check if the new extension starts with a dot and add one if not
78
- if not new_extension.startswith('.'):
79
- new_extension = '.' + new_extension
80
-
81
- # Split the file path into two parts: the base (everything before the last dot) and the extension
82
- # If there's no dot in the filename, it'll just return the original filename without an extension
83
- base = file_path.rsplit('.', 1)[0]
84
-
85
- # Concatenate the base with the new extension
86
- if suffix is None:
87
- new_file_path = base + new_extension
88
- else:
89
- new_file_path = base + "_" + suffix + new_extension
90
-
91
- return new_file_path
92
-
93
- def load_text_data(self, elements, file_path, options):
94
- elements_to_json(elements, filename=file_path)
95
-
96
- content, table_content = None, None
97
-
98
- if options is None:
99
- content = self.process_json_file(file_path)
100
-
101
- if options and "tables" in options and "unstructured" in options:
102
- content = self.process_json_file(file_path, "form")
103
-
104
- table_content = self.process_json_file(file_path, "table")
105
-
106
- return content, table_content
107
-
108
- def process_json_file(self, file_path, option=None):
109
- # Read the JSON file
110
- with open(file_path, 'r') as file:
111
- data = json.load(file)
112
-
113
- # Iterate over the JSON data and extract required elements
114
- extracted_elements = []
115
- for entry in data:
116
- if entry["type"] == "Table" and (option is None or option == "table" or option == "form"):
117
- table_data = entry["metadata"]["text_as_html"]
118
- if option == "table" and self.table_has_header(table_data):
119
- extracted_elements.append(table_data)
120
- if option is None or option == "form":
121
- extracted_elements.append(table_data)
122
- elif entry["type"] == "Title" and (option is None or option == "form"):
123
- extracted_elements.append(entry["text"])
124
- elif entry["type"] == "NarrativeText" and (option is None or option == "form"):
125
- extracted_elements.append(entry["text"])
126
- elif entry["type"] == "UncategorizedText" and (option is None or option == "form"):
127
- extracted_elements.append(entry["text"])
128
- elif entry["type"] == "ListItem" and (option is None or option == "form"):
129
- extracted_elements.append(entry["text"])
130
- elif entry["type"] == "Image" and (option is None or option == "form"):
131
- extracted_elements.append(entry["text"])
132
-
133
- if option is None or option == "form":
134
- # Convert list to single string with two new lines between each element
135
- extracted_data = "\n\n".join(extracted_elements)
136
- return extracted_data
137
-
138
- return extracted_elements
139
-
140
- def invoke_pipeline_step(self, task_call, task_description, local):
141
- if local:
142
- with Progress(
143
- SpinnerColumn(),
144
- TextColumn("[progress.description]{task.description}"),
145
- transient=False,
146
- ) as progress:
147
- progress.add_task(description=task_description, total=None)
148
- ret = task_call()
149
- else:
150
- print(task_description)
151
- ret = task_call()
152
-
153
- return ret
154
-
155
- def table_has_header(self, table_html):
156
- soup = BeautifulSoup(table_html, 'html.parser')
157
- table = soup.find('table')
158
-
159
- # Check if the table contains a <thead> tag
160
- if table.find('thead'):
161
- return True
162
-
163
- # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
164
- if table.find_all('th'):
165
- return True
166
-
167
- return False
168
-
169
-
170
- if __name__ == "__main__":
171
- processor = UnstructuredProcessor()
172
- # content, table_content = processor.extract_data(
173
- # '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
174
- # 'hi_res',
175
- # 'yolox',
176
- # ['tables', 'unstructured'],
177
- # True,
178
- # True)
sparrow_parse/temp.py DELETED
@@ -1,27 +0,0 @@
- # content, table_content = processor.extract_data(
- #     '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
- #     'hi_res',
- #     'yolox',
- #     # 'detectron2_onnx',
- #     ['tables', 'unstructured'],
- #     True,
- #     True)
-
- # content, table_content = processor.extract_data(
- #     '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
- #     'hi_res',
- #     'yolox',
- #     ['tables', 'unstructured'],
- #     True,
- #     True)
-
- # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
- # # Ensure the output directory exists
- # os.makedirs(output_directory, exist_ok=True)
- #
- # # Split the optimized PDF into separate pages
- # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
- #                                                                      output_directory,
- #                                                                      False)
- #
- # shutil.rmtree(temp_dir, ignore_errors=True)