sparrow-parse 0.3.4-py3-none-any.whl → 0.3.6-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
sparrow_parse/processors/markdown_processor.py DELETED
@@ -1,137 +0,0 @@
- import pymupdf4llm
- import pandas as pd
- import re
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from rich import print
- from bs4 import BeautifulSoup
-
-
- class MarkdownProcessor(object):
-     def __init__(self):
-         pass
-
-     def extract_data(self, file_path, options, local=True, debug=False):
-         markdown_text = self.invoke_pipeline_step(
-             lambda: pymupdf4llm.to_markdown(file_path),
-             "Extracting markdown text from the document...",
-             local
-         )
-
-         content, table_content = self.invoke_pipeline_step(
-             lambda: self.load_text_data(markdown_text, options),
-             "Loading text data...",
-             local
-         )
-
-         if debug:
-             print("Data extracted from the document:")
-             print(content)
-             print("\n")
-             print("Table content extracted from the document:")
-             if table_content:
-                 print(len(table_content))
-                 print(table_content)
-
-         return content, table_content
-
-     def load_text_data(self, markdown_text, options):
-         content, table_content = None, None
-
-         if options is None:
-             content = markdown_text
-
-         if options and "tables" in options and "markdown" in options:
-             content = self.extract_form_data(markdown_text)
-             table_content = self.extract_tables(markdown_text)
-
-         return content, table_content
-
-     def extract_form_data(self, markdown_text):
-         return markdown_text
-
-     def extract_tables(self, markdown_text):
-         # Regular expression to match markdown tables
-         table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)
-
-         # Find all tables in the markdown text
-         tables = table_pattern.findall(markdown_text)
-
-         html_tables = []
-         for table_text in tables:
-             # Split the table into lines
-             lines = table_text.strip().split('\n')
-
-             # Extract headers and rows
-             headers = [self.clean_column_name(header.strip()) for header in lines[0].split('|') if header]
-             rows = []
-             for line in lines[2:]:  # Skip header and separator lines
-                 row = [cell.strip() for cell in line.split('|') if cell]
-                 rows.append(row)
-
-             # Convert to Pandas DataFrame
-             df = pd.DataFrame(rows, columns=headers)
-
-             # Convert DataFrame to HTML and append to the list
-             html_table = df.to_html(index=False)
-             if self.table_has_header(html_table):
-                 html_tables.append(html_table)
-
-         return html_tables
-
-     def clean_column_name(self, name):
-         """
-         Cleans the column name by removing spaces if the name is a single word with spaces between letters.
-
-         Args:
-             name (str): The column name to clean.
-
-         Returns:
-             str: The cleaned column name.
-         """
-         # Check if the name contains only letters and spaces
-         if all(char.isalpha() or char.isspace() for char in name):
-             # Check if it is a single word with spaces between letters
-             parts = name.split()
-             if len(parts) > 1 and all(len(part) == 1 for part in parts):
-                 return ''.join(parts)
-         return name
-
-     def invoke_pipeline_step(self, task_call, task_description, local):
-         if local:
-             with Progress(
-                 SpinnerColumn(),
-                 TextColumn("[progress.description]{task.description}"),
-                 transient=False,
-             ) as progress:
-                 progress.add_task(description=task_description, total=None)
-                 ret = task_call()
-         else:
-             print(task_description)
-             ret = task_call()
-
-         return ret
-
-     def table_has_header(self, table_html):
-         soup = BeautifulSoup(table_html, 'html.parser')
-         table = soup.find('table')
-
-         # Check if the table contains a <thead> tag
-         if table.find('thead'):
-             return True
-
-         # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
-         if table.find_all('th'):
-             return True
-
-         return False
-
-
- if __name__ == "__main__":
-     processor = MarkdownProcessor()
-
-     # content, table_content = processor.extract_data(
-     #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
-     #     ['tables', 'markdown'],
-     #     True,
-     #     True)
-
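The most brittle part of the deleted MarkdownProcessor is the table regex: the lookahead (?=\|.*TOTAL) means extract_tables() only captures a table when it is immediately followed by a row containing "TOTAL", and the capture stops before that row. A minimal sketch of that behavior, using a made-up invoice-style table (the sample data is illustrative, not from the package):

import re

# Pattern copied from the deleted extract_tables(): header row, separator row,
# then a lazy run of body rows, captured only when a "TOTAL" row follows.
table_pattern = re.compile(r'(\|.+\|\n\|[-| ]+\|\n(?:\|.*\|\n)*?)(?=\|.*TOTAL)', re.MULTILINE)

sample = (
    "| Item | Qty | Price |\n"
    "|------|-----|-------|\n"
    "| Pen | 2 | 1.50 |\n"
    "| TOTAL | | 3.00 |\n"
)

for table_text in table_pattern.findall(sample):
    print(table_text)  # header + separator + body rows, without the TOTAL row

A table with no trailing TOTAL row is skipped entirely, which effectively limits the processor to invoice-style documents.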
sparrow_parse/processors/unstructured_processor.py DELETED
@@ -1,178 +0,0 @@
- import tempfile
- import os
- from unstructured.partition.pdf import partition_pdf
- from unstructured.partition.image import partition_image
- import json
- from unstructured.staging.base import elements_to_json
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from rich import print
- from bs4 import BeautifulSoup
-
-
- class UnstructuredProcessor(object):
-     def __init__(self):
-         pass
-
-     def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
-         # Extracts the elements from the PDF
-         elements = self.invoke_pipeline_step(
-             lambda: self.process_file(file_path, strategy, model_name),
-             "Extracting elements from the document...",
-             local
-         )
-
-         if debug:
-             new_extension = 'json'  # You can change this to any extension you want
-             new_file_path = self.change_file_extension(file_path, new_extension)
-
-             content, table_content = self.invoke_pipeline_step(
-                 lambda: self.load_text_data(elements, new_file_path, options),
-                 "Loading text data...",
-                 local
-             )
-         else:
-             with tempfile.TemporaryDirectory() as temp_dir:
-                 temp_file_path = os.path.join(temp_dir, "file_data.json")
-
-                 content, table_content = self.invoke_pipeline_step(
-                     lambda: self.load_text_data(elements, temp_file_path, options),
-                     "Loading text data...",
-                     local
-                 )
-
-         if debug:
-             print("Data extracted from the document:")
-             print(content)
-             print("\n")
-             print("Table content extracted from the document:")
-             if table_content:
-                 print(len(table_content))
-                 print(table_content)
-
-         return content, table_content
-
-     def process_file(self, file_path, strategy, model_name):
-         elements = None
-
-         if file_path.lower().endswith('.pdf'):
-             elements = partition_pdf(
-                 filename=file_path,
-                 strategy=strategy,
-                 infer_table_structure=True,
-                 hi_res_model_name=model_name,
-                 languages=['en']
-             )
-         elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
-             elements = partition_image(
-                 filename=file_path,
-                 strategy=strategy,
-                 infer_table_structure=True,
-                 hi_res_model_name=model_name,
-                 languages=['en']
-             )
-
-         return elements
-
-     def change_file_extension(self, file_path, new_extension, suffix=None):
-         # Check if the new extension starts with a dot and add one if not
-         if not new_extension.startswith('.'):
-             new_extension = '.' + new_extension
-
-         # Split the file path into two parts: the base (everything before the last dot) and the extension
-         # If there's no dot in the filename, it'll just return the original filename without an extension
-         base = file_path.rsplit('.', 1)[0]
-
-         # Concatenate the base with the new extension
-         if suffix is None:
-             new_file_path = base + new_extension
-         else:
-             new_file_path = base + "_" + suffix + new_extension
-
-         return new_file_path
-
-     def load_text_data(self, elements, file_path, options):
-         elements_to_json(elements, filename=file_path)
-
-         content, table_content = None, None
-
-         if options is None:
-             content = self.process_json_file(file_path)
-
-         if options and "tables" in options and "unstructured" in options:
-             content = self.process_json_file(file_path, "form")
-
-             table_content = self.process_json_file(file_path, "table")
-
-         return content, table_content
-
-     def process_json_file(self, file_path, option=None):
-         # Read the JSON file
-         with open(file_path, 'r') as file:
-             data = json.load(file)
-
-         # Iterate over the JSON data and extract required elements
-         extracted_elements = []
-         for entry in data:
-             if entry["type"] == "Table" and (option is None or option == "table" or option == "form"):
-                 table_data = entry["metadata"]["text_as_html"]
-                 if option == "table" and self.table_has_header(table_data):
-                     extracted_elements.append(table_data)
-                 if option is None or option == "form":
-                     extracted_elements.append(table_data)
-             elif entry["type"] == "Title" and (option is None or option == "form"):
-                 extracted_elements.append(entry["text"])
-             elif entry["type"] == "NarrativeText" and (option is None or option == "form"):
-                 extracted_elements.append(entry["text"])
-             elif entry["type"] == "UncategorizedText" and (option is None or option == "form"):
-                 extracted_elements.append(entry["text"])
-             elif entry["type"] == "ListItem" and (option is None or option == "form"):
-                 extracted_elements.append(entry["text"])
-             elif entry["type"] == "Image" and (option is None or option == "form"):
-                 extracted_elements.append(entry["text"])
-
-         if option is None or option == "form":
-             # Convert list to single string with two new lines between each element
-             extracted_data = "\n\n".join(extracted_elements)
-             return extracted_data
-
-         return extracted_elements
-
-     def invoke_pipeline_step(self, task_call, task_description, local):
-         if local:
-             with Progress(
-                 SpinnerColumn(),
-                 TextColumn("[progress.description]{task.description}"),
-                 transient=False,
-             ) as progress:
-                 progress.add_task(description=task_description, total=None)
-                 ret = task_call()
-         else:
-             print(task_description)
-             ret = task_call()
-
-         return ret
-
-     def table_has_header(self, table_html):
-         soup = BeautifulSoup(table_html, 'html.parser')
-         table = soup.find('table')
-
-         # Check if the table contains a <thead> tag
-         if table.find('thead'):
-             return True
-
-         # Check if the table contains any <th> tags inside the table (in case there's no <thead>)
-         if table.find_all('th'):
-             return True
-
-         return False
-
-
- if __name__ == "__main__":
-     processor = UnstructuredProcessor()
-     # content, table_content = processor.extract_data(
-     #     '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
-     #     'hi_res',
-     #     'yolox',
-     #     ['tables', 'unstructured'],
-     #     True,
-     #     True)
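The commented-out block at the bottom of the deleted module shows how the class was driven. A runnable sketch of the same call, assuming unstructured is installed with its hi-res extras and that the file path below points at a real local document (both are assumptions, not part of the package):

# Hypothetical invocation mirroring the commented-out example above.
processor = UnstructuredProcessor()
content, table_content = processor.extract_data(
    "data/invoice_1.pdf",        # assumed sample document
    "hi_res",                    # unstructured partitioning strategy
    "yolox",                     # hi-res table-detection model name
    ["tables", "unstructured"],  # options: form text plus HTML tables
    True,                        # local: show a rich progress spinner
    True,                        # debug: also write <file>.json and print results
)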
sparrow_parse/temp.py DELETED
@@ -1,27 +0,0 @@
- # content, table_content = processor.extract_data(
- #     '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
- #     'hi_res',
- #     'yolox',
- #     # 'detectron2_onnx',
- #     ['tables', 'unstructured'],
- #     True,
- #     True)
-
- # content, table_content = processor.extract_data(
- #     '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
- #     'hi_res',
- #     'yolox',
- #     ['tables', 'unstructured'],
- #     True,
- #     True)
-
- # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
- # # Ensure the output directory exists
- # os.makedirs(output_directory, exist_ok=True)
- #
- # # Split the optimized PDF into separate pages
- # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
- #                                                                      output_directory,
- #                                                                      False)
- #
- # shutil.rmtree(temp_dir, ignore_errors=True)
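temp.py was scratch code, but its last block is the only glimpse in this diff of the pdf_optimizer helper (the module itself, sparrow_parse/helpers/pdf_optimizer.py, is not shown). A speculative sketch of a split_pdf_to_pages with the same return shape (num_pages, output_files, temp_dir), built on pypdf; everything beyond what the call site above implies is an assumption:

import os
import tempfile
from pypdf import PdfReader, PdfWriter  # assumed dependency, not confirmed by this diff

def split_pdf_to_pages(file_path, output_dir, convert_pages=False):
    # convert_pages is a placeholder name; the meaning of the real helper's
    # third argument (False at the call site above) is unknown.
    reader = PdfReader(file_path)
    temp_dir = tempfile.mkdtemp()  # returned so the caller can shutil.rmtree() it
    output_files = []
    for i, page in enumerate(reader.pages):
        # Write each page to its own single-page PDF in the output directory.
        writer = PdfWriter()
        writer.add_page(page)
        out_path = os.path.join(output_dir, f"page_{i + 1}.pdf")
        with open(out_path, "wb") as f:
            writer.write(f)
        output_files.append(out_path)
    return len(reader.pages), output_files, temp_dir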
sparrow_parse-0.3.4.dist-info/RECORD DELETED
@@ -1,23 +0,0 @@
- sparrow_parse/__init__.py,sha256=SH0xuWVUkyLHZJwWBZ8GJoeliTeYFcqA6TWJgrkLv-U,21
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
- sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
- sparrow_parse/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/extractors/html_extractor.py,sha256=qe9Oz7J-GiIE8G1kIDMOeh96xe6P59Gyh5SjgV3v2c8,9977
- sparrow_parse/extractors/vllm_extractor.py,sha256=Qwmf-SW4z_UstiiynX5TkyovlkokVhLuzcbUVZ16TXM,1540
- sparrow_parse/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/helpers/html_extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
- sparrow_parse/helpers/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
- sparrow_parse/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/processors/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
- sparrow_parse/processors/table_structure_processor.py,sha256=bG_6jx66n_KNdY_O6hrZD1D4DHX5Qy__RYcKHmrSGnc,23894
- sparrow_parse/processors/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
- sparrow_parse/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sparrow_parse/vllm/huggingface_inference.py,sha256=Q2Ju65LDzbO-8RWW7cXzrR-pbZ1zKuPVODlKOTWKg_E,1114
- sparrow_parse/vllm/inference_base.py,sha256=W0N2khehGdF1XHzZACG3I1UZaydHMk6BZgWNvaJD4Ck,197
- sparrow_parse/vllm/inference_factory.py,sha256=r04e95uPWG5l8Q23yeDqKmvFxLyF991aA2m0hfBTNn8,993
- sparrow_parse/vllm/local_gpu_inference.py,sha256=I_uWYiFAQhRrykOKbVz69NzftDxuemDKtAye4kWhtnU,617
- sparrow_parse-0.3.4.dist-info/METADATA,sha256=L7qXKxktk42gUQlBlZAdzHQqfORoC6vBwRCd-VSwv3Y,7444
- sparrow_parse-0.3.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- sparrow_parse-0.3.4.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
- sparrow_parse-0.3.4.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
- sparrow_parse-0.3.4.dist-info/RECORD,,