sparrow-parse 0.2.8__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/PKG-INFO +24 -2
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/README.md +21 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/pyproject.toml +3 -2
- sparrow_parse-0.3.0/sparrow_parse/__init__.py +1 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/__pycache__/html_extractor.cpython-310.pyc +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/html_extractor.py +23 -23
- sparrow_parse-0.3.0/sparrow_parse/extractor/pdf_optimizer.py +72 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/unstructured_processor.py +0 -9
- sparrow_parse-0.3.0/sparrow_parse/temp.py +27 -0
- sparrow_parse-0.2.8/sparrow_parse/__init__.py +0 -1
- sparrow_parse-0.2.8/sparrow_parse/temp.py +0 -16
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/__main__.py +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/data/invoice_1_table.txt +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/__init__.py +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/__pycache__/__init__.cpython-310.pyc +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/__pycache__/extractor_helper.cpython-310.pyc +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/extractor_helper.py +0 -0
- {sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/markdown_processor.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.2.8
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
License: GPL-3.0
|
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.9
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.10
|
16
16
|
Classifier: Programming Language :: Python :: 3.11
|
17
17
|
Classifier: Topic :: Software Development
|
18
|
+
Requires-Dist: PyPDF2 (==3.0.1)
|
18
19
|
Requires-Dist: numpy (==1.26.4)
|
19
|
-
Requires-Dist: pymupdf4llm (==0.0.
|
20
|
+
Requires-Dist: pymupdf4llm (==0.0.6)
|
20
21
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
21
22
|
Requires-Dist: sentence-transformers (==3.0.1)
|
22
23
|
Requires-Dist: torch (==2.2.2)
|
@@ -128,6 +129,27 @@ Example:
|
|
128
129
|
|
129
130
|
*debug* - `True`
|
130
131
|
|
132
|
+
## PDF optimization
|
133
|
+
|
134
|
+
```
|
135
|
+
from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
|
136
|
+
|
137
|
+
pdf_optimizer = PDFOptimizer()
|
138
|
+
|
139
|
+
num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
|
140
|
+
output_directory,
|
141
|
+
convert_to_images)
|
142
|
+
|
143
|
+
```
|
144
|
+
|
145
|
+
Example:
|
146
|
+
|
147
|
+
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
148
|
+
|
149
|
+
*output_directory* - set to not `None`, for debug purposes only
|
150
|
+
|
151
|
+
*convert_to_images* - default `False`, to split into PDF files
|
152
|
+
|
131
153
|
## Library build
|
132
154
|
|
133
155
|
```
|
@@ -100,6 +100,27 @@ Example:
|
|
100
100
|
|
101
101
|
*debug* - `True`
|
102
102
|
|
103
|
+
## PDF optimization
|
104
|
+
|
105
|
+
```
|
106
|
+
from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
|
107
|
+
|
108
|
+
pdf_optimizer = PDFOptimizer()
|
109
|
+
|
110
|
+
num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
|
111
|
+
output_directory,
|
112
|
+
convert_to_images)
|
113
|
+
|
114
|
+
```
|
115
|
+
|
116
|
+
Example:
|
117
|
+
|
118
|
+
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
119
|
+
|
120
|
+
*output_directory* - set to not `None`, for debug purposes only
|
121
|
+
|
122
|
+
*convert_to_images* - default `False`, to split into PDF files
|
123
|
+
|
103
124
|
## Library build
|
104
125
|
|
105
126
|
```
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "sparrow-parse"
|
3
|
-
version = "0.2.8"
|
3
|
+
version = "0.3.0"
|
4
4
|
description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
|
5
5
|
authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
|
6
6
|
license = "GPL-3.0"
|
@@ -25,10 +25,11 @@ torch = {version = "2.2.2", source = "pypi"}
|
|
25
25
|
unstructured = {version = "0.14.5", extras = ["all-docs"]}
|
26
26
|
unstructured-inference = "0.7.33"
|
27
27
|
rich = "^13.7.1"
|
28
|
-
pymupdf4llm = "0.0.
|
28
|
+
pymupdf4llm = "0.0.6"
|
29
29
|
transformers = "4.41.2"
|
30
30
|
sentence-transformers = "3.0.1"
|
31
31
|
numpy = "1.26.4"
|
32
|
+
PyPDF2 = "3.0.1"
|
32
33
|
|
33
34
|
|
34
35
|
[tool.poetry.scripts]
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '0.3.0'
|
Binary file
|
@@ -224,29 +224,29 @@ if __name__ == "__main__":
|
|
224
224
|
# to run for debugging, navigate to sparrow_parse and run the following command:
|
225
225
|
# python -m extractor.html_extractor
|
226
226
|
|
227
|
-
with open('data/invoice_1_table.txt', 'r') as file:
|
228
|
-
|
229
|
-
|
230
|
-
file_content = file_content.strip()[1:-1].strip()
|
231
|
-
data_list = re.split(r"',\s*'", file_content)
|
232
|
-
data_list = [item.strip(" '") for item in data_list]
|
227
|
+
# with open('data/invoice_1_table.txt', 'r') as file:
|
228
|
+
# file_content = file.read()
|
229
|
+
#
|
230
|
+
# file_content = file_content.strip()[1:-1].strip()
|
231
|
+
# data_list = re.split(r"',\s*'", file_content)
|
232
|
+
# data_list = [item.strip(" '") for item in data_list]
|
233
233
|
|
234
234
|
extractor = HTMLExtractor()
|
235
235
|
|
236
|
-
answer, targets_unprocessed = extractor.read_data(
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
print(answer)
|
252
|
-
print(targets_unprocessed)
|
236
|
+
# answer, targets_unprocessed = extractor.read_data(
|
237
|
+
# # ['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth'],
|
238
|
+
# ['transaction_date', 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance',
|
239
|
+
# 'deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'maturity_date'],
|
240
|
+
# data_list,
|
241
|
+
# 0.5,
|
242
|
+
# 0.3,
|
243
|
+
# # None,
|
244
|
+
# ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
|
245
|
+
# 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
|
246
|
+
# True,
|
247
|
+
# False,
|
248
|
+
# True,
|
249
|
+
# True)
|
250
|
+
#
|
251
|
+
# print(answer)
|
252
|
+
# print(targets_unprocessed)
|
@@ -0,0 +1,72 @@
|
|
1
|
+
import PyPDF2
|
2
|
+
from pdf2image import convert_from_path
|
3
|
+
import os
|
4
|
+
import tempfile
|
5
|
+
import shutil
|
6
|
+
|
7
|
+
|
8
|
+
class PDFOptimizer(object):
    """Split a PDF document into one file per page (PDF or JPEG)."""

    def __init__(self):
        pass

    def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
        """Split the PDF at ``file_path`` into one file per page.

        The per-page files are written into a freshly created temporary
        directory. The caller owns that directory and is responsible for
        removing it (e.g. ``shutil.rmtree(temp_dir, ignore_errors=True)``).

        Args:
            file_path: Path of the PDF document to split.
            output_dir: Optional debug directory. When truthy, it is created
                if missing and a copy of every page file is placed there.
            convert_to_images: When ``False`` (default) each page is written
                as ``page_<n>.pdf``; when ``True`` each page is rendered and
                written as ``page_<n>.jpg``.

        Returns:
            A tuple ``(number_of_pages, output_files, temp_dir)`` where
            ``output_files`` lists the page file paths inside ``temp_dir``
            in page order.

        Raises:
            Any exception raised while reading, splitting or writing is
            propagated unchanged; the temporary directory is removed first
            so a failed call does not leak it.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            if output_dir:
                # Fail early (or create the folder) before doing any page work.
                os.makedirs(output_dir, exist_ok=True)

            if convert_to_images:
                output_files = self._write_page_images(file_path, temp_dir)
            else:
                output_files = self._write_page_pdfs(file_path, temp_dir)

            if output_dir:
                # Copy the finished files instead of serialising/encoding
                # every page a second time.
                for page_file in output_files:
                    debug_output_filename = os.path.join(output_dir, os.path.basename(page_file))
                    shutil.copyfile(page_file, debug_output_filename)
        except Exception:
            # The caller never receives temp_dir on failure, so it has to be
            # cleaned up here before re-raising.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        return len(output_files), output_files, temp_dir

    @staticmethod
    def _write_page_pdfs(file_path, target_dir):
        """Write every page of ``file_path`` as ``page_<n>.pdf`` in ``target_dir``."""
        output_files = []
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            number_of_pages = len(reader.pages)

            for page_num in range(number_of_pages):
                writer = PyPDF2.PdfWriter()
                writer.add_page(reader.pages[page_num])

                output_filename = os.path.join(target_dir, f'page_{page_num + 1}.pdf')
                with open(output_filename, 'wb') as output_file:
                    writer.write(output_file)
                output_files.append(output_filename)
        return output_files

    @staticmethod
    def _write_page_images(file_path, target_dir):
        """Render every page of ``file_path`` as ``page_<n>.jpg`` in ``target_dir``."""
        images = convert_from_path(file_path, dpi=400)

        output_files = []
        for i, image in enumerate(images):
            output_filename = os.path.join(target_dir, f'page_{i + 1}.jpg')
            image.save(output_filename, 'JPEG')
            output_files.append(output_filename)
        return output_files
|
58
|
+
|
59
|
+
|
60
|
+
if __name__ == "__main__":
|
61
|
+
pdf_optimizer = PDFOptimizer()
|
62
|
+
|
63
|
+
# output_directory = "/Users/andrejb/Documents/work/bankstatement/output_pages"
|
64
|
+
# # Ensure the output directory exists
|
65
|
+
# os.makedirs(output_directory, exist_ok=True)
|
66
|
+
#
|
67
|
+
# # Split the optimized PDF into separate pages
|
68
|
+
# num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/bankstatement/statement.pdf",
|
69
|
+
# output_directory,
|
70
|
+
# False)
|
71
|
+
#
|
72
|
+
# shutil.rmtree(temp_dir, ignore_errors=True)
|
{sparrow_parse-0.2.8 → sparrow_parse-0.3.0}/sparrow_parse/extractor/unstructured_processor.py
RENAMED
@@ -176,12 +176,3 @@ if __name__ == "__main__":
|
|
176
176
|
# ['tables', 'unstructured'],
|
177
177
|
# True,
|
178
178
|
# True)
|
179
|
-
|
180
|
-
content, table_content = processor.extract_data(
|
181
|
-
'/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
|
182
|
-
'hi_res',
|
183
|
-
'yolox',
|
184
|
-
['tables', 'unstructured'],
|
185
|
-
True,
|
186
|
-
True)
|
187
|
-
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# content, table_content = processor.extract_data(
|
2
|
+
# '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
|
3
|
+
# 'hi_res',
|
4
|
+
# 'yolox',
|
5
|
+
# # 'detectron2_onnx',
|
6
|
+
# ['tables', 'unstructured'],
|
7
|
+
# True,
|
8
|
+
# True)
|
9
|
+
|
10
|
+
# content, table_content = processor.extract_data(
|
11
|
+
# '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
|
12
|
+
# 'hi_res',
|
13
|
+
# 'yolox',
|
14
|
+
# ['tables', 'unstructured'],
|
15
|
+
# True,
|
16
|
+
# True)
|
17
|
+
|
18
|
+
# output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
|
19
|
+
# # Ensure the output directory exists
|
20
|
+
# os.makedirs(output_directory, exist_ok=True)
|
21
|
+
#
|
22
|
+
# # Split the optimized PDF into separate pages
|
23
|
+
# num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
|
24
|
+
# output_directory,
|
25
|
+
# False)
|
26
|
+
#
|
27
|
+
# shutil.rmtree(temp_dir, ignore_errors=True)
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = '0.2.8'
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# content, table_content = processor.extract_data(
|
2
|
-
# '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
|
3
|
-
# 'hi_res',
|
4
|
-
# 'yolox',
|
5
|
-
# # 'detectron2_onnx',
|
6
|
-
# ['tables', 'unstructured'],
|
7
|
-
# True,
|
8
|
-
# True)
|
9
|
-
|
10
|
-
# content, table_content = processor.extract_data(
|
11
|
-
# '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
|
12
|
-
# 'hi_res',
|
13
|
-
# 'yolox',
|
14
|
-
# ['tables', 'unstructured'],
|
15
|
-
# True,
|
16
|
-
# True)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|