sparrow-parse 0.2.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.2.9'
1
+ __version__ = '0.3.0'
@@ -244,7 +244,7 @@ if __name__ == "__main__":
244
244
  # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
245
245
  # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
246
246
  # True,
247
- # True,
247
+ # False,
248
248
  # True,
249
249
  # True)
250
250
  #
@@ -0,0 +1,72 @@
1
+ import PyPDF2
2
+ from pdf2image import convert_from_path
3
+ import os
4
+ import tempfile
5
+ import shutil
6
+
7
+
8
+ class PDFOptimizer(object):
9
+ def __init__(self):
10
+ pass
11
+
12
+ def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
13
+ # Create a temporary directory
14
+ temp_dir = tempfile.mkdtemp()
15
+ output_files = []
16
+
17
+ if not convert_to_images:
18
+ # Open the PDF file
19
+ with open(file_path, 'rb') as pdf_file:
20
+ reader = PyPDF2.PdfReader(pdf_file)
21
+ number_of_pages = len(reader.pages)
22
+
23
+ # Split the PDF into separate files per page
24
+ for page_num in range(number_of_pages):
25
+ writer = PyPDF2.PdfWriter()
26
+ writer.add_page(reader.pages[page_num])
27
+
28
+ output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf')
29
+ with open(output_filename, 'wb') as output_file:
30
+ writer.write(output_file)
31
+ output_files.append(output_filename)
32
+
33
+ if output_dir:
34
+ # Save each page to the debug folder
35
+ debug_output_filename = os.path.join(output_dir, f'page_{page_num + 1}.pdf')
36
+ with open(debug_output_filename, 'wb') as output_file:
37
+ writer.write(output_file)
38
+
39
+ # Return the number of pages, the list of file paths, and the temporary directory
40
+ return number_of_pages, output_files, temp_dir
41
+ else:
42
+ # Convert the PDF to images
43
+ images = convert_from_path(file_path, dpi=400)
44
+
45
+ # Save the images to the temporary directory
46
+ for i, image in enumerate(images):
47
+ output_filename = os.path.join(temp_dir, f'page_{i + 1}.jpg')
48
+ image.save(output_filename, 'JPEG')
49
+ output_files.append(output_filename)
50
+
51
+ if output_dir:
52
+ # Save each image to the debug folder
53
+ debug_output_filename = os.path.join(output_dir, f'page_{i + 1}.jpg')
54
+ image.save(debug_output_filename, 'JPEG')
55
+
56
+ # Return the number of pages, the list of file paths, and the temporary directory
57
+ return len(images), output_files, temp_dir
58
+
59
+
60
+ if __name__ == "__main__":
61
+ pdf_optimizer = PDFOptimizer()
62
+
63
+ # output_directory = "/Users/andrejb/Documents/work/bankstatement/output_pages"
64
+ # # Ensure the output directory exists
65
+ # os.makedirs(output_directory, exist_ok=True)
66
+ #
67
+ # # Split the optimized PDF into separate pages
68
+ # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/bankstatement/statement.pdf",
69
+ # output_directory,
70
+ # False)
71
+ #
72
+ # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -176,4 +176,3 @@ if __name__ == "__main__":
176
176
  # ['tables', 'unstructured'],
177
177
  # True,
178
178
  # True)
179
-
sparrow_parse/temp.py CHANGED
@@ -13,4 +13,15 @@
13
13
  # 'yolox',
14
14
  # ['tables', 'unstructured'],
15
15
  # True,
16
- # True)
16
+ # True)
17
+
18
+ # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
19
+ # # Ensure the output directory exists
20
+ # os.makedirs(output_directory, exist_ok=True)
21
+ #
22
+ # # Split the optimized PDF into separate pages
23
+ # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
24
+ # output_directory,
25
+ # False)
26
+ #
27
+ # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.9
15
15
  Classifier: Programming Language :: Python :: 3.10
16
16
  Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Topic :: Software Development
18
+ Requires-Dist: PyPDF2 (==3.0.1)
18
19
  Requires-Dist: numpy (==1.26.4)
19
- Requires-Dist: pymupdf4llm (==0.0.5)
20
+ Requires-Dist: pymupdf4llm (==0.0.6)
20
21
  Requires-Dist: rich (>=13.7.1,<14.0.0)
21
22
  Requires-Dist: sentence-transformers (==3.0.1)
22
23
  Requires-Dist: torch (==2.2.2)
@@ -128,6 +129,27 @@ Example:
128
129
 
129
130
  *debug* - `True`
130
131
 
132
+ ## PDF optimization
133
+
134
+ ```
135
+ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
136
+
137
+ pdf_optimizer = PDFOptimizer()
138
+
139
+ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
140
+ output_directory,
141
+ convert_to_images)
142
+
143
+ ```
144
+
145
+ Example:
146
+
147
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
148
+
149
+ *output_directory* - optional; when set to a directory path (anything other than `None`), a copy of each split page is also saved there, for debug purposes only
150
+
151
+ *convert_to_images* - default `False`, which splits the document into one PDF file per page; set to `True` to save each page as a JPEG image instead
152
+
131
153
  ## Library build
132
154
 
133
155
  ```
@@ -0,0 +1,14 @@
1
+ sparrow_parse/__init__.py,sha256=gTggO06fb2c9XKEwlQYUSPlUfy82yVlM9pzLMOUqVcY,21
2
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
+ sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
4
+ sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
6
+ sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
7
+ sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
+ sparrow_parse/extractor/pdf_optimizer.py,sha256=cgXLY7JBtxZVU4KenNIhfN2ogcZXyvD0f7SitvkbJ4o,3016
9
+ sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
10
+ sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
11
+ sparrow_parse-0.3.0.dist-info/METADATA,sha256=rmt5FkqS9RLhEf0M3oYYu8A-r9VE7Yt_ZYlSnn7AcPw,6273
12
+ sparrow_parse-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
13
+ sparrow_parse-0.3.0.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
14
+ sparrow_parse-0.3.0.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- sparrow_parse/__init__.py,sha256=e5Klz6yLU-4Ub9zxGUBo9wADAYsST38ZylTv9ze4i60,21
2
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
4
- sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
6
- sparrow_parse/extractor/html_extractor.py,sha256=rwtumbPrJoJ8UryjlrASAjAGSsu6bbu-TQm_6ouJods,9915
7
- sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
- sparrow_parse/extractor/unstructured_processor.py,sha256=z46aXacMvfW_wmsACs0LtamoMc19eogGd5fVVAj4vIo,6771
9
- sparrow_parse/temp.py,sha256=Hl1wPOEytXnfbUobU8BJgEswPsfncibbQdwrpSHtlOo,513
10
- sparrow_parse-0.2.9.dist-info/METADATA,sha256=qgLxJOpKE9X_uutQhasniDYf6kX93p5exRUTZCAYts4,5622
11
- sparrow_parse-0.2.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
12
- sparrow_parse-0.2.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
13
- sparrow_parse-0.2.9.dist-info/RECORD,,