sparrow-parse 0.2.9__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.2.9'
1
+ __version__ = '0.3.1'
@@ -244,7 +244,7 @@ if __name__ == "__main__":
244
244
  # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
245
245
  # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
246
246
  # True,
247
- # True,
247
+ # False,
248
248
  # True,
249
249
  # True)
250
250
  #
@@ -0,0 +1,72 @@
1
+ import pypdf
2
+ from pdf2image import convert_from_path
3
+ import os
4
+ import tempfile
5
+ import shutil
6
+
7
+
8
class PDFOptimizer(object):
    """Split a PDF document into one file per page.

    Pages are written into a freshly created temporary directory, either as
    single-page PDF files or as JPEG images.
    """

    def __init__(self):
        pass

    def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
        """Split ``file_path`` into per-page files inside a new temp directory.

        Args:
            file_path: Path of the PDF document to split.
            output_dir: Optional directory that additionally receives a copy
                of every page file (intended for debugging). It is created
                when it does not exist yet.
            convert_to_images: When ``False`` (default) every page is saved
                as ``page_<n>.pdf``; when ``True`` every page is rendered at
                400 dpi and saved as ``page_<n>.jpg``.

        Returns:
            A tuple ``(number_of_pages, output_files, temp_dir)`` where
            ``output_files`` lists the page files inside ``temp_dir`` in page
            order. The caller owns ``temp_dir`` and should remove it (for
            example with ``shutil.rmtree``) once the pages are processed.

        Raises:
            Any exception raised while reading, splitting or rendering the
            document is propagated unchanged; the temporary directory is
            removed first so a failed call does not leak it.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            if output_dir:
                # The debug folder may not exist yet; do not fail mid-split.
                os.makedirs(output_dir, exist_ok=True)

            if convert_to_images:
                output_files = self._render_pages_as_images(file_path, temp_dir)
            else:
                output_files = self._write_pages_as_pdfs(file_path, temp_dir)

            if output_dir:
                # Copy the files already produced instead of serializing or
                # re-encoding every page a second time.
                for page_file in output_files:
                    debug_output_filename = os.path.join(output_dir, os.path.basename(page_file))
                    shutil.copyfile(page_file, debug_output_filename)
        except BaseException:
            # The caller never receives temp_dir on failure, so clean it up here.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        # Return the number of pages, the list of file paths, and the temporary directory
        return len(output_files), output_files, temp_dir

    def _write_pages_as_pdfs(self, file_path, target_dir):
        """Write every page of ``file_path`` as ``page_<n>.pdf`` into ``target_dir``."""
        page_files = []
        with open(file_path, 'rb') as pdf_file:
            reader = pypdf.PdfReader(pdf_file)
            for page_index, page in enumerate(reader.pages):
                writer = pypdf.PdfWriter()
                writer.add_page(page)

                output_filename = os.path.join(target_dir, f'page_{page_index + 1}.pdf')
                with open(output_filename, 'wb') as output_file:
                    writer.write(output_file)
                page_files.append(output_filename)
        return page_files

    def _render_pages_as_images(self, file_path, target_dir):
        """Render every page of ``file_path`` as ``page_<n>.jpg`` into ``target_dir``."""
        page_files = []
        # NOTE(review): all pages are rendered at 400 dpi before any is saved,
        # as in the original code; large documents may need a lot of memory.
        images = convert_from_path(file_path, dpi=400)
        for image_index, image in enumerate(images):
            output_filename = os.path.join(target_dir, f'page_{image_index + 1}.jpg')
            image.save(output_filename, 'JPEG')
            page_files.append(output_filename)
        return page_files
58
+
59
+
60
if __name__ == "__main__":
    # Manual entry point: only the optimizer is constructed here; the sample
    # run below is kept commented out.
    pdf_optimizer = PDFOptimizer()

    # output_directory = "/Users/andrejb/Documents/work/bankstatement/output_pages"
    # # Ensure the output directory exists
    # os.makedirs(output_directory, exist_ok=True)
    #
    # # Split the optimized PDF into separate pages
    # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/bankstatement/statement.pdf",
    #                                                                      output_directory,
    #                                                                      False)
    #
    # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -176,4 +176,3 @@ if __name__ == "__main__":
176
176
  # ['tables', 'unstructured'],
177
177
  # True,
178
178
  # True)
179
-
sparrow_parse/temp.py CHANGED
@@ -13,4 +13,15 @@
13
13
  # 'yolox',
14
14
  # ['tables', 'unstructured'],
15
15
  # True,
16
- # True)
16
+ # True)
17
+
18
+ # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
19
+ # # Ensure the output directory exists
20
+ # os.makedirs(output_directory, exist_ok=True)
21
+ #
22
+ # # Split the optimized PDF into separate pages
23
+ # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
24
+ # output_directory,
25
+ # False)
26
+ #
27
+ # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -1,30 +1,30 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.2.9
3
+ Version: 0.3.1
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
- License: GPL-3.0
7
- Keywords: llm,rag,vision
8
6
  Author: Andrej Baranovskij
9
7
  Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.9,<3.12
11
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
8
+ License: UNKNOWN
9
+ Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
+ Project-URL: Repository, https://github.com/katanaml/sparrow
11
+ Keywords: llm,rag,vision
12
+ Platform: UNKNOWN
12
13
  Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
15
  Classifier: Topic :: Software Development
18
- Requires-Dist: numpy (==1.26.4)
19
- Requires-Dist: pymupdf4llm (==0.0.5)
20
- Requires-Dist: rich (>=13.7.1,<14.0.0)
21
- Requires-Dist: sentence-transformers (==3.0.1)
22
- Requires-Dist: torch (==2.2.2)
23
- Requires-Dist: transformers (==4.41.2)
24
- Requires-Dist: unstructured-inference (==0.7.33)
25
- Requires-Dist: unstructured[all-docs] (==0.14.5)
26
- Project-URL: Repository, https://github.com/katanaml/sparrow
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Requires-Python: >=3.10
27
18
  Description-Content-Type: text/markdown
19
+ Requires-Dist: torch ==2.2.2
20
+ Requires-Dist: unstructured[all-docs] ==0.14.5
21
+ Requires-Dist: unstructured-inference ==0.7.33
22
+ Requires-Dist: rich
23
+ Requires-Dist: pymupdf4llm ==0.0.9
24
+ Requires-Dist: transformers ==4.41.2
25
+ Requires-Dist: sentence-transformers ==3.0.1
26
+ Requires-Dist: numpy ==1.26.4
27
+ Requires-Dist: pypdf ==4.3.0
28
28
 
29
29
  # Sparrow Parse
30
30
 
@@ -128,6 +128,27 @@ Example:
128
128
 
129
129
  *debug* - `True`
130
130
 
131
+ ## PDF optimization
132
+
133
+ ```
134
+ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
135
+
136
+ pdf_optimizer = PDFOptimizer()
137
+
138
+ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
139
+ output_directory,
140
+ convert_to_images)
141
+
142
+ ```
143
+
144
+ Example:
145
+
146
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
147
+
148
+ *output_directory* - optional; set to a directory path (instead of `None`) to also save a copy of each page there, for debug purposes only
149
+
150
+ *convert_to_images* - default `False`, which splits the document into per-page PDF files; set to `True` to save each page as a JPEG image instead
151
+
131
152
  ## Library build
132
153
 
133
154
  ```
@@ -158,3 +179,4 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
158
179
 
159
180
  Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
160
181
 
182
+
@@ -0,0 +1,14 @@
1
+ sparrow_parse/__init__.py,sha256=2KwowXhmiT6-Bln7VPq9d9sRpAzJq9qLyclhp2KWmjA,21
2
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
+ sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
4
+ sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
6
+ sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
7
+ sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
+ sparrow_parse/extractor/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
9
+ sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
10
+ sparrow_parse-0.3.1.dist-info/METADATA,sha256=F3oN55g63Yeklp6n0O7qZvVdBM0OUs9--Ch-0kmGWxE,6190
11
+ sparrow_parse-0.3.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
12
+ sparrow_parse-0.3.1.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
13
+ sparrow_parse-0.3.1.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
14
+ sparrow_parse-0.3.1.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ sparrow-parse = sparrow_parse:main
3
+
@@ -0,0 +1 @@
1
+ sparrow_parse
@@ -1,9 +0,0 @@
1
- [
2
- '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
3
- Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
4
- Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
5
- Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
6
- NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
7
- '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
8
- 19,28</td><td>$ 212,09</td></tr></table>'
9
- ]
@@ -1,13 +0,0 @@
1
- sparrow_parse/__init__.py,sha256=e5Klz6yLU-4Ub9zxGUBo9wADAYsST38ZylTv9ze4i60,21
2
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
4
- sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
6
- sparrow_parse/extractor/html_extractor.py,sha256=rwtumbPrJoJ8UryjlrASAjAGSsu6bbu-TQm_6ouJods,9915
7
- sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
- sparrow_parse/extractor/unstructured_processor.py,sha256=z46aXacMvfW_wmsACs0LtamoMc19eogGd5fVVAj4vIo,6771
9
- sparrow_parse/temp.py,sha256=Hl1wPOEytXnfbUobU8BJgEswPsfncibbQdwrpSHtlOo,513
10
- sparrow_parse-0.2.9.dist-info/METADATA,sha256=qgLxJOpKE9X_uutQhasniDYf6kX93p5exRUTZCAYts4,5622
11
- sparrow_parse-0.2.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
12
- sparrow_parse-0.2.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
13
- sparrow_parse-0.2.9.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- sparrow-parse=sparrow_parse:main
3
-