PyPI - sparrow-parse - Versions diffs - 0.2.9__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

sparrow-parse 0.2.9__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

sparrow_parse/__init__.py +1 -1
sparrow_parse/extractor/html_extractor.py +1 -1
sparrow_parse/extractor/pdf_optimizer.py +72 -0
sparrow_parse/extractor/unstructured_processor.py +0 -1
sparrow_parse/temp.py +12 -1
{sparrow_parse-0.2.9.dist-info → sparrow_parse-0.3.1.dist-info}/METADATA +40 -18
sparrow_parse-0.3.1.dist-info/RECORD +14 -0
{sparrow_parse-0.2.9.dist-info → sparrow_parse-0.3.1.dist-info}/WHEEL +2 -1
sparrow_parse-0.3.1.dist-info/entry_points.txt +3 -0
sparrow_parse-0.3.1.dist-info/top_level.txt +1 -0
sparrow_parse/data/invoice_1_table.txt +0 -9
sparrow_parse-0.2.9.dist-info/RECORD +0 -13
sparrow_parse-0.2.9.dist-info/entry_points.txt +0 -3

sparrow_parse/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = '0.2.9'
1	+ __version__ = '0.3.1'

sparrow_parse/extractor/html_extractor.py CHANGED Viewed

@@ -244,7 +244,7 @@ if __name__ == "__main__":
     #     ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
     #      'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
     #     True,
-    #     True,
+    #     False,
     #     True,
     #     True)
     #

sparrow_parse/extractor/pdf_optimizer.py ADDED Viewed

@@ -0,0 +1,72 @@
+import pypdf
+from pdf2image import convert_from_path
+import os
+import tempfile
+import shutil
+class PDFOptimizer(object):
+    def __init__(self):
+        pass
+    def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
+        # Create a temporary directory
+        temp_dir = tempfile.mkdtemp()
+        output_files = []
+        if not convert_to_images:
+            # Open the PDF file
+            with open(file_path, 'rb') as pdf_file:
+                reader = pypdf.PdfReader(pdf_file)
+                number_of_pages = len(reader.pages)
+                # Split the PDF into separate files per page
+                for page_num in range(number_of_pages):
+                    writer = pypdf.PdfWriter()
+                    writer.add_page(reader.pages[page_num])
+                    output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf')
+                    with open(output_filename, 'wb') as output_file:
+                        writer.write(output_file)
+                        output_files.append(output_filename)
+                    if output_dir:
+                        # Save each page to the debug folder
+                        debug_output_filename = os.path.join(output_dir, f'page_{page_num + 1}.pdf')
+                        with open(debug_output_filename, 'wb') as output_file:
+                            writer.write(output_file)
+            # Return the number of pages, the list of file paths, and the temporary directory
+            return number_of_pages, output_files, temp_dir
+        else:
+            # Convert the PDF to images
+            images = convert_from_path(file_path, dpi=400)
+            # Save the images to the temporary directory
+            for i, image in enumerate(images):
+                output_filename = os.path.join(temp_dir, f'page_{i + 1}.jpg')
+                image.save(output_filename, 'JPEG')
+                output_files.append(output_filename)
+                if output_dir:
+                    # Save each image to the debug folder
+                    debug_output_filename = os.path.join(output_dir, f'page_{i + 1}.jpg')
+                    image.save(debug_output_filename, 'JPEG')
+            # Return the number of pages, the list of file paths, and the temporary directory
+            return len(images), output_files, temp_dir
+if __name__ == "__main__":
+    pdf_optimizer = PDFOptimizer()
+    # output_directory = "/Users/andrejb/Documents/work/bankstatement/output_pages"
+    # # Ensure the output directory exists
+    # os.makedirs(output_directory, exist_ok=True)
+    #
+    # # Split the optimized PDF into separate pages
+    # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/bankstatement/statement.pdf",
+    #                                                                      output_directory,
+    #                                                                      False)
+    #
+    # shutil.rmtree(temp_dir, ignore_errors=True)

sparrow_parse/extractor/unstructured_processor.py CHANGED Viewed

@@ -176,4 +176,3 @@ if __name__ == "__main__":
     #     ['tables', 'unstructured'],
     #     True,
     #     True)

sparrow_parse/temp.py CHANGED Viewed

@@ -13,4 +13,15 @@
     #     'yolox',
     #     ['tables', 'unstructured'],
     #     True,
-    #     True)
+    #     True)
+    # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
+    # # Ensure the output directory exists
+    # os.makedirs(output_directory, exist_ok=True)
+    #
+    # # Split the optimized PDF into separate pages
+    # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
+    #                                                                      output_directory,
+    #                                                                      False)
+    #
+    # shutil.rmtree(temp_dir, ignore_errors=True)

{sparrow_parse-0.2.9.dist-info → sparrow_parse-0.3.1.dist-info}/METADATA RENAMED Viewed

@@ -1,30 +1,30 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.2.9
+Version: 0.3.1
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
-License: GPL-3.0
-Keywords: llm,rag,vision
 Author: Andrej Baranovskij
 Author-email: andrejus.baranovskis@gmail.com
-Requires-Python: >=3.9,<3.12
-Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+License: UNKNOWN
+Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
+Project-URL: Repository, https://github.com/katanaml/sparrow
+Keywords: llm,rag,vision
+Platform: UNKNOWN
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Topic :: Software Development
-Requires-Dist: numpy (==1.26.4)
-Requires-Dist: pymupdf4llm (==0.0.5)
-Requires-Dist: rich (>=13.7.1,<14.0.0)
-Requires-Dist: sentence-transformers (==3.0.1)
-Requires-Dist: torch (==2.2.2)
-Requires-Dist: transformers (==4.41.2)
-Requires-Dist: unstructured-inference (==0.7.33)
-Requires-Dist: unstructured[all-docs] (==0.14.5)
-Project-URL: Repository, https://github.com/katanaml/sparrow
+Classifier: Programming Language :: Python :: 3.10
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: torch ==2.2.2
+Requires-Dist: unstructured[all-docs] ==0.14.5
+Requires-Dist: unstructured-inference ==0.7.33
+Requires-Dist: rich
+Requires-Dist: pymupdf4llm ==0.0.9
+Requires-Dist: transformers ==4.41.2
+Requires-Dist: sentence-transformers ==3.0.1
+Requires-Dist: numpy ==1.26.4
+Requires-Dist: pypdf ==4.3.0
 # Sparrow Parse
@@ -128,6 +128,27 @@ Example:
 *debug* - `True`
+## PDF optimization
+```
+from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
+pdf_optimizer = PDFOptimizer()
+num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
+                                                                     output_directory,
+                                                                     convert_to_images)
+```
+Example:
+*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
+*output_directory* - set to not `None`, for debug purposes only
+*convert_to_images* - default `False`, to split into PDF files
 ## Library build
 ```
@@ -158,3 +179,4 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
 Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).

sparrow_parse-0.3.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,14 @@
+sparrow_parse/__init__.py,sha256=2KwowXhmiT6-Bln7VPq9d9sRpAzJq9qLyclhp2KWmjA,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
+sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
+sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
+sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
+sparrow_parse/extractor/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
+sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
+sparrow_parse-0.3.1.dist-info/METADATA,sha256=F3oN55g63Yeklp6n0O7qZvVdBM0OUs9--Ch-0kmGWxE,6190
+sparrow_parse-0.3.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sparrow_parse-0.3.1.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
+sparrow_parse-0.3.1.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
+sparrow_parse-0.3.1.dist-info/RECORD,,

{sparrow_parse-0.2.9.dist-info → sparrow_parse-0.3.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,5 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.0
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

sparrow_parse-0.3.1.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+sparrow-parse = sparrow_parse:main

sparrow_parse-0.3.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ sparrow_parse

sparrow_parse/data/invoice_1_table.txt DELETED Viewed

@@ -1,9 +0,0 @@
-[
-    '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
-Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
-Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
-Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
-NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
-    '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
-19,28</td><td>$ 212,09</td></tr></table>'
-]

sparrow_parse-0.2.9.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-sparrow_parse/__init__.py,sha256=e5Klz6yLU-4Ub9zxGUBo9wADAYsST38ZylTv9ze4i60,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
-sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
-sparrow_parse/extractor/html_extractor.py,sha256=rwtumbPrJoJ8UryjlrASAjAGSsu6bbu-TQm_6ouJods,9915
-sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
-sparrow_parse/extractor/unstructured_processor.py,sha256=z46aXacMvfW_wmsACs0LtamoMc19eogGd5fVVAj4vIo,6771
-sparrow_parse/temp.py,sha256=Hl1wPOEytXnfbUobU8BJgEswPsfncibbQdwrpSHtlOo,513
-sparrow_parse-0.2.9.dist-info/METADATA,sha256=qgLxJOpKE9X_uutQhasniDYf6kX93p5exRUTZCAYts4,5622
-sparrow_parse-0.2.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sparrow_parse-0.2.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
-sparrow_parse-0.2.9.dist-info/RECORD,,

sparrow_parse-0.2.9.dist-info/entry_points.txt DELETED Viewed

@@ -1,3 +0,0 @@
-[console_scripts]
-sparrow-parse=sparrow_parse:main

sparrow-parse 0.2.9__py3-none-any.whl → 0.3.1__py3-none-any.whl

sparrow-parse 0.2.9__py3-none-any.whl → 0.3.1__py3-none-any.whl