sparrow-parse 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractor/pdf_optimizer.py +3 -3
- {sparrow_parse-0.3.0.dist-info → sparrow_parse-0.3.2.dist-info}/METADATA +38 -22
- {sparrow_parse-0.3.0.dist-info → sparrow_parse-0.3.2.dist-info}/RECORD +8 -8
- {sparrow_parse-0.3.0.dist-info → sparrow_parse-0.3.2.dist-info}/WHEEL +2 -1
- sparrow_parse-0.3.2.dist-info/entry_points.txt +3 -0
- sparrow_parse-0.3.2.dist-info/top_level.txt +1 -0
- sparrow_parse/data/invoice_1_table.txt +0 -9
- sparrow_parse-0.3.0.dist-info/entry_points.txt +0 -3
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = '0.3.0'
|
1
|
+
__version__ = '0.3.2'
|
@@ -1,4 +1,4 @@
|
|
1
|
-
import
|
1
|
+
import pypdf
|
2
2
|
from pdf2image import convert_from_path
|
3
3
|
import os
|
4
4
|
import tempfile
|
@@ -17,12 +17,12 @@ class PDFOptimizer(object):
|
|
17
17
|
if not convert_to_images:
|
18
18
|
# Open the PDF file
|
19
19
|
with open(file_path, 'rb') as pdf_file:
|
20
|
-
reader =
|
20
|
+
reader = pypdf.PdfReader(pdf_file)
|
21
21
|
number_of_pages = len(reader.pages)
|
22
22
|
|
23
23
|
# Split the PDF into separate files per page
|
24
24
|
for page_num in range(number_of_pages):
|
25
|
-
writer =
|
25
|
+
writer = pypdf.PdfWriter()
|
26
26
|
writer.add_page(reader.pages[page_num])
|
27
27
|
|
28
28
|
output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf')
|
@@ -1,31 +1,30 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.3.2
|
4
4
|
Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
|
-
License: GPL-3.0
|
7
|
-
Keywords: llm,rag,vision
|
8
6
|
Author: Andrej Baranovskij
|
9
7
|
Author-email: andrejus.baranovskis@gmail.com
|
10
|
-
|
11
|
-
|
8
|
+
License: UNKNOWN
|
9
|
+
Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
10
|
+
Project-URL: Repository, https://github.com/katanaml/sparrow
|
11
|
+
Keywords: llm,rag,vision
|
12
|
+
Platform: UNKNOWN
|
12
13
|
Classifier: Operating System :: OS Independent
|
13
|
-
Classifier:
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
14
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
17
15
|
Classifier: Topic :: Software Development
|
18
|
-
|
19
|
-
Requires-
|
20
|
-
Requires-Dist: pymupdf4llm (==0.0.6)
|
21
|
-
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
22
|
-
Requires-Dist: sentence-transformers (==3.0.1)
|
23
|
-
Requires-Dist: torch (==2.2.2)
|
24
|
-
Requires-Dist: transformers (==4.41.2)
|
25
|
-
Requires-Dist: unstructured-inference (==0.7.33)
|
26
|
-
Requires-Dist: unstructured[all-docs] (==0.14.5)
|
27
|
-
Project-URL: Repository, https://github.com/katanaml/sparrow
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Requires-Python: >=3.10
|
28
18
|
Description-Content-Type: text/markdown
|
19
|
+
Requires-Dist: torch ==2.2.2
|
20
|
+
Requires-Dist: unstructured[all-docs] ==0.14.5
|
21
|
+
Requires-Dist: unstructured-inference ==0.7.33
|
22
|
+
Requires-Dist: rich
|
23
|
+
Requires-Dist: pymupdf4llm ==0.0.9
|
24
|
+
Requires-Dist: transformers ==4.41.2
|
25
|
+
Requires-Dist: sentence-transformers ==3.0.1
|
26
|
+
Requires-Dist: numpy ==1.26.4
|
27
|
+
Requires-Dist: pypdf ==4.3.0
|
29
28
|
|
30
29
|
# Sparrow Parse
|
31
30
|
|
@@ -152,14 +151,30 @@ Example:
|
|
152
151
|
|
153
152
|
## Library build
|
154
153
|
|
154
|
+
Create Python virtual environment
|
155
|
+
|
155
156
|
```
|
156
|
-
|
157
|
+
python -m venv .env_sparrow_parse
|
157
158
|
```
|
158
159
|
|
159
|
-
|
160
|
+
Install Python libraries
|
160
161
|
|
161
162
|
```
|
162
|
-
|
163
|
+
pip install -r requirements.txt
|
164
|
+
```
|
165
|
+
|
166
|
+
Build package
|
167
|
+
|
168
|
+
```
|
169
|
+
pip install setuptools wheel
|
170
|
+
python setup.py sdist bdist_wheel
|
171
|
+
```
|
172
|
+
|
173
|
+
Upload to PyPI
|
174
|
+
|
175
|
+
```
|
176
|
+
pip install twine
|
177
|
+
twine upload dist/*
|
163
178
|
```
|
164
179
|
|
165
180
|
## Commercial usage
|
@@ -180,3 +195,4 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
|
|
180
195
|
|
181
196
|
Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
|
182
197
|
|
198
|
+
|
@@ -1,14 +1,14 @@
|
|
1
|
-
sparrow_parse/__init__.py,sha256=
|
1
|
+
sparrow_parse/__init__.py,sha256=64UBVh2KX7E-WVG4ZyY1dUiW9jGXZloWZk1N9nEUC2k,21
|
2
2
|
sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
|
3
|
-
sparrow_parse/
|
3
|
+
sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
|
4
4
|
sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
|
6
6
|
sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
|
7
7
|
sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
|
8
|
-
sparrow_parse/extractor/pdf_optimizer.py,sha256=
|
8
|
+
sparrow_parse/extractor/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
|
9
9
|
sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
|
10
|
-
sparrow_parse/
|
11
|
-
sparrow_parse-0.3.
|
12
|
-
sparrow_parse-0.3.
|
13
|
-
sparrow_parse-0.3.
|
14
|
-
sparrow_parse-0.3.
|
10
|
+
sparrow_parse-0.3.2.dist-info/METADATA,sha256=BA_M_vHGpbJuXvivXHJLCIejtdGHFatOrUVJve1USXY,6422
|
11
|
+
sparrow_parse-0.3.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
12
|
+
sparrow_parse-0.3.2.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
|
13
|
+
sparrow_parse-0.3.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
|
14
|
+
sparrow_parse-0.3.2.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
sparrow_parse
|
@@ -1,9 +0,0 @@
|
|
1
|
-
[
|
2
|
-
'<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
|
3
|
-
Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
|
4
|
-
Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
|
5
|
-
Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
|
6
|
-
NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
|
7
|
-
'<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
|
8
|
-
19,28</td><td>$ 212,09</td></tr></table>'
|
9
|
-
]
|