sparrow-parse 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.3.0'
1
+ __version__ = '0.3.2'
@@ -1,4 +1,4 @@
1
- import PyPDF2
1
+ import pypdf
2
2
  from pdf2image import convert_from_path
3
3
  import os
4
4
  import tempfile
@@ -17,12 +17,12 @@ class PDFOptimizer(object):
17
17
  if not convert_to_images:
18
18
  # Open the PDF file
19
19
  with open(file_path, 'rb') as pdf_file:
20
- reader = PyPDF2.PdfReader(pdf_file)
20
+ reader = pypdf.PdfReader(pdf_file)
21
21
  number_of_pages = len(reader.pages)
22
22
 
23
23
  # Split the PDF into separate files per page
24
24
  for page_num in range(number_of_pages):
25
- writer = PyPDF2.PdfWriter()
25
+ writer = pypdf.PdfWriter()
26
26
  writer.add_page(reader.pages[page_num])
27
27
 
28
28
  output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf')
@@ -1,31 +1,30 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
- License: GPL-3.0
7
- Keywords: llm,rag,vision
8
6
  Author: Andrej Baranovskij
9
7
  Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.9,<3.12
11
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
8
+ License: UNKNOWN
9
+ Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
+ Project-URL: Repository, https://github.com/katanaml/sparrow
11
+ Keywords: llm,rag,vision
12
+ Platform: UNKNOWN
12
13
  Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
15
  Classifier: Topic :: Software Development
18
- Requires-Dist: PyPDF2 (==3.0.1)
19
- Requires-Dist: numpy (==1.26.4)
20
- Requires-Dist: pymupdf4llm (==0.0.6)
21
- Requires-Dist: rich (>=13.7.1,<14.0.0)
22
- Requires-Dist: sentence-transformers (==3.0.1)
23
- Requires-Dist: torch (==2.2.2)
24
- Requires-Dist: transformers (==4.41.2)
25
- Requires-Dist: unstructured-inference (==0.7.33)
26
- Requires-Dist: unstructured[all-docs] (==0.14.5)
27
- Project-URL: Repository, https://github.com/katanaml/sparrow
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Requires-Python: >=3.10
28
18
  Description-Content-Type: text/markdown
19
+ Requires-Dist: torch ==2.2.2
20
+ Requires-Dist: unstructured[all-docs] ==0.14.5
21
+ Requires-Dist: unstructured-inference ==0.7.33
22
+ Requires-Dist: rich
23
+ Requires-Dist: pymupdf4llm ==0.0.9
24
+ Requires-Dist: transformers ==4.41.2
25
+ Requires-Dist: sentence-transformers ==3.0.1
26
+ Requires-Dist: numpy ==1.26.4
27
+ Requires-Dist: pypdf ==4.3.0
29
28
 
30
29
  # Sparrow Parse
31
30
 
@@ -152,14 +151,30 @@ Example:
152
151
 
153
152
  ## Library build
154
153
 
154
+ Create Python virtual environment
155
+
155
156
  ```
156
- poetry build
157
+ python -m venv .env_sparrow_parse
157
158
  ```
158
159
 
159
- Publish to PyPi
160
+ Install Python libraries
160
161
 
161
162
  ```
162
- poetry publish
163
+ pip install -r requirements.txt
164
+ ```
165
+
166
+ Build package
167
+
168
+ ```
169
+ pip install setuptools wheel
170
+ python setup.py sdist bdist_wheel
171
+ ```
172
+
173
+ Upload to PyPI
174
+
175
+ ```
176
+ pip install twine
177
+ twine upload dist/*
163
178
  ```
164
179
 
165
180
  ## Commercial usage
@@ -180,3 +195,4 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
180
195
 
181
196
  Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
182
197
 
198
+
@@ -1,14 +1,14 @@
1
- sparrow_parse/__init__.py,sha256=gTggO06fb2c9XKEwlQYUSPlUfy82yVlM9pzLMOUqVcY,21
1
+ sparrow_parse/__init__.py,sha256=64UBVh2KX7E-WVG4ZyY1dUiW9jGXZloWZk1N9nEUC2k,21
2
2
  sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/data/invoice_1_table.txt,sha256=dsWEASxlVNidpTCQDowCM7SjaUzSqwx7DuydTfaQ7xI,1115
3
+ sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
4
4
  sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  sparrow_parse/extractor/extractor_helper.py,sha256=n9M9NyZfesiCCj3ET9WoyqRcWIFJ4k-jyQlUAarKIhE,13658
6
6
  sparrow_parse/extractor/html_extractor.py,sha256=Y9c17epY6esn1lNGhOVpzgRuolFJUUZAfZ3G9fKcArU,9916
7
7
  sparrow_parse/extractor/markdown_processor.py,sha256=dC2WUdA-v2psh7oytruftxYkXdQi72FoEYxF30ROuO0,4506
8
- sparrow_parse/extractor/pdf_optimizer.py,sha256=cgXLY7JBtxZVU4KenNIhfN2ogcZXyvD0f7SitvkbJ4o,3016
8
+ sparrow_parse/extractor/pdf_optimizer.py,sha256=KI_EweGt9Y_rDH1uCpYD5wKCW3rdjSFFhoVtiPBxX8k,3013
9
9
  sparrow_parse/extractor/unstructured_processor.py,sha256=oonkB5ALaV1pVs0a-xr8yAf-kirIabmtugHMnnEILqo,6770
10
- sparrow_parse/temp.py,sha256=gy4_mtNW_KfXn9br_suu6jHx7JKYLKs9pIOBynh_JWY,1134
11
- sparrow_parse-0.3.0.dist-info/METADATA,sha256=rmt5FkqS9RLhEf0M3oYYu8A-r9VE7Yt_ZYlSnn7AcPw,6273
12
- sparrow_parse-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
13
- sparrow_parse-0.3.0.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
14
- sparrow_parse-0.3.0.dist-info/RECORD,,
10
+ sparrow_parse-0.3.2.dist-info/METADATA,sha256=BA_M_vHGpbJuXvivXHJLCIejtdGHFatOrUVJve1USXY,6422
11
+ sparrow_parse-0.3.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
12
+ sparrow_parse-0.3.2.dist-info/entry_points.txt,sha256=8CrvTVTTcz1YuZ8aRCYNOH15ZOAaYLlcbYX3t28HwJY,54
13
+ sparrow_parse-0.3.2.dist-info/top_level.txt,sha256=n6b-WtT91zKLyCPZTP7wvne8v_yvIahcsz-4sX8I0rY,14
14
+ sparrow_parse-0.3.2.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ sparrow-parse = sparrow_parse:main
3
+
@@ -0,0 +1 @@
1
+ sparrow_parse
@@ -1,9 +0,0 @@
1
- [
2
- '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
3
- Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
4
- Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
5
- Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
6
- NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
7
- '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
8
- 19,28</td><td>$ 212,09</td></tr></table>'
9
- ]
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- sparrow-parse=sparrow_parse:main
3
-