sparrow-parse 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/PKG-INFO +29 -22
  2. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/README.md +19 -3
  3. sparrow-parse-0.3.2/setup.cfg +4 -0
  4. sparrow-parse-0.3.2/setup.py +37 -0
  5. sparrow-parse-0.3.2/sparrow_parse/__init__.py +1 -0
  6. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/pdf_optimizer.py +3 -3
  7. sparrow-parse-0.3.2/sparrow_parse.egg-info/PKG-INFO +189 -0
  8. sparrow-parse-0.3.2/sparrow_parse.egg-info/SOURCES.txt +17 -0
  9. sparrow-parse-0.3.2/sparrow_parse.egg-info/dependency_links.txt +1 -0
  10. sparrow-parse-0.3.2/sparrow_parse.egg-info/entry_points.txt +3 -0
  11. sparrow-parse-0.3.2/sparrow_parse.egg-info/requires.txt +9 -0
  12. sparrow-parse-0.3.2/sparrow_parse.egg-info/top_level.txt +1 -0
  13. sparrow_parse-0.3.0/pyproject.toml +0 -41
  14. sparrow_parse-0.3.0/sparrow_parse/__init__.py +0 -1
  15. sparrow_parse-0.3.0/sparrow_parse/data/invoice_1_table.txt +0 -9
  16. sparrow_parse-0.3.0/sparrow_parse/extractor/__pycache__/__init__.cpython-310.pyc +0 -0
  17. sparrow_parse-0.3.0/sparrow_parse/extractor/__pycache__/extractor_helper.cpython-310.pyc +0 -0
  18. sparrow_parse-0.3.0/sparrow_parse/extractor/__pycache__/html_extractor.cpython-310.pyc +0 -0
  19. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/__main__.py +0 -0
  20. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/__init__.py +0 -0
  21. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/extractor_helper.py +0 -0
  22. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/html_extractor.py +0 -0
  23. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/markdown_processor.py +0 -0
  24. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/extractor/unstructured_processor.py +0 -0
  25. {sparrow_parse-0.3.0 → sparrow-parse-0.3.2}/sparrow_parse/temp.py +0 -0
@@ -1,30 +1,20 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
- License: GPL-3.0
7
- Keywords: llm,rag,vision
8
6
  Author: Andrej Baranovskij
9
7
  Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.9,<3.12
11
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
8
+ License: UNKNOWN
9
+ Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
+ Project-URL: Repository, https://github.com/katanaml/sparrow
11
+ Keywords: llm,rag,vision
12
+ Platform: UNKNOWN
12
13
  Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
15
  Classifier: Topic :: Software Development
18
- Requires-Dist: PyPDF2 (==3.0.1)
19
- Requires-Dist: numpy (==1.26.4)
20
- Requires-Dist: pymupdf4llm (==0.0.6)
21
- Requires-Dist: rich (>=13.7.1,<14.0.0)
22
- Requires-Dist: sentence-transformers (==3.0.1)
23
- Requires-Dist: torch (==2.2.2)
24
- Requires-Dist: transformers (==4.41.2)
25
- Requires-Dist: unstructured-inference (==0.7.33)
26
- Requires-Dist: unstructured[all-docs] (==0.14.5)
27
- Project-URL: Repository, https://github.com/katanaml/sparrow
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Requires-Python: >=3.10
28
18
  Description-Content-Type: text/markdown
29
19
 
30
20
  # Sparrow Parse
@@ -152,14 +142,30 @@ Example:
152
142
 
153
143
  ## Library build
154
144
 
145
+ Create Python virtual environment
146
+
155
147
  ```
156
- poetry build
148
+ python -m venv .env_sparrow_parse
157
149
  ```
158
150
 
159
- Publish to PyPi
151
+ Install Python libraries
160
152
 
161
153
  ```
162
- poetry publish
154
+ pip install -r requirements.txt
155
+ ```
156
+
157
+ Build package
158
+
159
+ ```
160
+ pip install setuptools wheel
161
+ python setup.py sdist bdist_wheel
162
+ ```
163
+
164
+ Upload to PyPI
165
+
166
+ ```
167
+ pip install twine
168
+ twine upload dist/*
163
169
  ```
164
170
 
165
171
  ## Commercial usage
@@ -180,3 +186,4 @@ If your organization is seeking to utilize Sparrow under a proprietary license,
180
186
 
181
187
  Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
182
188
 
189
+
@@ -123,14 +123,30 @@ Example:
123
123
 
124
124
  ## Library build
125
125
 
126
+ Create Python virtual environment
127
+
128
+ ```
129
+ python -m venv .env_sparrow_parse
130
+ ```
131
+
132
+ Install Python libraries
133
+
134
+ ```
135
+ pip install -r requirements.txt
136
+ ```
137
+
138
+ Build package
139
+
126
140
  ```
127
- poetry build
141
+ pip install setuptools wheel
142
+ python setup.py sdist bdist_wheel
128
143
  ```
129
144
 
130
- Publish to PyPi
145
+ Upload to PyPI
131
146
 
132
147
  ```
133
- poetry publish
148
+ pip install twine
149
+ twine upload dist/*
134
150
  ```
135
151
 
136
152
  ## Commercial usage
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,37 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ with open("README.md", "r", encoding="utf-8") as fh:
4
+ long_description = fh.read()
5
+
6
+ with open("requirements.txt", "r", encoding="utf-8") as fh:
7
+ requirements = fh.read().splitlines()
8
+
9
+ setup(
10
+ name="sparrow-parse",
11
+ version="0.3.2",
12
+ author="Andrej Baranovskij",
13
+ author_email="andrejus.baranovskis@gmail.com",
14
+ description="Sparrow Parse is a Python package for parsing and extracting information from documents.",
15
+ long_description=long_description,
16
+ long_description_content_type="text/markdown",
17
+ url="https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse",
18
+ project_urls={
19
+ "Homepage": "https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse",
20
+ "Repository": "https://github.com/katanaml/sparrow",
21
+ },
22
+ classifiers=[
23
+ "Operating System :: OS Independent",
24
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
25
+ "Topic :: Software Development",
26
+ "Programming Language :: Python :: 3.10",
27
+ ],
28
+ entry_points={
29
+ 'console_scripts': [
30
+ 'sparrow-parse=sparrow_parse:main',
31
+ ],
32
+ },
33
+ keywords="llm, rag, vision",
34
+ packages=find_packages(),
35
+ python_requires='>=3.10',
36
+ install_requires=requirements,
37
+ )
@@ -0,0 +1 @@
1
+ __version__ = '0.3.2'
@@ -1,4 +1,4 @@
1
- import PyPDF2
1
+ import pypdf
2
2
  from pdf2image import convert_from_path
3
3
  import os
4
4
  import tempfile
@@ -17,12 +17,12 @@ class PDFOptimizer(object):
17
17
  if not convert_to_images:
18
18
  # Open the PDF file
19
19
  with open(file_path, 'rb') as pdf_file:
20
- reader = PyPDF2.PdfReader(pdf_file)
20
+ reader = pypdf.PdfReader(pdf_file)
21
21
  number_of_pages = len(reader.pages)
22
22
 
23
23
  # Split the PDF into separate files per page
24
24
  for page_num in range(number_of_pages):
25
- writer = PyPDF2.PdfWriter()
25
+ writer = pypdf.PdfWriter()
26
26
  writer.add_page(reader.pages[page_num])
27
27
 
28
28
  output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf')
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.1
2
+ Name: sparrow-parse
3
+ Version: 0.3.2
4
+ Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
+ Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
+ Author: Andrej Baranovskij
7
+ Author-email: andrejus.baranovskis@gmail.com
8
+ License: UNKNOWN
9
+ Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
+ Project-URL: Repository, https://github.com/katanaml/sparrow
11
+ Keywords: llm,rag,vision
12
+ Platform: UNKNOWN
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
15
+ Classifier: Topic :: Software Development
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+
20
+ # Sparrow Parse
21
+
22
+ ## Description
23
+
24
+ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
25
+
26
+ ## Install
27
+
28
+ ```
29
+ pip install sparrow-parse
30
+ ```
31
+
32
+ ## Pre-processing
33
+
34
+ ### Unstructured
35
+
36
+ ```
37
+ from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
38
+
39
+ processor = UnstructuredProcessor()
40
+
41
+ content, table_content = processor.extract_data(
42
+ file_path, # file to process
43
+ strategy, # data processing strategy supported by unstructured
44
+ model_name, # model supported by unstructured
45
+ options, # table extraction into HTML format
46
+ local, # True if running from CLI, or False if running from FastAPI
47
+ debug) # Debug
48
+ ```
49
+
50
+ Example:
51
+
52
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
53
+
54
+ *strategy* - `hi_res`
55
+
56
+ *model_name* - `yolox`
57
+
58
+ *options* - `['tables', 'unstructured']`
59
+
60
+ *local* - `True`
61
+
62
+ *debug* - `True`
63
+
64
+ ### Markdown
65
+
66
+ ```
67
+ from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
68
+
69
+ processor = MarkdownProcessor()
70
+
71
+ content, table_content = processor.extract_data(
72
+ file_path, # file to process
73
+ options, # table extraction into HTML format
74
+ local, # True if running from CLI, or False if running from FastAPI
75
+ debug) # Debug
76
+ ```
77
+
78
+ Example:
79
+
80
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
81
+
82
+ *options* - `['tables', 'markdown']`
83
+
84
+ *local* - `True`
85
+
86
+ *debug* - `True`
87
+
88
+ ## Parsing and extraction
89
+
90
+ ```
91
+ from sparrow_parse.extractor.html_extractor import HTMLExtractor
92
+
93
+ extractor = HTMLExtractor()
94
+
95
+ answer, targets_unprocessed = extractor.read_data(
96
+ target_columns, # list of table columns data to fetch
97
+ data, # list of HTML tables
98
+ column_keywords, # list of valid column names, can be empty. Useful to filter junk content
99
+ group_by_rows, # JSON result grouping
100
+ update_targets, # Set to true, if page contains multiple tables with the same columns
101
+ local, # True if running from CLI, or False if running from FastAPI
102
+ debug) # Debug
103
+
104
+ ```
105
+
106
+ Example:
107
+
108
+ *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
109
+
110
+ *data* - `list of HTML tables`
111
+
112
+ *column_keywords* - `None`
113
+
114
+ *group_by_rows* - `True`
115
+
116
+ *update_targets* - `True`
117
+
118
+ *local* - `True`
119
+
120
+ *debug* - `True`
121
+
122
+ ## PDF optimization
123
+
124
+ ```
125
+ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
126
+
127
+ pdf_optimizer = PDFOptimizer()
128
+
129
+ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
130
+ output_directory,
131
+ convert_to_images)
132
+
133
+ ```
134
+
135
+ Example:
136
+
137
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
138
+
139
+ *output_directory* - set to not `None`, for debug purposes only
140
+
141
+ *convert_to_images* - default `False`, to split into PDF files
142
+
143
+ ## Library build
144
+
145
+ Create Python virtual environment
146
+
147
+ ```
148
+ python -m venv .env_sparrow_parse
149
+ ```
150
+
151
+ Install Python libraries
152
+
153
+ ```
154
+ pip install -r requirements.txt
155
+ ```
156
+
157
+ Build package
158
+
159
+ ```
160
+ pip install setuptools wheel
161
+ python setup.py sdist bdist_wheel
162
+ ```
163
+
164
+ Upload to PyPI
165
+
166
+ ```
167
+ pip install twine
168
+ twine upload dist/*
169
+ ```
170
+
171
+ ## Commercial usage
172
+
173
+ Sparrow is available under the GPL 3.0 license, promoting freedom to use, modify, and distribute the software while ensuring any modifications remain open source under the same license. This aligns with our commitment to supporting the open-source community and fostering collaboration.
174
+
175
+ Additionally, we recognize the diverse needs of organizations, including small to medium-sized enterprises (SMEs). Therefore, Sparrow is also offered for free commercial use to organizations with gross revenue below $5 million USD in the past 12 months, enabling them to leverage Sparrow without the financial burden often associated with high-quality software solutions.
176
+
177
+ For businesses that exceed this revenue threshold or require usage terms not accommodated by the GPL 3.0 license—such as integrating Sparrow into proprietary software without the obligation to disclose source code modifications—we offer dual licensing options. Dual licensing allows Sparrow to be used under a separate proprietary license, offering greater flexibility for commercial applications and proprietary integrations. This model supports both the project's sustainability and the business's needs for confidentiality and customization.
178
+
179
+ If your organization is seeking to utilize Sparrow under a proprietary license, or if you are interested in custom workflows, consulting services, or dedicated support and maintenance options, please contact us at abaranovskis@redsamuraiconsulting.com. We're here to provide tailored solutions that meet your unique requirements, ensuring you can maximize the benefits of Sparrow for your projects and workflows.
180
+
181
+ ## Author
182
+
183
+ [Katana ML](https://katanaml.io), [Andrej Baranovskij](https://github.com/abaranovskis-redsamurai)
184
+
185
+ ## License
186
+
187
+ Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
188
+
189
+
@@ -0,0 +1,17 @@
1
+ README.md
2
+ setup.py
3
+ sparrow_parse/__init__.py
4
+ sparrow_parse/__main__.py
5
+ sparrow_parse/temp.py
6
+ sparrow_parse.egg-info/PKG-INFO
7
+ sparrow_parse.egg-info/SOURCES.txt
8
+ sparrow_parse.egg-info/dependency_links.txt
9
+ sparrow_parse.egg-info/entry_points.txt
10
+ sparrow_parse.egg-info/requires.txt
11
+ sparrow_parse.egg-info/top_level.txt
12
+ sparrow_parse/extractor/__init__.py
13
+ sparrow_parse/extractor/extractor_helper.py
14
+ sparrow_parse/extractor/html_extractor.py
15
+ sparrow_parse/extractor/markdown_processor.py
16
+ sparrow_parse/extractor/pdf_optimizer.py
17
+ sparrow_parse/extractor/unstructured_processor.py
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ sparrow-parse = sparrow_parse:main
3
+
@@ -0,0 +1,9 @@
1
+ torch==2.2.2
2
+ unstructured[all-docs]==0.14.5
3
+ unstructured-inference==0.7.33
4
+ rich
5
+ pymupdf4llm==0.0.9
6
+ transformers==4.41.2
7
+ sentence-transformers==3.0.1
8
+ numpy==1.26.4
9
+ pypdf==4.3.0
@@ -0,0 +1 @@
1
+ sparrow_parse
@@ -1,41 +0,0 @@
1
- [tool.poetry]
2
- name = "sparrow-parse"
3
- version = "0.3.0"
4
- description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
5
- authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
6
- license = "GPL-3.0"
7
- readme = "README.md"
8
- homepage = "https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse"
9
- repository = "https://github.com/katanaml/sparrow"
10
- keywords = ["llm", "rag", "vision"]
11
- classifiers = [
12
- "Operating System :: OS Independent",
13
- "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
14
- "Topic :: Software Development",
15
- "Programming Language :: Python :: 3.10",
16
- ]
17
- include = [
18
- "LICENSE",
19
- ]
20
-
21
-
22
- [tool.poetry.dependencies]
23
- python = ">=3.9,<3.12"
24
- torch = {version = "2.2.2", source = "pypi"}
25
- unstructured = {version = "0.14.5", extras = ["all-docs"]}
26
- unstructured-inference = "0.7.33"
27
- rich = "^13.7.1"
28
- pymupdf4llm = "0.0.6"
29
- transformers = "4.41.2"
30
- sentence-transformers = "3.0.1"
31
- numpy = "1.26.4"
32
- PyPDF2 = "3.0.1"
33
-
34
-
35
- [tool.poetry.scripts]
36
- sparrow-parse = 'sparrow_parse:main'
37
-
38
-
39
- [build-system]
40
- requires = ["poetry-core"]
41
- build-backend = "poetry.core.masonry.api"
@@ -1 +0,0 @@
1
- __version__ = '0.3.0'
@@ -1,9 +0,0 @@
1
- [
2
- '<table><thead><th>No.</th><th>Description</th><th>Qty</th><th>UM</th><th>Net price</th><th>Net worth</th><th>VAT [%]</th><th>Gross worth</th></thead><tr><td></td><td>Wine Glasses Goblets Pair Clear
3
- Glass</td><td>5,00</td><td>eacn</td><td>12,00</td><td>60,00</td><td>10%</td><td>66,00</td></tr><tr><td></td><td>With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging
4
- Glass</td><td>4,00</td><td>eacn</td><td>28,08</td><td>112,32</td><td>10%</td><td>123,55</td></tr><tr><td></td><td>Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle
5
- Houdini</td><td>1,00</td><td>eacn</td><td>7,50</td><td>7,50</td><td>10%</td><td>8,25</td></tr><tr><td></td><td>HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4 20 FL OZ (591 ml)
6
- NEW</td><td>1,00</td><td>eacn</td><td>12,99</td><td>12,99</td><td>10%</td><td>14,29</td></tr></table>',
7
- '<table><thead><th>VAT</th><th>[%]</th><th>Net worth</th><th>VAT</th><th>Gross worth</th></thead><tr><td></td><td>10%</td><td>192,81</td><td>19,28</td><td>212,09</td></tr><tr><td colspan="2">Total</td><td>$ 192,81</td><td>$
8
- 19,28</td><td>$ 212,09</td></tr></table>'
9
- ]