sparrow-parse 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.1.7'
1
+ __version__ = '0.1.9'
@@ -0,0 +1,143 @@
1
+ import tempfile
2
+ import os
3
+ from unstructured.partition.pdf import partition_pdf
4
+ from unstructured.partition.image import partition_image
5
+ import json
6
+ from unstructured.staging.base import elements_to_json
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn
8
+
9
+
10
class FileProcessor(object):
    """Turn a PDF or image document into plain text and/or HTML tables.

    The document is partitioned into elements with ``unstructured``, the
    elements are dumped to a JSON file, and the relevant ones are filtered
    into a text file whose content is returned to the caller.
    """

    # Element types whose plain text is kept when the caller did not ask for
    # tables only.
    _TEXT_ELEMENT_TYPES = ("Title", "NarrativeText", "UncategorizedText")

    def __init__(self):
        pass

    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
        """Run the whole pipeline for one document and return its text.

        Args:
            file_path: Path to a ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png`` file.
            strategy: Passed through to the ``unstructured`` partition call.
            model_name: Passed through to the ``unstructured`` partition call.
            options: ``None`` or a string/container; when it contains
                ``"tables"`` only table elements are returned.
            local: When true, show a ``rich`` spinner for each step;
                otherwise print the step description.
            debug: When true, keep the intermediate ``.json`` and ``.txt``
                files next to ``file_path`` instead of using a temporary
                directory.

        Returns:
            The extracted elements as one string, each element followed by
            a blank line.

        Raises:
            ValueError: If ``file_path`` does not have a supported extension.
        """
        extract_tables = options is not None and "tables" in options

        elements = self.invoke_pipeline_step(
            lambda: self.process_file(file_path, strategy, model_name),
            "Extracting elements from the document...",
            local,
        )
        if elements is None:
            # process_file() returns None for extensions it does not handle;
            # fail here with a clear message instead of passing None on to
            # the JSON serialisation step.
            raise ValueError(
                "Unsupported file type: %r (expected .pdf, .jpg, .jpeg or .png)" % (file_path,)
            )

        if debug:
            json_path = self.change_file_extension(file_path, 'json')
            return self._load_text_step(elements, json_path, extract_tables, local)

        # Intermediate files are only needed while the text is being built.
        with tempfile.TemporaryDirectory() as temp_dir:
            json_path = os.path.join(temp_dir, "file_data.json")
            return self._load_text_step(elements, json_path, extract_tables, local)

    def _load_text_step(self, elements, json_path, extract_tables, local):
        """Run ``load_text_data`` as a reported pipeline step."""
        return self.invoke_pipeline_step(
            lambda: self.load_text_data(elements, json_path, extract_tables),
            "Loading text data...",
            local,
        )

    def process_file(self, file_path, strategy, model_name):
        """Partition a document into ``unstructured`` elements.

        Args:
            file_path: Path to the document; the extension (case-insensitive)
                selects the PDF or the image partitioner.
            strategy: Partitioning strategy forwarded to ``unstructured``.
            model_name: Layout model name forwarded to ``unstructured``.

        Returns:
            The elements returned by the partitioner, or ``None`` when the
            extension is not ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png``.
        """
        lowered = file_path.lower()

        if lowered.endswith('.pdf'):
            return partition_pdf(
                filename=file_path,
                strategy=strategy,
                infer_table_structure=True,
                model_name=model_name,
            )
        if lowered.endswith(('.jpg', '.jpeg', '.png')):
            return partition_image(
                filename=file_path,
                strategy=strategy,
                infer_table_structure=True,
                model_name=model_name,
            )

        return None

    def change_file_extension(self, file_path, new_extension):
        """Return ``file_path`` with its extension replaced by ``new_extension``.

        Args:
            file_path: Original path; it may have no extension at all.
            new_extension: New extension, with or without the leading dot.

        Returns:
            The path with the new extension. Only the extension of the last
            path component is replaced, so dots in directory names are left
            untouched.
        """
        if not new_extension.startswith('.'):
            new_extension = '.' + new_extension

        # os.path.splitext only looks at the final path component, unlike a
        # plain rsplit('.') which would cut "/data/v1.2/invoice" at "v1".
        base, _ = os.path.splitext(file_path)

        return base + new_extension

    def load_text_data(self, elements, file_path, extract_tables):
        """Serialise ``elements`` to JSON and return the filtered text.

        Args:
            elements: Elements produced by ``process_file``.
            file_path: Where the intermediate JSON file is written; the text
                file is written next to it with a ``.txt`` extension.
            extract_tables: When true, only table elements are kept.

        Returns:
            The content of the generated text file.
        """
        elements_to_json(elements, filename=file_path)
        text_file = self.process_json_file(file_path, extract_tables)

        with open(text_file, 'r', encoding='utf-8') as file:
            return file.read()

    def process_json_file(self, input_data, extract_tables):
        """Filter an elements JSON file into a text file.

        Table elements contribute their HTML rendering; ``Title``,
        ``NarrativeText`` and ``UncategorizedText`` elements contribute their
        text unless ``extract_tables`` is true. All other element types are
        dropped.

        Args:
            input_data: Path to the JSON file holding a list of element dicts.
            extract_tables: When true, only table elements are kept.

        Returns:
            Path of the written text file (``input_data`` with a ``.txt``
            extension).
        """
        with open(input_data, 'r', encoding='utf-8') as file:
            data = json.load(file)

        include_text = extract_tables is False

        extracted_elements = []
        for entry in data:
            element_type = entry["type"]
            if element_type == "Table":
                # A table may come without an HTML rendering (presumably when
                # table structure could not be inferred); fall back to its
                # plain text rather than failing with KeyError.
                html = entry.get("metadata", {}).get("text_as_html")
                extracted_elements.append(html if html is not None else entry["text"])
            elif include_text and element_type in self._TEXT_ELEMENT_TYPES:
                extracted_elements.append(entry["text"])

        new_file_path = self.change_file_extension(input_data, 'txt')
        with open(new_file_path, 'w', encoding='utf-8') as output_file:
            # Two newlines after each element keep them visually separated.
            output_file.writelines(element + "\n\n" for element in extracted_elements)

        return new_file_path

    def invoke_pipeline_step(self, task_call, task_description, local):
        """Run ``task_call`` while reporting ``task_description``.

        Args:
            task_call: Zero-argument callable doing the actual work.
            task_description: Text shown to the user for this step.
            local: When true, show a ``rich`` spinner while the task runs;
                otherwise just print the description first.

        Returns:
            Whatever ``task_call`` returns.
        """
        if not local:
            print(task_description)
            return task_call()

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            # total=None makes the task indeterminate (spinner only).
            progress.add_task(description=task_description, total=None)
            return task_call()
131
+
132
+
133
+ # if __name__ == "__main__":
134
+ # processor = FileProcessor()
135
+ # content = processor.extract_data('/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
136
+ # 'hi_res',
137
+ # 'yolox',
138
+ # 'tables',
139
+ # False,
140
+ # True)
141
+ # processor.extract_data("/Users/andrejb/Documents/work/lifung/lemming_test/C16E150001_SUPINV.pdf")
142
+ # processor.extract_data("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_single.pdf")
143
+ # print(content)
@@ -0,0 +1,81 @@
1
+ Metadata-Version: 2.1
2
+ Name: sparrow-parse
3
+ Version: 0.1.9
4
+ Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
+ Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
+ License: GPL-3.0
7
+ Keywords: llm,rag,vision
8
+ Author: Andrej Baranovskij
9
+ Author-email: andrejus.baranovskis@gmail.com
10
+ Requires-Python: >=3.9,<3.12
11
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Topic :: Software Development
18
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
19
+ Requires-Dist: torch (==2.2.2)
20
+ Requires-Dist: unstructured-inference (==0.7.29)
21
+ Requires-Dist: unstructured[all-docs] (==0.13.6)
22
+ Project-URL: Repository, https://github.com/katanaml/sparrow
23
+ Description-Content-Type: text/markdown
24
+
25
+ # Sparrow Parse
26
+
27
+ ## Description
28
+
29
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/), which provides helper methods for data pre-processing.
30
+
31
+ ## Install
32
+
33
+ ```
34
+ pip install sparrow-parse
35
+ ```
36
+
37
+ ## Use
38
+
39
+ Import
40
+
41
+ ```
42
+ from sparrow_parse.extractor.file_processor import FileProcessor
43
+ ```
44
+
45
+ Usage
46
+
47
+ ```
48
+ processor = FileProcessor()
49
+ result = processor.process_file(file_path, strategy, model_name)
50
+ ```
51
+
52
+ Build for development
53
+
54
+ ```
55
+ poetry build
56
+ ```
57
+
58
+ Publish to PyPI
59
+
60
+ ```
61
+ poetry publish
62
+ ```
63
+
64
+ ## Commercial usage
65
+
66
+ Sparrow is available under the GPL 3.0 license, promoting freedom to use, modify, and distribute the software while ensuring any modifications remain open source under the same license. This aligns with our commitment to supporting the open-source community and fostering collaboration.
67
+
68
+ Additionally, we recognize the diverse needs of organizations, including small to medium-sized enterprises (SMEs). Therefore, Sparrow is also offered for free commercial use to organizations with gross revenue below $5 million USD in the past 12 months, enabling them to leverage Sparrow without the financial burden often associated with high-quality software solutions.
69
+
70
+ For businesses that exceed this revenue threshold or require usage terms not accommodated by the GPL 3.0 license—such as integrating Sparrow into proprietary software without the obligation to disclose source code modifications—we offer dual licensing options. Dual licensing allows Sparrow to be used under a separate proprietary license, offering greater flexibility for commercial applications and proprietary integrations. This model supports both the project's sustainability and the business's needs for confidentiality and customization.
71
+
72
+ If your organization is seeking to utilize Sparrow under a proprietary license, or if you are interested in custom workflows, consulting services, or dedicated support and maintenance options, please contact us at abaranovskis@redsamuraiconsulting.com. We're here to provide tailored solutions that meet your unique requirements, ensuring you can maximize the benefits of Sparrow for your projects and workflows.
73
+
74
+ ## Author
75
+
76
+ [Katana ML](https://katanaml.io), [Andrej Baranovskij](https://github.com/abaranovskis-redsamurai)
77
+
78
+ ## License
79
+
80
+ Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
81
+
@@ -0,0 +1,8 @@
1
+ sparrow_parse/__init__.py,sha256=m1D6fscvvsMhq5HVNKw7kP5M8AqEzQm1ekrn_nLQF1M,21
2
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
+ sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ sparrow_parse/extractor/file_processor.py,sha256=OrDxFJVEhy_4pCBxknehAM7fxgSlWgUJ0jeTEegHRxo,5621
5
+ sparrow_parse-0.1.9.dist-info/METADATA,sha256=wK7uOpPqsC1iwZs_d5Hl1KV4DIbw8NnfPU6MBTyF_kA,3428
6
+ sparrow_parse-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
+ sparrow_parse-0.1.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
8
+ sparrow_parse-0.1.9.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
class PDFProcessor(object):
    """Placeholder processor that only reports that it was invoked."""

    def __init__(self):
        pass

    def process_file(self, content):
        """Print a progress message and return the fixed status ``"OK"``.

        ``content`` is accepted but not used.
        """
        status = "OK"
        print("Processing file...")
        return status
@@ -1,28 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: sparrow-parse
3
- Version: 0.1.7
4
- Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
- Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
- License: GPL-3.0
7
- Keywords: llm,rag,vision
8
- Author: Andrej Baranovskij
9
- Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.10,<4.0
11
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
12
- Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: Topic :: Software Development
18
- Requires-Dist: requests (>=2.31.0,<3.0.0)
19
- Project-URL: Repository, https://github.com/katanaml/sparrow
20
- Description-Content-Type: text/markdown
21
-
22
- ## Author
23
-
24
- [Katana ML](https://katanaml.io), [Andrej Baranovskij](https://github.com/abaranovskis-redsamurai)
25
-
26
- ## License
27
-
28
- Licensed under the GPL 3.0. Copyright 2020-2024 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
@@ -1,8 +0,0 @@
1
- sparrow_parse/__init__.py,sha256=V7LnX330m3uiAO0EYQbPUYETPj2br2y1Pv-a7ApMj40,21
2
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/pdf/pdf_processor.py,sha256=hyvOQX_IydRA3z7gQs_g-Ut1hvVHRRxj1_2i-G09-ow,159
5
- sparrow_parse-0.1.7.dist-info/METADATA,sha256=5w7-jeqUA3VKEHpaS4lpkwgVFXUW68KONbXEThyjJX4,1165
6
- sparrow_parse-0.1.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
- sparrow_parse-0.1.7.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
8
- sparrow_parse-0.1.7.dist-info/RECORD,,
File without changes