sparrow-parse 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractor/file_processor.py +143 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/METADATA +13 -4
- sparrow_parse-0.1.9.dist-info/RECORD +8 -0
- sparrow_parse/pdf/pdf_processor.py +0 -7
- sparrow_parse-0.1.8.dist-info/RECORD +0 -8
- /sparrow_parse/{pdf → extractor}/__init__.py +0 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/entry_points.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.1.8'
+__version__ = '0.1.9'
sparrow_parse/extractor/file_processor.py
ADDED
@@ -0,0 +1,143 @@
+import tempfile
+import os
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.image import partition_image
+import json
+from unstructured.staging.base import elements_to_json
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+
+class FileProcessor(object):
+    def __init__(self):
+        pass
+
+    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
+        # check if string options contains word table
+        extract_tables = False
+        if options is not None and "tables" in options:
+            extract_tables = True
+
+        # Extracts the elements from the PDF
+        elements = self.invoke_pipeline_step(
+            lambda: self.process_file(file_path, strategy, model_name),
+            "Extracting elements from the document...",
+            local
+        )
+
+        if debug:
+            new_extension = 'json'  # You can change this to any extension you want
+            new_file_path = self.change_file_extension(file_path, new_extension)
+
+            content = self.invoke_pipeline_step(
+                lambda: self.load_text_data(elements, new_file_path, extract_tables),
+                "Loading text data...",
+                local
+            )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = os.path.join(temp_dir, "file_data.json")
+
+                content = self.invoke_pipeline_step(
+                    lambda: self.load_text_data(elements, temp_file_path, extract_tables),
+                    "Loading text data...",
+                    local
+                )
+
+        return content
+
+    def process_file(self, file_path, strategy, model_name):
+        elements = None
+
+        if file_path.lower().endswith('.pdf'):
+            elements = partition_pdf(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                model_name=model_name
+            )
+        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
+            elements = partition_image(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                model_name=model_name
+            )
+
+        return elements
+
+    def change_file_extension(self, file_path, new_extension):
+        # Check if the new extension starts with a dot and add one if not
+        if not new_extension.startswith('.'):
+            new_extension = '.' + new_extension
+
+        # Split the file path into two parts: the base (everything before the last dot) and the extension
+        # If there's no dot in the filename, it'll just return the original filename without an extension
+        base = file_path.rsplit('.', 1)[0]
+
+        # Concatenate the base with the new extension
+        new_file_path = base + new_extension
+
+        return new_file_path
+
+    def load_text_data(self, elements, file_path, extract_tables):
+        elements_to_json(elements, filename=file_path)
+        text_file = self.process_json_file(file_path, extract_tables)
+
+        with open(text_file, 'r') as file:
+            content = file.read()
+
+        return content
+
+    def process_json_file(self, input_data, extract_tables):
+        # Read the JSON file
+        with open(input_data, 'r') as file:
+            data = json.load(file)
+
+        # Iterate over the JSON data and extract required table elements
+        extracted_elements = []
+        for entry in data:
+            if entry["type"] == "Table":
+                extracted_elements.append(entry["metadata"]["text_as_html"])
+            elif entry["type"] == "Title" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "NarrativeText" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "UncategorizedText" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+
+        # Write the extracted elements to the output file
+        new_extension = 'txt'  # You can change this to any extension you want
+        new_file_path = self.change_file_extension(input_data, new_extension)
+        with open(new_file_path, 'w') as output_file:
+            for element in extracted_elements:
+                output_file.write(element + "\n\n")  # Adding two newlines for separation
+
+        return new_file_path
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+
+# if __name__ == "__main__":
+#     processor = FileProcessor()
+#     content = processor.extract_data('/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
+#                                      'hi_res',
+#                                      'yolox',
+#                                      'tables',
+#                                      False,
+#                                      True)
+#     processor.extract_data("/Users/andrejb/Documents/work/lifung/lemming_test/C16E150001_SUPINV.pdf")
+#     processor.extract_data("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_single.pdf")
+#     print(content)
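The commented-out block at the bottom of the new module doubles as its only usage example. Translated into a runnable form, it looks roughly like the minimal sketch below; the invoice path is a placeholder, and 'hi_res' / 'yolox' are simply the strategy and model name the package's own example passes through to unstructured.

```
from sparrow_parse.extractor.file_processor import FileProcessor

processor = FileProcessor()

# "invoice.pdf" is a placeholder path; substitute a real PDF or image.
content = processor.extract_data(
    "invoice.pdf",  # document to parse
    "hi_res",       # unstructured partitioning strategy
    "yolox",        # table-detection model name
    "tables",       # any options string containing "tables" keeps only table HTML
    local=True,     # True shows a rich spinner; False just prints the step description
    debug=False     # False writes the intermediate JSON to a temporary directory
)
print(content)
```

When the options string does not mention tables, process_json_file returns titles, narrative text, and uncategorized text instead of table HTML.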
{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,21 +1,24 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.1.8
+Version: 0.1.9
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 License: GPL-3.0
 Keywords: llm,rag,vision
 Author: Andrej Baranovskij
 Author-email: andrejus.baranovskis@gmail.com
-Requires-Python: >=3.
+Requires-Python: >=3.9,<3.12
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development
-Requires-Dist:
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: torch (==2.2.2)
+Requires-Dist: unstructured-inference (==0.7.29)
+Requires-Dist: unstructured[all-docs] (==0.13.6)
 Project-URL: Repository, https://github.com/katanaml/sparrow
 Description-Content-Type: text/markdown
 
@@ -49,7 +52,13 @@ result = processor.process_file(file_path, strategy, model_name)
 Build for development
 
 ```
+poetry build
+```
+
+Publish to PyPi
 
+```
+poetry publish
 ```
 
 ## Commercial usage
sparrow_parse-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+sparrow_parse/__init__.py,sha256=m1D6fscvvsMhq5HVNKw7kP5M8AqEzQm1ekrn_nLQF1M,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractor/file_processor.py,sha256=OrDxFJVEhy_4pCBxknehAM7fxgSlWgUJ0jeTEegHRxo,5621
+sparrow_parse-0.1.9.dist-info/METADATA,sha256=wK7uOpPqsC1iwZs_d5Hl1KV4DIbw8NnfPU6MBTyF_kA,3428
+sparrow_parse-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sparrow_parse-0.1.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
+sparrow_parse-0.1.9.dist-info/RECORD,,
sparrow_parse-0.1.8.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-sparrow_parse/__init__.py,sha256=zemvJ5zjFE6SQT2xmkxc-ZYwNkUTCEX7mz3Epb2qztE,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/pdf/pdf_processor.py,sha256=hyvOQX_IydRA3z7gQs_g-Ut1hvVHRRxj1_2i-G09-ow,159
-sparrow_parse-0.1.8.dist-info/METADATA,sha256=QTAeFIi-KwyBvSbBrB8wS5WCld3gQ3XfAll4wS4x7Yc,3250
-sparrow_parse-0.1.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sparrow_parse-0.1.8.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
-sparrow_parse-0.1.8.dist-info/RECORD,,

/sparrow_parse/{pdf → extractor}/__init__.py
File without changes

{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/WHEEL
File without changes

{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/entry_points.txt
File without changes