sparrow-parse 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
sparrow_parse/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = '0.1.8'
1
+ __version__ = '0.1.9'
@@ -0,0 +1,143 @@
1
+ import tempfile
2
+ import os
3
+ from unstructured.partition.pdf import partition_pdf
4
+ from unstructured.partition.image import partition_image
5
+ import json
6
+ from unstructured.staging.base import elements_to_json
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn
8
+
9
+
10
+ class FileProcessor(object):
11
+ def __init__(self):
12
+ pass
13
+
14
+ def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
15
+ # Check whether the options value contains the word "tables"
16
+ extract_tables = False
17
+ if options is not None and "tables" in options:
18
+ extract_tables = True
19
+
20
+ # Extract the elements from the document (PDF or image)
21
+ elements = self.invoke_pipeline_step(
22
+ lambda: self.process_file(file_path, strategy, model_name),
23
+ "Extracting elements from the document...",
24
+ local
25
+ )
26
+
27
+ if debug:
28
+ new_extension = 'json' # You can change this to any extension you want
29
+ new_file_path = self.change_file_extension(file_path, new_extension)
30
+
31
+ content = self.invoke_pipeline_step(
32
+ lambda: self.load_text_data(elements, new_file_path, extract_tables),
33
+ "Loading text data...",
34
+ local
35
+ )
36
+ else:
37
+ with tempfile.TemporaryDirectory() as temp_dir:
38
+ temp_file_path = os.path.join(temp_dir, "file_data.json")
39
+
40
+ content = self.invoke_pipeline_step(
41
+ lambda: self.load_text_data(elements, temp_file_path, extract_tables),
42
+ "Loading text data...",
43
+ local
44
+ )
45
+
46
+ return content
47
+
48
+ def process_file(self, file_path, strategy, model_name):
49
+ elements = None
50
+
51
+ if file_path.lower().endswith('.pdf'):
52
+ elements = partition_pdf(
53
+ filename=file_path,
54
+ strategy=strategy,
55
+ infer_table_structure=True,
56
+ model_name=model_name
57
+ )
58
+ elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
59
+ elements = partition_image(
60
+ filename=file_path,
61
+ strategy=strategy,
62
+ infer_table_structure=True,
63
+ model_name=model_name
64
+ )
65
+
66
+ return elements
67
+
68
+ def change_file_extension(self, file_path, new_extension):
69
+ # Check if the new extension starts with a dot and add one if not
70
+ if not new_extension.startswith('.'):
71
+ new_extension = '.' + new_extension
72
+
73
+ # Keep only the base of the path (everything before the last dot); the old extension is discarded
74
+ # If the path contains no dot at all, rsplit returns it unchanged, so the new extension is simply appended
75
+ base = file_path.rsplit('.', 1)[0]
76
+
77
+ # Concatenate the base with the new extension
78
+ new_file_path = base + new_extension
79
+
80
+ return new_file_path
81
+
82
+ def load_text_data(self, elements, file_path, extract_tables):
83
+ elements_to_json(elements, filename=file_path)
84
+ text_file = self.process_json_file(file_path, extract_tables)
85
+
86
+ with open(text_file, 'r') as file:
87
+ content = file.read()
88
+
89
+ return content
90
+
91
+ def process_json_file(self, input_data, extract_tables):
92
+ # Read the JSON file
93
+ with open(input_data, 'r') as file:
94
+ data = json.load(file)
95
+
96
+ # Iterate over the JSON data: always keep tables (as HTML); keep title and text elements only when tables-only extraction is off
97
+ extracted_elements = []
98
+ for entry in data:
99
+ if entry["type"] == "Table":
100
+ extracted_elements.append(entry["metadata"]["text_as_html"])
101
+ elif entry["type"] == "Title" and extract_tables is False:
102
+ extracted_elements.append(entry["text"])
103
+ elif entry["type"] == "NarrativeText" and extract_tables is False:
104
+ extracted_elements.append(entry["text"])
105
+ elif entry["type"] == "UncategorizedText" and extract_tables is False:
106
+ extracted_elements.append(entry["text"])
107
+
108
+ # Write the extracted elements to the output file
109
+ new_extension = 'txt' # You can change this to any extension you want
110
+ new_file_path = self.change_file_extension(input_data, new_extension)
111
+ with open(new_file_path, 'w') as output_file:
112
+ for element in extracted_elements:
113
+ output_file.write(element + "\n\n") # Adding two newlines for separation
114
+
115
+ return new_file_path
116
+
117
+ def invoke_pipeline_step(self, task_call, task_description, local):
118
+ if local:
119
+ with Progress(
120
+ SpinnerColumn(),
121
+ TextColumn("[progress.description]{task.description}"),
122
+ transient=False,
123
+ ) as progress:
124
+ progress.add_task(description=task_description, total=None)
125
+ ret = task_call()
126
+ else:
127
+ print(task_description)
128
+ ret = task_call()
129
+
130
+ return ret
131
+
132
+
133
+ # if __name__ == "__main__":
134
+ # processor = FileProcessor()
135
+ # content = processor.extract_data('/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
136
+ # 'hi_res',
137
+ # 'yolox',
138
+ # 'tables',
139
+ # False,
140
+ # True)
141
+ # processor.extract_data("/Users/andrejb/Documents/work/lifung/lemming_test/C16E150001_SUPINV.pdf")
142
+ # processor.extract_data("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_single.pdf")
143
+ # print(content)
@@ -1,21 +1,24 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
7
7
  Keywords: llm,rag,vision
8
8
  Author: Andrej Baranovskij
9
9
  Author-email: andrejus.baranovskis@gmail.com
10
- Requires-Python: >=3.10,<4.0
10
+ Requires-Python: >=3.9,<3.12
11
11
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
12
12
  Classifier: Operating System :: OS Independent
13
13
  Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
14
15
  Classifier: Programming Language :: Python :: 3.10
15
16
  Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Topic :: Software Development
18
- Requires-Dist: requests (>=2.31.0,<3.0.0)
18
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
19
+ Requires-Dist: torch (==2.2.2)
20
+ Requires-Dist: unstructured-inference (==0.7.29)
21
+ Requires-Dist: unstructured[all-docs] (==0.13.6)
19
22
  Project-URL: Repository, https://github.com/katanaml/sparrow
20
23
  Description-Content-Type: text/markdown
21
24
 
@@ -49,7 +52,13 @@ result = processor.process_file(file_path, strategy, model_name)
49
52
  Build for development
50
53
 
51
54
  ```
55
+ poetry build
56
+ ```
57
+
58
+ Publish to PyPI
52
59
 
60
+ ```
61
+ poetry publish
53
62
  ```
54
63
 
55
64
  ## Commercial usage
@@ -0,0 +1,8 @@
1
+ sparrow_parse/__init__.py,sha256=m1D6fscvvsMhq5HVNKw7kP5M8AqEzQm1ekrn_nLQF1M,21
2
+ sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
+ sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ sparrow_parse/extractor/file_processor.py,sha256=OrDxFJVEhy_4pCBxknehAM7fxgSlWgUJ0jeTEegHRxo,5621
5
+ sparrow_parse-0.1.9.dist-info/METADATA,sha256=wK7uOpPqsC1iwZs_d5Hl1KV4DIbw8NnfPU6MBTyF_kA,3428
6
+ sparrow_parse-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
+ sparrow_parse-0.1.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
8
+ sparrow_parse-0.1.9.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- class PDFProcessor(object):
2
- def __init__(self):
3
- pass
4
-
5
- def process_file(self, content):
6
- print("Processing file...")
7
- return "OK"
@@ -1,8 +0,0 @@
1
- sparrow_parse/__init__.py,sha256=zemvJ5zjFE6SQT2xmkxc-ZYwNkUTCEX7mz3Epb2qztE,21
2
- sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
3
- sparrow_parse/pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- sparrow_parse/pdf/pdf_processor.py,sha256=hyvOQX_IydRA3z7gQs_g-Ut1hvVHRRxj1_2i-G09-ow,159
5
- sparrow_parse-0.1.8.dist-info/METADATA,sha256=QTAeFIi-KwyBvSbBrB8wS5WCld3gQ3XfAll4wS4x7Yc,3250
6
- sparrow_parse-0.1.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
- sparrow_parse-0.1.8.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
8
- sparrow_parse-0.1.8.dist-info/RECORD,,
File without changes