PySAR 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +53 -0
- pySAR/__init__.py +28 -0
- pySAR/descriptors.py +2893 -0
- pySAR/encoding.py +986 -0
- pySAR/evaluate.py +231 -0
- pySAR/globals_.py +21 -0
- pySAR/model.py +559 -0
- pySAR/plots.py +92 -0
- pySAR/py.typed +0 -0
- pySAR/pyDSP.py +582 -0
- pySAR/pySAR.py +962 -0
- pySAR/utils.py +283 -0
- pysar-2.5.0.dist-info/METADATA +740 -0
- pysar-2.5.0.dist-info/RECORD +17 -0
- pysar-2.5.0.dist-info/WHEEL +5 -0
- pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
- pysar-2.5.0.dist-info/top_level.txt +2 -0
pySAR/utils.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# Utilities Module ##################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import os
|
|
8
|
+
import csv
|
|
9
|
+
|
|
10
|
+
from .globals_ import OUTPUT_FOLDER, CURRENT_DATETIME
|
|
11
|
+
|
|
12
|
+
class Map(dict):
    """
    Dictionary subclass whose items can also be read, written and deleted using
    dot notation, which makes it easier to access the individual elements and
    parameters of the config files.

    The dict itself is the single source of truth: attribute access is routed to
    the dict items rather than to a second copy held on the instance, so the two
    views cannot drift apart after calls such as pop(), update() or clear().

    Notes
    =====
    - Reading a missing key via dot notation returns None (same as dict.get).
    - Nested dicts are stored unchanged (they are not converted to Map), so dot
      notation only applies to the top level unless the nested dict is wrapped
      in Map explicitly.
    - Names of existing dict attributes/methods (e.g. 'items', 'keys') always
      resolve to the dict method when read with dot notation; use m['items']
      to reach an item stored under such a key.
    - Double-underscore names are never resolved as keys; an AttributeError is
      raised instead so protocol probes (copy, pickle, hasattr) behave normally.

    Parameters
    ==========
    :*args: dict/iterable
        same positional input accepted by the built-in dict constructor.
    :**kwargs:
        additional key/value pairs to add to the mapping.

    Usage
    =====
    m = Map({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    # Add new key
    m.new_key = 'Hello world!'
    # Or
    m['new_key'] = 'Hello world!'
    print(m.new_key)
    print(m['new_key'])
    # Update values
    m.new_key = 'Yay!'
    # Or
    m['new_key'] = 'Yay!'
    # Delete key
    del m.new_key
    # Or
    del m['new_key']

    References
    ----------
    [1] https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """
    def __init__(self, *args, **kwargs):
        #the dict constructor already stores every positional/keyword item
        super().__init__(*args, **kwargs)

    def __getattr__(self, attr):
        #only called when normal attribute lookup fails; dunder names must raise
        #so that hasattr()/copy/pickle do not receive a non-callable None
        if (attr.startswith('__') and attr.endswith('__')):
            raise AttributeError(attr)
        return self.get(attr)

    def __setattr__(self, key, value):
        self[key] = value

    def __delattr__(self, item):
        #raises KeyError if the item does not exist, same as del m[item]
        del self[item]
|
|
72
|
+
|
|
73
|
+
def valid_sequence(sequences):
    """
    Function that iterates through all protein sequences and validates that
    each sequence is made up of valid canonical amino acid letters (the gap
    character '-' is also accepted). If no invalid values are found then None
    is returned. Otherwise one dict per invalid value is appended to a list and
    the list is returned. In the output, the sequence reference is not zero
    indexed so the first sequence is 'Sequence #1', whereas the position of the
    invalid value within its sequence is zero indexed.

    Sequences are visited in positional order by iterating over the input, so
    a pd.Series with a non-default index is handled the same as a list.

    Parameters
    ==========
    :sequences: str/list/np.ndarray/pd.Series
        single protein sequence or list/array/Series of protein sequences.

    Returns
    =======
    :None or invalid_indices: None/list
        if no invalid values found in the protein sequences, None returned. If
        invalid values found, list of dicts returned in the form
        {'Sequence #<sequence number>': '(<invalid value> at index #<position>)'}.

    Usage
    -----
    seq = ["ACDEF", "GHIKLM", "ABCDE"]
    seq_check = valid_sequence(seq)
    #[{'Sequence #3': '(B at index #1)'}]
    """
    #if input is string, cast to a list so it is iterable
    if (isinstance(sequences, str)):
        sequences = [sequences]

    #valid canonical amino acid letters
    valid_amino_acids = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
        'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-')
    invalid_indices = []

    #iterate through all sequences by position, recording every value that is
    #not a valid amino acid letter together with where it was found
    for seq_number, seq in enumerate(sequences, start=1):
        for aa_index, aa in enumerate(seq):
            if (aa not in valid_amino_acids):
                invalid_indices.append(
                    {f'Sequence #{seq_number}': f'({aa} at index #{aa_index})'})

    #if no invalid values found in sequences return None, else return list of
    #dicts containing invalid index and invalid values
    return invalid_indices or None
|
|
121
|
+
|
|
122
|
+
def remove_gaps(sequences):
    """
    Function that removes any gaps ('-') from the protein sequences in the input.
    The descriptors cannot be calculated if a '-' value is passed into their
    respective functions so gaps need to be removed.

    The handling depends on the input type:
    - str: gaps are removed and a str is returned.
    - pd.Series: gaps are removed from each element independently and a Series
      with a fresh 0..N-1 index is returned.
    - any other iterable (list/np.ndarray): the elements are concatenated into
      ONE sequence, all gaps are removed from it and a single-element list
      containing that sequence is returned.

    Parameters
    ==========
    :sequences: str/list/np.ndarray/pd.Series
        string of 1 protein sequence or array/list/Series of protein sequences.

    Returns
    =======
    :protein_seqs: str/pd.Series/list
        the inputted protein sequence(s) with any gaps ('-') removed, in the
        form described above for each input type.
    """
    #string input: remove gaps and return as string
    if isinstance(sequences, str):
        return sequences.replace("-", "")

    #pd.Series input: process each element independently using vectorised str.replace
    if isinstance(sequences, pd.Series):
        return sequences.str.replace("-", "", regex=False).reset_index(drop=True)

    #list/array input: join elements into a single sequence, then strip gaps from
    #the joined string so gaps inside multi-character elements are removed as well
    cleaned = ''.join(str(c) for c in sequences).replace("-", "")
    return [cleaned]
|
|
151
|
+
|
|
152
|
+
def flatten(array):
    """
    Flatten a list of lists or array of arrays by one level, concatenating the
    items of every sub-sequence in order. A string input is returned unchanged
    as it cannot be flattened.

    Parameters
    ==========
    :array: np.ndarray/list
        array of arrays or list of lists to be flattened.

    Returns
    =======
    :flattened_array: np.ndarray/list
        for np.ndarray input, a numpy array of the flattened items reshaped to
        a single column of shape (-1, 1); for any other input, a flat list.

    Raises
    ======
    :TypeError:
        if the elements of the input cannot be iterated over.
    """
    #strings are handed back untouched
    if isinstance(array, str):
        return array

    #collect the items of each sub-sequence, in order, into one list
    try:
        collected = []
        for sublist in array:
            collected.extend(sublist)
    except (TypeError, ValueError):
        raise TypeError(f'Error flattening array of type: {type(array)} and size {len(array)}.')

    #non-numpy input is returned as a plain list
    if not isinstance(array, np.ndarray):
        return collected

    #numpy input is returned as a single-column numpy array
    return np.array(collected).reshape([-1, 1])
|
|
188
|
+
|
|
189
|
+
def zero_padding(sequences):
    """
    Pad sequences in input with 0's such that every sequence is of the same length
    as the longest sequence in the input.

    The type of the first sequence selects the padding strategy: string sequences
    are right-padded with the '0' character, any other sequences are converted to
    lists and extended with the integer 0. If the input is empty, holds a single
    sequence, or all sequences already share the same length, the input object
    itself is returned unchanged.

    Parameters
    ==========
    :sequences: np.ndarray/list/pd.Series
        array, list or Series of encoded protein sequences.

    Returns
    =======
    :sequences: np.ndarray/list/pd.Series
        input sequences but with every sequence now zero padded to be the same
        length. String sequences keep the container type of the input
        (Series/array/list); non-string sequences are returned as a numpy
        object array for np.ndarray input and as a list of lists otherwise.
    """
    #no need to zero-pad if the input is empty or only one sequence is passed in
    if (len(sequences) <= 1):
        return sequences

    is_series = isinstance(sequences, pd.Series)
    is_ndarray = isinstance(sequences, np.ndarray)

    #iterating yields the values positionally for lists, arrays and Series alike
    items = list(sequences)

    #get maximum length of all sequences
    max_len = max(len(item) for item in items)

    #nothing to do if no sequence is shorter than max_len
    if all(len(item) == max_len for item in items):
        return sequences

    #determine element type to choose padding strategy
    if isinstance(items[0], str):
        #string sequences: pad with '0' character
        if is_series:
            return sequences.str.ljust(max_len, '0')
        padded = [item.ljust(max_len, '0') if len(item) < max_len else item
            for item in items]
        return np.array(padded, dtype=sequences.dtype) if is_ndarray else padded

    #list/array sequences: extend shorter sequences with zeros
    padded = [list(item) + [0] * (max_len - len(item)) for item in items]
    return np.array(padded, dtype=object) if is_ndarray else padded
|
|
239
|
+
|
|
240
|
+
def save_results(results, file_name, output_folder=""):
    """
    Save object DataFrame/Series containing metric names and their values captured from
    the encoding process. Save the results in this object to a CSV file named according
    to the file_name input parameter. Function can also accept a dict of results, which
    is written as a header row of its keys followed by one row of its values.

    The results object is validated before anything is created on disk, and a
    DataFrame/Series passed in is not modified: its index is dropped only in the
    copy that gets written to the CSV.

    Parameters
    ==========
    :results: dict/pd.DataFrame/pd.Series
        object of the metrics and results from the encoding process. Ideally should
        be a dataframe/series but function also accepts a dict of results.
    :file_name: str
        file name to call results file. The '.csv' extension is appended if the
        name has no extension.
    :output_folder: str (default="")
        folder to store the results file in. If empty, the module-level
        OUTPUT_FOLDER is used as is; otherwise '_' + CURRENT_DATETIME is
        appended to the given folder name. The folder is created if needed.

    Returns
    =======
    None

    Raises
    ======
    :TypeError:
        if results is not a dict, pd.DataFrame or pd.Series.
    """
    #validate input type first so no folder/file is created for unsupported input
    if not isinstance(results, (dict, pd.DataFrame, pd.Series)):
        raise TypeError(f'Results object must be of type: dict, pd.Series or pd.DataFrame, got object of type {type(results)}.')

    #append extension if not in file name
    if (os.path.splitext(file_name)[1] == ""):
        file_name = file_name + '.csv'

    #set output folder to default (already timestamped) or append timestamp to custom folder
    if not output_folder:
        output_folder = OUTPUT_FOLDER
    else:
        output_folder = output_folder + '_' + CURRENT_DATETIME

    #create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)

    if (isinstance(results, dict)):
        #newline='' lets the csv module control line endings, as its docs require
        with open(output_path, 'w', newline='') as f:
            w = csv.DictWriter(f, results.keys())
            w.writeheader()
            w.writerow(results)
    else:
        #reset the index on a copy so the caller's DataFrame/Series is left untouched
        results.reset_index(drop=True).to_csv(output_path)
|