PySAR 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pySAR/utils.py ADDED
@@ -0,0 +1,283 @@
1
+ ################################################################################
2
+ ################# Utilities Module ##################
3
+ ################################################################################
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
8
+ import csv
9
+
10
+ from .globals_ import OUTPUT_FOLDER, CURRENT_DATETIME
11
+
12
class Map(dict):
    """
    Dictionary subclass whose top-level keys can also be read, written and
    deleted using dot notation, which makes it easier to access the individual
    elements and parameters of the config files. Nested plain dicts are stored
    unchanged, they are not converted into Map instances, so only the first
    level of keys is reachable through dot notation.

    Every key is mirrored into the instance __dict__, and all mutating dict
    methods are routed through that mirroring so that attribute access and item
    access stay in sync. Reading a missing attribute returns None instead of
    raising, except for dunder names (e.g. __deepcopy__) which raise
    AttributeError so that protocol lookups done by copy/pickle behave normally.

    Parameters
    ==========
    :*args: dict/iterable of key-value pairs
        input dictionary (or anything accepted by dict()) to be mapped into
        dot notation.
    :**kwargs: any
        additional key/value pairs to add to the mapping.

    Usage
    =====
    m = Map({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    # Add new key
    m.new_key = 'Hello world!'
    # Or
    m['new_key'] = 'Hello world!'
    print(m.new_key)
    print(m['new_key'])
    # Update values
    m.new_key = 'Yay!'
    # Or
    m['new_key'] = 'Yay!'
    # Delete key
    del m.new_key
    # Or
    del m['new_key']

    References
    ----------
    [1] https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        #dict.__init__ does not go through __setitem__, so mirror every stored
        #item (dict args, iterables of pairs and kwargs alike) explicitly
        self.__dict__.update(dict.items(self))

    def __getattr__(self, attr):
        #only called when normal attribute lookup fails; dunder probes such as
        #__deepcopy__ or __setstate__ must fail properly rather than return None
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        return self.get(attr)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self.__dict__[key] = value

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super().__delitem__(key)
        #tolerate keys that were never mirrored so the delete cannot half-fail
        self.__dict__.pop(key, None)

    def update(self, *args, **kwargs):
        #route through __setitem__ so the attribute mirror is refreshed too
        for key, value in dict(*args, **kwargs).items():
            self[key] = value

    def setdefault(self, key, default=None):
        if key not in self:
            self[key] = default
        return self[key]

    def pop(self, key, *default):
        self.__dict__.pop(key, None)
        return super().pop(key, *default)

    def popitem(self):
        key, value = super().popitem()
        self.__dict__.pop(key, None)
        return key, value

    def clear(self):
        super().clear()
        self.__dict__.clear()
72
+
73
def valid_sequence(sequences):
    """
    Validate that every protein sequence is made up only of the 20 canonical
    amino acid letters or the gap character '-'. The check is case sensitive.
    If no invalid values are found then None is returned. Otherwise one dict
    is returned per invalid value, holding the sequence number and the position
    of the value within that sequence. The sequence number is 1-indexed (the
    first sequence is #1) whereas the position within the sequence is 0-indexed.

    Parameters
    ==========
    :sequences: str/list/np.ndarray/pd.Series
        single protein sequence or list/array/Series of protein sequences.

    Returns
    =======
    :None or invalid_indices: None/list
        if no invalid values found in the protein sequences, None returned. If
        invalid values found, list of dicts returned in the form
        {'Sequence #<n>': '(<invalid value> at index #<i>)'}.

    Usage
    -----
    seq = ["ACDEF", "GHIKLM", "ABCDE"]
    seq_check = valid_sequence(seq)
    #[{'Sequence #3': '(B at index #1)'}]
    """
    #if input is string, wrap in a list so it is treated as one sequence
    if isinstance(sequences, str):
        sequences = [sequences]

    #valid canonical amino acid letters plus the gap character
    valid_amino_acids = frozenset('ACDEFGHIKLMNPQRSTVWY-')
    invalid_indices = []

    #iterate by position rather than by sequences[i] so that inputs whose
    #indexing is label based (e.g. a pd.Series with a custom index) also work
    for seq_number, seq in enumerate(sequences, start=1):
        for aa_index, aa in enumerate(seq):
            if aa not in valid_amino_acids:
                invalid_indices.append(
                    {f'Sequence #{seq_number}': f'({aa} at index #{aa_index})'})

    #if no invalid values found in sequences return None, else return list of
    #dicts containing invalid index and invalid values
    return invalid_indices or None
121
+
122
def remove_gaps(sequences):
    """
    Remove any gaps ('-') from the protein sequence(s) in the input. The
    descriptors cannot be calculated if a '-' value is passed into their
    respective functions so gaps need to be removed.

    The handling depends on the input type:
    - str: the gaps are removed and a str is returned.
    - pd.Series: gaps are removed from each element independently and the
      result is returned as a Series with a fresh 0..n-1 index.
    - any other iterable (list/np.ndarray): the elements are treated as the
      pieces of ONE sequence; each element is converted to str, stripped of
      gaps and the pieces are joined into a single string that is returned
      inside a one-element list.

    Parameters
    ==========
    :sequences: str/pd.Series/list/np.ndarray
        string of 1 protein sequence, Series of protein sequences or list/array
        holding the pieces (e.g. individual letters) of one protein sequence.

    Returns
    =======
    :protein_seqs: str/pd.Series/list
        the inputted protein sequence(s) with any gaps ('-') removed, in the
        type described above for each kind of input.
    """
    #string input: remove gaps and return as string
    if isinstance(sequences, str):
        return sequences.replace("-", "")

    #pd.Series input: process each element independently using vectorised str.replace
    if isinstance(sequences, pd.Series):
        return sequences.str.replace("-", "", regex=False).reset_index(drop=True)

    #list/array input: strip gaps from every piece (not only pieces that are
    #exactly '-') so that no gap can survive inside a multi-character element
    cleaned = ''.join(str(piece).replace('-', '') for piece in sequences)
    return [cleaned]
151
+
152
def flatten(array):
    """
    Flatten a list of lists or array of arrays by one level. Every element of
    the input must itself be iterable. A list (or other non-numpy) input is
    returned as a flat list of M * N items; a numpy array input is returned as
    a numpy column vector of shape (M * N, 1), where M = len(array) and
    N = len(array[0]). The flattened output can then be reshaped into the
    required shape and format. A string input is returned unchanged.

    Parameters
    ==========
    :array: np.ndarray/list
        array of arrays or list of lists to be flattened.

    Returns
    =======
    :flattened_array: np.ndarray/list/str
        flat list, numpy array of shape (M * N, 1), or the unchanged input
        string.

    Raises
    ======
    :TypeError:
        if the input, or any of its elements, cannot be iterated over.
    """
    #if input is a string then return input as cannot be flattened
    if isinstance(array, str):
        return array

    #flatten array/list by one level
    try:
        flattened_array = [item for sublist in array for item in sublist]
    except (TypeError, ValueError) as err:
        #len() itself fails on unsized inputs (ints, generators, 0-d arrays),
        #which would otherwise replace this message with an unrelated one
        try:
            size = len(array)
        except TypeError:
            size = 'unknown'
        raise TypeError(
            f'Error flattening array of type: {type(array)} and size {size}.') from err

    #if input is a numpy array then reshape to a column vector else return list
    if isinstance(array, np.ndarray):
        return np.array(flattened_array).reshape([-1, 1])
    return flattened_array
188
+
189
def zero_padding(sequences):
    """
    Pad sequences in input with 0's such that every sequence is of the same
    length as the longest sequence in the input. String sequences are right
    padded with the '0' character; any other sequence type is converted to a
    list and extended with the integer 0. The padding strategy is chosen from
    the type of the first sequence.

    An empty input, a single sequence, or an input in which all sequences are
    already the same length is returned unchanged (the very same object).

    Parameters
    ==========
    :sequences: np.ndarray/list/pd.Series
        array, list or Series of encoded protein sequences.

    Returns
    =======
    :sequences: np.ndarray/list/pd.Series
        input sequences but with every sequence now zero padded to be the same
        length. String sequences keep the container type of the input (Series,
        array with the input dtype, or list). Non-string sequences are returned
        as an object array for array input and as a list of lists otherwise.
    """
    #nothing to pad if there are no sequences or only one sequence passed in
    if len(sequences) <= 1:
        return sequences

    #get maximum length of all sequences
    max_len = max(len(seq) for seq in sequences)

    #nothing to pad if no sequence is shorter than max_len
    if all(len(seq) >= max_len for seq in sequences):
        return sequences

    is_series = isinstance(sequences, pd.Series)
    is_ndarray = isinstance(sequences, np.ndarray)

    #determine element type to choose padding strategy; positional access is
    #needed for a Series as its [] indexing can be label based
    first_elem = sequences.iloc[0] if is_series else sequences[0]

    if isinstance(first_elem, str):
        #string sequences: pad with '0' character
        if is_series:
            return sequences.str.ljust(max_len, '0')
        padded = [seq if len(seq) >= max_len else seq.ljust(max_len, '0')
                  for seq in sequences]
        return np.array(padded, dtype=sequences.dtype) if is_ndarray else padded

    #list/array sequences: copy each into a new list and extend with zeros
    padded = [list(seq) + [0] * (max_len - len(seq)) for seq in sequences]
    return np.array(padded, dtype=object) if is_ndarray else padded
239
+
240
def save_results(results, file_name, output_folder=""):
    """
    Save object DataFrame/Series containing metric names and their values captured from
    the encoding process. Save the results in this object to a CSV file named according
    to the file_name input parameter. Function can also accept a dict of results, which
    is written as a header row of its keys followed by one row of its values.

    The type of the results object is validated before anything is created on disk.
    A DataFrame/Series is written with its index reset to 0..n-1; the reset is done
    on a copy so the caller's object is left untouched.

    Parameters
    ==========
    :results: dict/pd.DataFrame/pd.Series
        object of the metrics and results from the encoding process. Ideally should
        be a dataframe/series but function also accepts a dict of results.
    :file_name: str
        file name to call results file. The '.csv' extension is appended if the
        name has no extension.
    :output_folder: str (default="")
        folder to store the results file in, created if it doesn't exist. If empty,
        the OUTPUT_FOLDER global is used as is; otherwise the CURRENT_DATETIME
        global is appended to the given folder name, separated by an underscore.

    Returns
    =======
    None

    Raises
    ======
    :TypeError:
        if results is not a dict, pd.DataFrame or pd.Series.
    """
    #validate input type up front so an invalid call has no side effects on disk
    if not isinstance(results, (dict, pd.DataFrame, pd.Series)):
        raise TypeError(f'Results object must be of type: dict, pd.Series or pd.DataFrame, got object of type {type(results)}.')

    #append extension if not in file name
    if os.path.splitext(file_name)[1] == "":
        file_name = file_name + '.csv'

    #set output folder to default (already timestamped) or append timestamp to custom folder
    if not output_folder:
        output_folder = OUTPUT_FOLDER
    else:
        output_folder = output_folder + '_' + CURRENT_DATETIME

    #create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)

    if isinstance(results, dict):
        #newline='' is required by the csv module, otherwise extra blank rows
        #are written on platforms that translate line endings
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            w = csv.DictWriter(f, results.keys())
            w.writeheader()
            w.writerow(results)
    else:
        #reset the index on a copy rather than in place on the caller's object
        results.reset_index(drop=True).to_csv(output_path)