PySAR 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +53 -0
- pySAR/__init__.py +28 -0
- pySAR/descriptors.py +2893 -0
- pySAR/encoding.py +986 -0
- pySAR/evaluate.py +231 -0
- pySAR/globals_.py +21 -0
- pySAR/model.py +559 -0
- pySAR/plots.py +92 -0
- pySAR/py.typed +0 -0
- pySAR/pyDSP.py +582 -0
- pySAR/pySAR.py +962 -0
- pySAR/utils.py +283 -0
- pysar-2.5.0.dist-info/METADATA +740 -0
- pysar-2.5.0.dist-info/RECORD +17 -0
- pysar-2.5.0.dist-info/WHEEL +5 -0
- pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
- pysar-2.5.0.dist-info/top_level.txt +2 -0
pySAR/pyDSP.py
ADDED
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# Protein DSP #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from difflib import get_close_matches
|
|
7
|
+
import inspect
|
|
8
|
+
import os
|
|
9
|
+
from scipy.signal import savgol_filter, medfilt, lfilter, hilbert
|
|
10
|
+
from scipy.signal.windows import blackman, hann, hamming, bartlett, blackmanharris, \
|
|
11
|
+
kaiser, gaussian, barthann, bohman, chebwin, cosine, exponential, boxcar, \
|
|
12
|
+
flattop, nuttall, parzen, tukey, triang
|
|
13
|
+
try:
|
|
14
|
+
from scipy.fft import fft
|
|
15
|
+
except ImportError:
|
|
16
|
+
from numpy.fft import fft
|
|
17
|
+
import warnings
|
|
18
|
+
import json
|
|
19
|
+
from .utils import Map, zero_padding
|
|
20
|
+
|
|
21
|
+
class PyDSP():
    """
    Transform protein sequences into their spectral form via a Discrete Fourier Transform (DFT) using
    the Fast Fourier Transform (FFT) algorithm. Fourier analysis is fundamentally a method for
    expressing a function as a sum of periodic components, and for recovering the function from those
    components. When both the function and its Fourier transform are replaced with discretized
    counterparts, it is called the Discrete Fourier transform (DFT). An implementation algorithm
    for the DFT is known as the FFT, which is used here. From the FFT transformations on the
    encoded protein sequences (encoded via amino acid property values from records in the AAI),
    various informational protein spectra can be generated, including the power, real, imaginary and
    absolute spectra. Prior to the FFT, a window function can be applied to the sequences
    which is a mathematical function that applies a weighting to each discrete time series sample
    in a finite set. By default, no window function is applied; although the class can also
    accept the hamming, blackman, blackmanharris, gaussian, bartlett, kaiser, barthann, bohman,
    chebwin, cosine, exponential, flattop, hann, boxcar, nuttall, parzen, triang and tukey windows.
    A filter function can also be applied, the class accepts the savgol, medfilt, lfilter and
    hilbert filters, by default no filter function is applied.

    In the pipeline of pySAR this class and its functions are only used when the 'use_dsp'
    parameter is set to true in the config files or in the class input parameters, meaning that
    the encoded protein sequences are passed through a Digital Signal Processing (DSP) pipeline
    before being used as training data for the regression models. The protein sequences being
    numerically encoded is a pre-requisite to use the functions in this class, meaning sequences
    cannot be directly input.

    The class accepts two main input parameters. The protein_seqs input param is an array
    of numerically pre-encoded protein sequences. The config_file parameter is the filename
    or filepath to the configuration file (or a dict) that contains the parameters for the
    DSP encoding strategy/process. The class also accepts a variable number of keyword arguments
    (**kwargs) that will override the config file parameter values of the same name if
    they are passed in. The only DSP parameter required for the class's functionality is the
    spectrum, so if the config_file parameter is not specified but the spectrum is passed in
    then an error will not be raised.

    Parameters
    ==========
    :config_file (str/dict/None)
        path to a JSON configuration file containing DSP parameters (looked up as given and
        then inside a 'config' directory), OR a dict of parameters. None or "" means no config.
    :protein_seqs (np.ndarray)
        array of pre-encoded protein sequences. Class accepts only numerically encoded protein
        sequences, not in amino acid form.
    **kwargs: dict
        recognised keywords are dsp_parameters, spectrum, window_parameters, window_type,
        filter_parameters and filter_type. Values passed here take precedence over those
        in the config.

    Raises
    ======
    TypeError
        config_file is not a str, dict or None.
    OSError
        config_file is a non-empty string that does not resolve to an existing file.
    ValueError
        config file is not valid JSON, protein_seqs is None or contains strings, or the
        spectrum is missing/unrecognised.

    Methods
    =======
    pre_processing():
        complete required pre-processing steps before DSP functionality/pipeline.
    encode_sequences():
        calculate FFT and various informational spectra of protein sequences.
    inverse_fft():
        calculate inverse FFT of protein sequences.
    consensus_freq():
        calculate consensus frequency of FFT.
    max_freq():
        calculate max frequency of FFT
    """
    #accepted window names mapped to their scipy.signal.windows implementation
    _WINDOW_FUNCTIONS = {
        'hamming': hamming, 'blackman': blackman, 'blackmanharris': blackmanharris,
        'gaussian': gaussian, 'bartlett': bartlett, 'kaiser': kaiser,
        'barthann': barthann, 'bohman': bohman, 'chebwin': chebwin, 'cosine': cosine,
        'exponential': exponential, 'flattop': flattop, 'hann': hann, 'boxcar': boxcar,
        'nuttall': nuttall, 'parzen': parzen, 'triang': triang, 'tukey': tukey,
    }

    #fallback values for window arguments that scipy requires but the config may omit
    _WINDOW_DEFAULTS = {'gaussian': {'std': 7}, 'chebwin': {'at': 100}}

    #accepted filter names mapped to (scipy.signal function, name of its data argument);
    #the data argument is always supplied by this class so it is never taken from config
    _FILTER_FUNCTIONS = {
        'savgol': (savgol_filter, 'x'),
        'medfilt': (medfilt, 'volume'),
        'lfilter': (lfilter, 'x'),
        'hilbert': (hilbert, 'x'),
    }

    def __init__(self, config_file="", protein_seqs=None, **kwargs):

        self.protein_seqs = protein_seqs
        self.config_file = config_file
        self.config_parameters = {}

        config_filepath = ""

        #config must be a filepath string, a dict of parameters or None
        if not (isinstance(config_file, (str, dict)) or (config_file is None)):
            raise TypeError(f'JSON config must be a filepath of type string or a dict of parameters, got type {type(config_file)}.')

        #support config passed as dict, filepath, or omitted (kwargs can still define DSP params)
        if isinstance(config_file, dict):
            self.config_parameters = config_file
        elif config_file in (None, ""):
            self.config_parameters = {}
        elif os.path.isfile(self.config_file):
            config_filepath = self.config_file
        elif os.path.isfile(os.path.join('config', self.config_file)):
            config_filepath = os.path.join('config', self.config_file)
        else:
            #report the path that was actually requested; config_filepath is still empty here
            raise OSError(f'JSON config file not found at path: {self.config_file}.')

        if config_filepath:
            try:
                #open config file and parse parameters
                with open(config_filepath) as f:
                    self.config_parameters = json.load(f)
            except json.JSONDecodeError as exc:
                raise ValueError(f'Error parsing config JSON file: {config_filepath}.') from exc

        #create instance of Map class so parameters in config can be accessed via dot notation
        self.config_parameters = Map(self.config_parameters)

        #raise error if protein sequences parameter is not set
        if (self.protein_seqs is None):
            raise ValueError('Protein sequences input parameter cannot be empty or None.')

        #direct protein sequences cannot be input to class, they must be encoded first, raise error if so
        if any(isinstance(seq, str) for seq in protein_seqs):
            raise ValueError("Protein sequences cannot be directly passed into the pyDSP class, you "
                "must first encode the protein sequences using a specific aaindex record, "
                "and then pass the resultant encoded sequence to the protein_seqs parameter.")

        #set pyDSP parameters from kwargs or json config - spectrum, window function, filter;
        #a keyword that is present always wins, even when its value is None
        default_dsp_parameters = getattr(self.config_parameters, 'pyDSP', {}) or {}
        self.dsp_parameters = kwargs.get('dsp_parameters') if 'dsp_parameters' in kwargs else default_dsp_parameters
        if self.dsp_parameters is None:
            self.dsp_parameters = {}

        #allow spectrum-only usage when config is omitted, per class docstring
        self.spectrum = kwargs.get('spectrum') if 'spectrum' in kwargs else self.dsp_parameters.get("spectrum")
        self.window_parameters = kwargs.get('window_parameters') if 'window_parameters' in kwargs else self.dsp_parameters.get("window", {})
        if self.window_parameters is None:
            self.window_parameters = {}
        self.window = None
        self.filter_parameters = kwargs.get('filter_parameters') if 'filter_parameters' in kwargs else self.dsp_parameters.get("filter", {})
        if self.filter_parameters is None:
            self.filter_parameters = {}
        self.filter = None
        self.window_type = kwargs.get('window_type') if 'window_type' in kwargs else self.window_parameters.get("type")
        self.filter_type = kwargs.get('filter_type') if 'filter_type' in kwargs else self.filter_parameters.get("type")

        #pre-processing of encoded protein sequences
        self.pre_processing()

        #transform sequences into the various informational protein spectra
        self.encode_sequences()

    def pre_processing(self):
        """
        Complete various pre-processing steps for encoded protein sequences before
        doing any of the DSP-related functions or transformations. Zero-pad the
        sequences, replace any +/- infinity or NAN values with 0, and resolve the
        approximate spectrum, window and filter names given by the user to the
        closest accepted name.

        Parameters
        ==========
        None

        Returns
        =======
        None

        Raises
        ======
        ValueError
            spectrum is None or has no close match in the accepted spectra.
        """
        #zero-pad encoded sequences so they are all the same length
        self.protein_seqs = zero_padding(self.protein_seqs)

        #get shape parameters of proteins seqs
        self.num_seqs = self.protein_seqs.shape[0]
        self.signal_len = self.protein_seqs.shape[1]

        #replace any positive/negative infinity or NAN values with 0
        self.protein_seqs = np.nan_to_num(self.protein_seqs, nan=0.0, posinf=0.0, neginf=0.0)

        #initialise zeros arrays to store all protein spectra
        spectra_shape = (self.num_seqs, self.signal_len)
        self.fft_power = np.zeros(spectra_shape)
        self.fft_real = np.zeros(spectra_shape)
        self.fft_imag = np.zeros(spectra_shape)
        self.fft_abs = np.zeros(spectra_shape)

        #lists of accepted spectra and filters
        all_spectra = ['power', 'absolute', 'real', 'imaginary']
        all_filters = list(self._FILTER_FUNCTIONS)

        #get approximate spectrum type from input, raise error if spectrum None or invalid
        if (self.spectrum is None):
            raise ValueError('Spectrum parameter cannot be empty of None.')
        spectra_matches = get_close_matches(self.spectrum, all_spectra, cutoff=0.4)
        if not spectra_matches:
            raise ValueError(f'Invalid input spectrum type {self.spectrum}, not available in list of available spectra:\n{all_spectra}.')
        self.spectrum = spectra_matches[0] #closest match in array

        #build the window array, or 1 (no window) if the type is None or unrecognised
        self.window = self._build_window()

        #normalise the filter type to its closest accepted name; an unmatched name is
        #left as-is and results in no filter being applied in encode_sequences
        if ((self.filter_type is not None) and (self.filter_type != "")):
            filter_matches = get_close_matches(self.filter_type, all_filters, cutoff=0.4)
            if filter_matches:
                self.filter_type = filter_matches[0]

    def _build_window(self):
        """
        Resolve the window_type attribute to a scipy window function and evaluate it
        for the current signal length.

        Parameters
        ==========
        None

        Returns
        =======
        :window: np.ndarray or int
            window values of length signal_len, or 1 (equivalent to no window) if
            window_type is None or has no close match in the accepted windows.
        """
        if (self.window_type is None):
            return 1 #window = 1 is the same as applying no window

        #get closest correct window function from user input
        window_matches = get_close_matches(self.window_type, list(self._WINDOW_FUNCTIONS), cutoff=0.6)

        #remove any null or None values from window parameters in config
        self.window_parameters = {k: v for k, v in (self.window_parameters or {}).items() if v is not None}

        if not window_matches:
            return 1 #window = 1 is the same as applying no window

        window_name = window_matches[0]
        window_func = self._WINDOW_FUNCTIONS[window_name]

        #only forward config keys the window function accepts; the length argument M is
        #always the signal length so it is never taken from the config
        accepted = set(inspect.getfullargspec(window_func).args) - {'M'}

        #start from the defaults so a config value (e.g. std, at) overrides them
        #rather than colliding with a hard-coded keyword
        window_kwargs = dict(self._WINDOW_DEFAULTS.get(window_name, {}))
        window_kwargs.update({k: v for k, v in self.window_parameters.items() if k in accepted})

        #NOTE: kaiser has no default for its 'beta' argument here, so scipy raises a
        #TypeError if the config does not provide it
        window = window_func(self.signal_len, **window_kwargs)
        self.window_type = window_name
        return window

    def encode_sequences(self):
        """
        Calculate the DFT of the protein sequences already encoded using
        the AAI indices, using the FFT algorithm, then use the output of the
        FFT to calculate the various informational protein spectra including
        the power, absolute, real and imaginary. Each sequence is multiplied by
        the window before the FFT, and the filter (if any) is applied to the FFT
        output of each sequence. The spectrum_encoding attribute is set to the
        spectrum named by the spectrum attribute.

        Parameters
        ==========
        None

        Returns
        =======
        None

        Raises
        ======
        ValueError
            spectrum attribute is not one of power, absolute, real or imaginary.
        """
        #initialise zero arrays used to store output of the fft, set datatype
        #to complex number as that is the output type of the FFT transformation
        encoded_dataset_fft = np.zeros((self.protein_seqs.shape), dtype=complex)

        #initialise zero arrays used to store output frequencies from fft transformations
        encoded_freqs_fft = np.zeros(self.protein_seqs.shape)

        #resolve the filter once, it does not depend on the individual sequence
        filtering = (self.filter_type is not None) and (self.filter_type != "")
        apply_filter = self._build_filter() if filtering else None

        #iterate through each sequence, applying the FFT to each
        for seq in range(0, self.num_seqs):

            #apply window function to sequence, multiply by 1 if using no window function
            with warnings.catch_warnings():
                warnings.simplefilter(action='ignore', category=FutureWarning)
                encoded_fft = fft(self.protein_seqs[seq] * self.window)

            #apply filter to the FFT output if filter_type not empty
            #NOTE(review): the filter runs on the complex FFT output rather than on the
            #signal before the transform - confirm this is the intended order
            if filtering:
                self.filter = apply_filter(encoded_fft) if apply_filter is not None else None

                #use filtered signal when available
                if self.filter is not None:
                    encoded_fft = self.filter

            #store transformation and its frequencies for the current sequence
            encoded_dataset_fft[seq] = encoded_fft
            encoded_freqs_fft[seq] = np.fft.fftfreq(encoded_fft.size)

        #set FFT sequences and frequencies class attributes
        self.fft = encoded_dataset_fft
        self.fft_freqs = encoded_freqs_fft

        #get individual spectral values, calculated from the FFT transformations
        self.fft_abs = abs(self.fft/self.signal_len)
        #NOTE(review): the 'power' spectrum is the FFT magnitude, it is not squared
        self.fft_power = np.abs(self.fft)
        self.fft_real = self.fft.real
        self.fft_imag = self.fft.imag

        #set the spectrum_encoding attribute to the spectra specified by 'spectrum' attribute
        spectra = {
            'power': self.fft_power,
            'real': self.fft_real,
            'imaginary': self.fft_imag,
            'absolute': self.fft_abs,
        }
        if self.spectrum not in spectra:
            raise ValueError(f'Invalid input spectrum type {self.spectrum}, not available in list of available spectra:\n{list(spectra)}.')
        self.spectrum_encoding = spectra[self.spectrum]

    def _build_filter(self):
        """
        Resolve the filter_type attribute to a callable that applies that filter,
        with the matching config parameters, to a single signal.

        Parameters
        ==========
        None

        Returns
        =======
        :apply_filter: callable or None
            function taking one array and returning the filtered array. None if
            filter_type is not an accepted filter name, or if the lfilter
            coefficients 'b' and 'a' are not both present in the filter parameters.
        """
        #remove any null or None values from filter parameters in config
        self.filter_parameters = {k: v for k, v in (self.filter_parameters or {}).items() if v is not None}

        if self.filter_type not in self._FILTER_FUNCTIONS:
            return None #no filter

        filter_func, data_arg = self._FILTER_FUNCTIONS[self.filter_type]

        #only forward config keys the filter accepts; the data argument is the FFT
        #output supplied by encode_sequences so it is never taken from the config
        accepted = set(inspect.getfullargspec(filter_func).args) - {data_arg}
        filter_kwargs = {k: v for k, v in self.filter_parameters.items() if k in accepted}

        if self.filter_type == 'lfilter':
            #lfilter takes its coefficients positionally, ahead of the signal
            b = filter_kwargs.pop('b', None)
            a = filter_kwargs.pop('a', None)
            if b is None or a is None:
                return None
            return lambda signal: lfilter(b, a, signal, **filter_kwargs)

        return lambda signal: filter_func(signal, **filter_kwargs)

    def inverse_fft(self, a, n):
        """
        Get the inverse Fourier Transform of FFT. The result is also stored
        in the inv_fft attribute.

        Parameters
        ==========
        :a: np.ndarray
            input array of 1D Fourier Transform.
        :n: int
            length of the output.

        Returns
        =======
        :inv_fft : np.ndarray
            array of inverse Fourier Transform.
        """
        self.inv_fft = np.fft.ifft(a, n)
        return self.inv_fft

    def consensus_freq(self, freqs):
        """
        Get the Consensus Frequency from Fourier Transform of encoded protein sequences.

        Parameters
        ==========
        :freqs: np.ndarray
            frequencies of Fourier Transform, for one sequence (1-dimensional).

        Returns
        =======
        :CF: float
            maximum value in freqs divided by the number of sequences (num_seqs).

        Raises
        ======
        ValueError
            freqs is not 1-dimensional.
        """
        freqs = np.asarray(freqs)

        #raise error if more than one sequence passed into function
        if freqs.ndim != 1:
            raise ValueError(f"Only one protein sequence should be passed into the function: {freqs}.")

        #NOTE(review): the original comment describes CF = PP/N (peak position/length of
        #largest protein in dataset), but the calculation uses the peak value and the
        #number of sequences - confirm which is intended before changing
        CF = (self.max_freq(freqs)[0])/self.num_seqs
        return CF

    def max_freq(self, freqs):
        """
        Get the maximum frequency from Fourier Transform of an encoded protein sequence.

        Parameters
        ==========
        :freqs: np.ndarray
            frequencies from Fourier Transform, for one sequence (1-dimensional).

        Returns
        =======
        :max_F: float
            maximum frequency found in array of frequencies.
        :max_FI: int
            index of maximum frequency.

        Raises
        ======
        ValueError
            freqs is not 1-dimensional, or is empty.
        """
        freqs = np.asarray(freqs)

        #raise error if more than one sequence passed into function
        if freqs.ndim != 1:
            raise ValueError(f"Only one protein sequence should be passed into the function: {freqs}.")

        if freqs.size == 0:
            raise ValueError("Cannot get the maximum frequency of an empty array of frequencies.")

        #take the value at the argmax index so the value and index always refer
        #to the same element
        max_FI = np.argmax(freqs)
        max_F = freqs[max_FI]
        return max_F, max_FI

    ######################          Getters & Setters          ######################

    @property
    def fft_power(self):
        """Power spectrum of the encoded sequences."""
        return self._fft_power

    @fft_power.setter
    def fft_power(self, val):
        self._fft_power = val

    @property
    def fft_real(self):
        """Real part of the FFT of the encoded sequences."""
        return self._fft_real

    @fft_real.setter
    def fft_real(self, val):
        self._fft_real = val

    @property
    def fft_imag(self):
        """Imaginary part of the FFT of the encoded sequences."""
        return self._fft_imag

    @fft_imag.setter
    def fft_imag(self, val):
        self._fft_imag = val

    @property
    def fft_abs(self):
        """Absolute spectrum of the encoded sequences."""
        return self._fft_abs

    @fft_abs.setter
    def fft_abs(self, val):
        self._fft_abs = val

    @property
    def fft_freqs(self):
        """FFT sample frequencies for each encoded sequence."""
        return self._fft_freqs

    @fft_freqs.setter
    def fft_freqs(self, val):
        self._fft_freqs = val

    @property
    def window(self):
        """Window values multiplied into each sequence before the FFT, 1 for no window."""
        return self._window

    @window.setter
    def window(self, val):
        self._window = val

    @property
    def window_type(self):
        """Name of the window function."""
        return self._window_type

    @window_type.setter
    def window_type(self, val):
        self._window_type = val

    @property
    def filter(self):
        """Output of the filter for the most recently encoded sequence, None if no filter applied."""
        return self._filter

    @filter.setter
    def filter(self, val):
        self._filter = val

    @property
    def filter_type(self):
        """Name of the filter function."""
        return self._filter_type

    @filter_type.setter
    def filter_type(self, val):
        self._filter_type = val

    def __str__(self):
        return f"Instance of PyDSP class, using parameters: {self.__dict__.keys()}."

    def __repr__(self):
        return (f'<PyDSP: {self}>.')
|