PySAR 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +53 -0
- pySAR/__init__.py +28 -0
- pySAR/descriptors.py +2893 -0
- pySAR/encoding.py +986 -0
- pySAR/evaluate.py +231 -0
- pySAR/globals_.py +21 -0
- pySAR/model.py +559 -0
- pySAR/plots.py +92 -0
- pySAR/py.typed +0 -0
- pySAR/pyDSP.py +582 -0
- pySAR/pySAR.py +962 -0
- pySAR/utils.py +283 -0
- pysar-2.5.0.dist-info/METADATA +740 -0
- pysar-2.5.0.dist-info/RECORD +17 -0
- pysar-2.5.0.dist-info/WHEEL +5 -0
- pysar-2.5.0.dist-info/licenses/LICENSE +21 -0
- pysar-2.5.0.dist-info/top_level.txt +2 -0
pySAR/descriptors.py
ADDED
|
@@ -0,0 +1,2893 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# Descriptors #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
from typing import Union, List, Optional, Dict, Any, Callable, Tuple
|
|
6
|
+
from enum import Enum
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import numpy as np
|
|
9
|
+
from difflib import get_close_matches
|
|
10
|
+
import json
|
|
11
|
+
from json import JSONDecodeError
|
|
12
|
+
import itertools
|
|
13
|
+
import time
|
|
14
|
+
from tqdm import tqdm
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
|
|
17
|
+
from .utils import *
|
|
18
|
+
import protpy as protpy
|
|
19
|
+
|
|
20
|
+
# Descriptor feature dimension constants.
# The peptide-composition sizes are derived from AA_COUNT so the three values
# cannot drift apart if the alphabet size is ever revisited.
AA_COUNT = 20                                # size of the amino acid alphabet used for composition features
DIPEPTIDE_FEATURES = AA_COUNT ** 2           # 400
TRIPEPTIDE_FEATURES = AA_COUNT ** 3          # 8000
CONJOINT_TRIAD_FEATURES = 343                # 343 (= 7 ** 3)
|
|
25
|
+
|
|
26
|
+
class DescriptorType(Enum):
    """
    Enumeration of available protein descriptor types.

    Each member's value is the lower-case, snake_case descriptor name, i.e.
    ``member.value == member.name.lower()``. Member order is significant for
    iteration and is kept as: composition descriptors, autocorrelation
    descriptors, CTD descriptors, conjoint triad, sequence order descriptors
    and pseudo composition descriptors.
    """
    # composition descriptors
    AMINO_ACID_COMPOSITION = 'amino_acid_composition'
    DIPEPTIDE_COMPOSITION = 'dipeptide_composition'
    TRIPEPTIDE_COMPOSITION = 'tripeptide_composition'
    GRAVY = 'gravy'
    AROMATICITY = 'aromaticity'
    INSTABILITY_INDEX = 'instability_index'
    ISOELECTRIC_POINT = 'isoelectric_point'
    MOLECULAR_WEIGHT = 'molecular_weight'
    CHARGE_DISTRIBUTION = 'charge_distribution'
    HYDROPHOBIC_POLAR_CHARGED_COMPOSITION = 'hydrophobic_polar_charged_composition'
    SECONDARY_STRUCTURE_PROPENSITY = 'secondary_structure_propensity'
    KMER_COMPOSITION = 'kmer_composition'
    REDUCED_ALPHABET_COMPOSITION = 'reduced_alphabet_composition'
    MOTIF_COMPOSITION = 'motif_composition'
    AMINO_ACID_PAIR_COMPOSITION = 'amino_acid_pair_composition'
    ALIPHATIC_INDEX = 'aliphatic_index'
    EXTINCTION_COEFFICIENT = 'extinction_coefficient'
    BOMAN_INDEX = 'boman_index'
    AGGREGATION_PROPENSITY = 'aggregation_propensity'
    HYDROPHOBIC_MOMENT = 'hydrophobic_moment'
    SHANNON_ENTROPY = 'shannon_entropy'

    # autocorrelation descriptors
    MOREAUBROTO_AUTOCORRELATION = 'moreaubroto_autocorrelation'
    MORAN_AUTOCORRELATION = 'moran_autocorrelation'
    GEARY_AUTOCORRELATION = 'geary_autocorrelation'

    # Composition-Transition-Distribution descriptors
    CTD = 'ctd'
    CTD_COMPOSITION = 'ctd_composition'
    CTD_TRANSITION = 'ctd_transition'
    CTD_DISTRIBUTION = 'ctd_distribution'

    # conjoint triad descriptor
    CONJOINT_TRIAD = 'conjoint_triad'

    # sequence order descriptors
    SEQUENCE_ORDER_COUPLING_NUMBER = 'sequence_order_coupling_number'
    QUASI_SEQUENCE_ORDER = 'quasi_sequence_order'

    # pseudo composition descriptors
    PSEUDO_AMINO_ACID_COMPOSITION = 'pseudo_amino_acid_composition'
    AMPHIPHILIC_PSEUDO_AMINO_ACID_COMPOSITION = 'amphiphilic_pseudo_amino_acid_composition'
|
|
61
|
+
|
|
62
|
+
class Descriptors():
|
|
63
|
+
"""
|
|
64
|
+
Class for calculating a wide variety of protein physicochemical, biochemical and structural
|
|
65
|
+
descriptors. These descriptors have been used in a wide variety of Bioinformatics
|
|
66
|
+
applications including: protein structural and functional class prediction,
|
|
67
|
+
protein-protein interactions, subcellular location, secondary structure prediction, among
|
|
68
|
+
many more. They represent the different structural, functional & interaction profiles of
|
|
69
|
+
proteins by exploring the features in the groups of composition, correlation and distribution
|
|
70
|
+
of the constituent residues and their biochemical and physicochemical properties.
|
|
71
|
+
|
|
72
|
+
A custom-built software package was created to generate these descriptors - protpy, which
|
|
73
|
+
is also open-source and available here: https://github.com/amckenna41/protpy. The package
|
|
74
|
+
takes 1 or more protein sequences, returning the respective descriptor values in a Pandas
|
|
75
|
+
DataFrame. protpy and this class allow calculation of the following descriptors: Amino
|
|
76
|
+
Acid Composition (AAComp), Dipeptide Composition (DPComp), Tripeptide Composition (TPComp),
|
|
77
|
+
MoreauBroto Autocorrelation (MBAuto), Moran Autocorrelation (MAuto), Geary Autocorrelation
|
|
78
|
+
(GAuto), Composition (CTD_C), Transition (CTD_T), Distribution (CTD_D), CTD, Conjoint Triad
|
|
79
|
+
(CTriad), Sequence Order Coupling Number (SOCN), Quasi Sequence Order (QSO), Pseudo Amino Acid
|
|
80
|
+
Composition - type 1 (PAAcomp), Amphiphilic Pseudo Amino Acid Composition - type 2 (APAAComp),
|
|
81
|
+
GRAVY, Aromaticity, Instability Index, Isoelectric Point, Molecular Weight, Charge Distribution,
|
|
82
|
+
Hydrophobic/Polar/Charged Composition (HPC), Secondary Structure Propensity (SSP), k-mer
|
|
83
|
+
Composition, Reduced Alphabet Composition, Motif Composition, Amino Acid Pair Composition,
|
|
84
|
+
Aliphatic Index, Extinction Coefficient, Boman Index, Aggregation Propensity, Hydrophobic
|
|
85
|
+
Moment, and Shannon Entropy.
|
|
86
|
+
|
|
87
|
+
Similar to other classes in pySAR, this class works via configuration files which contain
|
|
88
|
+
the values for all the potential parameters, if applicable, of each descriptor. By default,
|
|
89
|
+
the class will look for a descriptors csv which is a file of the pre-calculated descriptor
|
|
90
|
+
values for the specified dataset, if this file doesn't exist, or the parameter value is blank,
|
|
91
|
+
then each descriptor will have to be calculated using its respective function.
|
|
92
|
+
|
|
93
|
+
During initialization, input sequences are normalized by removing gaps and then validated
|
|
94
|
+
against canonical amino acids before descriptor generation begins.
|
|
95
|
+
|
|
96
|
+
This class is also designed to feed descriptor feature matrices directly into downstream
|
|
97
|
+
Encoding and PySAR workflows for model training and evaluation.
|
|
98
|
+
|
|
99
|
+
It is recommended that with every new dataset, the Descriptors class should be instantiated
|
|
100
|
+
with the "all_desc" parameter set to 1 in the config file. This will calculate all the descriptor
|
|
101
|
+
values for the dataset of protein sequences, storing the result in a csv file, meaning that
|
|
102
|
+
this file can be used for future use and the descriptors will not have to be recalculated each
|
|
103
|
+
time. This csv file will be saved to the path and filename according to the "descriptors_csv"
|
|
104
|
+
parameter in the config file.
|
|
105
|
+
|
|
106
|
+
Parameters
|
|
107
|
+
==========
|
|
108
|
+
:config_file: str
|
|
109
|
+
path to configuration file which will contain the various parameter values for all
|
|
110
|
+
descriptors. If invalid value input then error will be raised.
|
|
111
|
+
:protein_seqs: pd.Series or str
|
|
112
|
+
protein sequences to calculate descriptors for. A single sequence string is converted
|
|
113
|
+
internally to a pandas Series. If None or empty, sequences are loaded from the dataset
|
|
114
|
+
path in the configuration.
|
|
115
|
+
**kwargs: dict
|
|
116
|
+
keyword argument names and values for the dataset filename/path and the descriptors
|
|
117
|
+
csv path parameters. The keywords should be the same name and form of those in the
|
|
118
|
+
configuration file. The keyword values input take precedence over those in the config files.
|
|
119
|
+
|
|
120
|
+
Attributes
|
|
121
|
+
==========
|
|
122
|
+
:amino_acid_composition: pd.DataFrame
|
|
123
|
+
Amino acid composition descriptor (20 features)
|
|
124
|
+
:dipeptide_composition: pd.DataFrame
|
|
125
|
+
Dipeptide composition descriptor (400 features)
|
|
126
|
+
:tripeptide_composition: pd.DataFrame
|
|
127
|
+
Tripeptide composition descriptor (8000 features)
|
|
128
|
+
:moreaubroto_autocorrelation: pd.DataFrame
|
|
129
|
+
Moreaubroto autocorrelation descriptor (240 features)
|
|
130
|
+
:moran_autocorrelation: pd.DataFrame
|
|
131
|
+
Moran autocorrelation descriptor (240 features)
|
|
132
|
+
:geary_autocorrelation: pd.DataFrame
|
|
133
|
+
Geary autocorrelation descriptor (240 features)
|
|
134
|
+
:ctd: pd.DataFrame
|
|
135
|
+
Composition-Transition-Distribution descriptor
|
|
136
|
+
:conjoint_triad: pd.DataFrame
|
|
137
|
+
Conjoint triad descriptor (343 features)
|
|
138
|
+
:pseudo_amino_acid_composition: pd.DataFrame
|
|
139
|
+
Pseudo amino acid composition descriptor
|
|
140
|
+
:amphiphilic_pseudo_amino_acid_composition: pd.DataFrame
|
|
141
|
+
Amphiphilic pseudo amino acid composition descriptor
|
|
142
|
+
:gravy: pd.DataFrame
|
|
143
|
+
GRAVY (Grand Average of Hydropathy) descriptor (1 feature)
|
|
144
|
+
:aromaticity: pd.DataFrame
|
|
145
|
+
Aromaticity descriptor (1 feature)
|
|
146
|
+
:instability_index: pd.DataFrame
|
|
147
|
+
Instability Index descriptor (1 feature)
|
|
148
|
+
:isoelectric_point: pd.DataFrame
|
|
149
|
+
Isoelectric Point descriptor (1 feature)
|
|
150
|
+
:molecular_weight: pd.DataFrame
|
|
151
|
+
Molecular Weight descriptor (1 feature)
|
|
152
|
+
:charge_distribution: pd.DataFrame
|
|
153
|
+
Charge Distribution descriptor (3 features)
|
|
154
|
+
:hydrophobic_polar_charged_composition: pd.DataFrame
|
|
155
|
+
Hydrophobic/Polar/Charged Composition descriptor (3 features)
|
|
156
|
+
:secondary_structure_propensity: pd.DataFrame
|
|
157
|
+
Secondary Structure Propensity descriptor (3 features)
|
|
158
|
+
:kmer_composition: pd.DataFrame
|
|
159
|
+
k-mer Composition descriptor (20^k features, default 400)
|
|
160
|
+
:reduced_alphabet_composition: pd.DataFrame
|
|
161
|
+
Reduced Alphabet Composition descriptor (alphabet_size features, default 6)
|
|
162
|
+
:motif_composition: pd.DataFrame
|
|
163
|
+
Motif Composition descriptor (8 features by default)
|
|
164
|
+
:amino_acid_pair_composition: pd.DataFrame
|
|
165
|
+
Amino Acid Pair Composition descriptor (400 features)
|
|
166
|
+
:aliphatic_index: pd.DataFrame
|
|
167
|
+
Aliphatic Index descriptor (1 feature)
|
|
168
|
+
:extinction_coefficient: pd.DataFrame
|
|
169
|
+
Extinction Coefficient descriptor (2 features)
|
|
170
|
+
:boman_index: pd.DataFrame
|
|
171
|
+
Boman Index descriptor (1 feature)
|
|
172
|
+
:aggregation_propensity: pd.DataFrame
|
|
173
|
+
Aggregation Propensity descriptor (2 features)
|
|
174
|
+
:hydrophobic_moment: pd.DataFrame
|
|
175
|
+
Hydrophobic Moment descriptor (2 features)
|
|
176
|
+
:shannon_entropy: pd.DataFrame
|
|
177
|
+
Shannon Entropy descriptor (1 feature)
|
|
178
|
+
:all_descriptors: pd.DataFrame
|
|
179
|
+
Concatenated dataframe of all calculated descriptors
|
|
180
|
+
:valid_descriptors: list
|
|
181
|
+
List of all available descriptor names
|
|
182
|
+
:descriptor_groups: dict
|
|
183
|
+
Mapping of descriptor names to their functional groups
|
|
184
|
+
:num_seqs: int
|
|
185
|
+
Total number of input protein sequences
|
|
186
|
+
:protein_seqs: pd.Series
|
|
187
|
+
Loaded protein sequences with gaps removed
|
|
188
|
+
|
|
189
|
+
Methods
|
|
190
|
+
=======
|
|
191
|
+
import_descriptors()
|
|
192
|
+
Import pre-calculated descriptors from CSV file
|
|
193
|
+
get_amino_acid_composition()
|
|
194
|
+
Calculate amino acid composition for all sequences
|
|
195
|
+
get_dipeptide_composition()
|
|
196
|
+
Calculate dipeptide composition for all sequences
|
|
197
|
+
get_tripeptide_composition()
|
|
198
|
+
Calculate tripeptide composition for all sequences
|
|
199
|
+
get_moreaubroto_autocorrelation()
|
|
200
|
+
Calculate Moreau-Broto autocorrelation descriptor
|
|
201
|
+
get_moran_autocorrelation()
|
|
202
|
+
Calculate Moran autocorrelation descriptor
|
|
203
|
+
get_geary_autocorrelation()
|
|
204
|
+
Calculate Geary autocorrelation descriptor
|
|
205
|
+
get_ctd()
|
|
206
|
+
Calculate CTD descriptor
|
|
207
|
+
get_ctd_composition()
|
|
208
|
+
Calculate CTD composition descriptor
|
|
209
|
+
get_ctd_transition()
|
|
210
|
+
Calculate CTD transition descriptor
|
|
211
|
+
get_ctd_distribution()
|
|
212
|
+
Calculate CTD distribution descriptor
|
|
213
|
+
get_conjoint_triad()
|
|
214
|
+
Calculate conjoint triad descriptor
|
|
215
|
+
get_sequence_order_coupling_number()
|
|
216
|
+
Calculate sequence order coupling number descriptor
|
|
217
|
+
get_quasi_sequence_order()
|
|
218
|
+
Calculate quasi sequence order descriptor
|
|
219
|
+
get_pseudo_amino_acid_composition()
|
|
220
|
+
Calculate pseudo amino acid composition descriptor
|
|
221
|
+
get_amphiphilic_pseudo_amino_acid_composition()
|
|
222
|
+
Calculate amphiphilic pseudo amino acid composition descriptor
|
|
223
|
+
get_gravy()
|
|
224
|
+
Calculate GRAVY (Grand Average of Hydropathy) descriptor
|
|
225
|
+
get_aromaticity()
|
|
226
|
+
Calculate Aromaticity descriptor
|
|
227
|
+
get_instability_index()
|
|
228
|
+
Calculate Instability Index descriptor
|
|
229
|
+
get_isoelectric_point()
|
|
230
|
+
Calculate Isoelectric Point descriptor
|
|
231
|
+
get_molecular_weight()
|
|
232
|
+
Calculate Molecular Weight descriptor
|
|
233
|
+
get_charge_distribution()
|
|
234
|
+
Calculate Charge Distribution descriptor
|
|
235
|
+
get_hydrophobic_polar_charged_composition()
|
|
236
|
+
Calculate Hydrophobic/Polar/Charged Composition descriptor
|
|
237
|
+
get_secondary_structure_propensity()
|
|
238
|
+
Calculate Secondary Structure Propensity descriptor
|
|
239
|
+
get_kmer_composition()
|
|
240
|
+
Calculate k-mer Composition descriptor
|
|
241
|
+
get_reduced_alphabet_composition()
|
|
242
|
+
Calculate Reduced Alphabet Composition descriptor
|
|
243
|
+
get_motif_composition()
|
|
244
|
+
Calculate Motif Composition descriptor
|
|
245
|
+
get_amino_acid_pair_composition()
|
|
246
|
+
Calculate Amino Acid Pair Composition descriptor
|
|
247
|
+
get_aliphatic_index()
|
|
248
|
+
Calculate Aliphatic Index descriptor
|
|
249
|
+
get_extinction_coefficient()
|
|
250
|
+
Calculate Extinction Coefficient descriptor
|
|
251
|
+
get_boman_index()
|
|
252
|
+
Calculate Boman Index descriptor
|
|
253
|
+
get_aggregation_propensity()
|
|
254
|
+
Calculate Aggregation Propensity descriptor
|
|
255
|
+
get_hydrophobic_moment()
|
|
256
|
+
Calculate Hydrophobic Moment descriptor
|
|
257
|
+
get_shannon_entropy()
|
|
258
|
+
Calculate Shannon Entropy descriptor
|
|
259
|
+
get_all_descriptors()
|
|
260
|
+
Calculate all descriptors and return a concatenated dataframe
|
|
261
|
+
get_descriptor_encoding()
|
|
262
|
+
Resolve a descriptor name and return its encoding dataframe
|
|
263
|
+
all_descriptors_list()
|
|
264
|
+
Return descriptor names or combinations of descriptor names
|
|
265
|
+
validate_descriptors()
|
|
266
|
+
Validate descriptor names exist in valid descriptors list
|
|
267
|
+
validate_sequences()
|
|
268
|
+
Validate sequences contain only canonical amino acids
|
|
269
|
+
get_descriptor_info()
|
|
270
|
+
Get metadata about a specific descriptor
|
|
271
|
+
reset_descriptors()
|
|
272
|
+
Clear all descriptor DataFrames to empty state
|
|
273
|
+
clear_cache()
|
|
274
|
+
Free memory from cached descriptor metadata
|
|
275
|
+
get_descriptor_columns()
|
|
276
|
+
Get column names for a calculated descriptor
|
|
277
|
+
__str__()
|
|
278
|
+
Return a human-readable string summary of descriptor shapes
|
|
279
|
+
__repr__()
|
|
280
|
+
Return the object representation string
|
|
281
|
+
__len__()
|
|
282
|
+
Return number of rows in all_descriptors
|
|
283
|
+
__shape__()
|
|
284
|
+
Return shape of all_descriptors
|
|
285
|
+
__sizeof__()
|
|
286
|
+
Return memory footprint of all_descriptors
|
|
287
|
+
|
|
288
|
+
Raises
|
|
289
|
+
======
|
|
290
|
+
:TypeError
|
|
291
|
+
If config_file is not a string or protein sequences are invalid type
|
|
292
|
+
:OSError
|
|
293
|
+
If config file or dataset file not found at specified path
|
|
294
|
+
:InvalidSequenceError
|
|
295
|
+
If protein sequences contain non-canonical amino acids
|
|
296
|
+
:InvalidDescriptorError
|
|
297
|
+
If requesting a non-existent descriptor
|
|
298
|
+
:DescriptorConfigError
|
|
299
|
+
If configuration JSON file is invalid or malformed
|
|
300
|
+
|
|
301
|
+
Examples
|
|
302
|
+
========
|
|
303
|
+
>>> from pySAR.descriptors import Descriptors
|
|
304
|
+
>>> desc = Descriptors(config_file='config/thermostability.json')
|
|
305
|
+
>>>
|
|
306
|
+
>>> # Calculate single descriptor
|
|
307
|
+
>>> aa_comp = desc.get_amino_acid_composition()
|
|
308
|
+
>>>
|
|
309
|
+
>>> # Calculate multiple descriptors
|
|
310
|
+
>>> desc.get_dipeptide_composition()
|
|
311
|
+
>>> desc.get_moran_autocorrelation()
|
|
312
|
+
>>>
|
|
313
|
+
>>> # Get all descriptors at once
|
|
314
|
+
>>> all_desc = desc.get_all_descriptors()
|
|
315
|
+
>>> all_desc.shape
|
|
316
|
+
(261, 10572)
|
|
317
|
+
>>>
|
|
318
|
+
>>> # Get descriptor information
|
|
319
|
+
>>> info = desc.get_descriptor_info('amino_acid_composition')
|
|
320
|
+
>>> info['feature_count']
|
|
321
|
+
20
|
|
322
|
+
>>>
|
|
323
|
+
>>> # Get columns for a descriptor
|
|
324
|
+
>>> columns = desc.get_descriptor_columns('dipeptide_composition')
|
|
325
|
+
>>> len(columns)
|
|
326
|
+
400
|
|
327
|
+
|
|
328
|
+
Notes
|
|
329
|
+
=====
|
|
330
|
+
- Tripeptide and pseudo-amino acid composition descriptors are computationally expensive
|
|
331
|
+
and may take significant time to calculate on large datasets
|
|
332
|
+
- Pre-calculating all descriptors and exporting to CSV (via 'all_desc' config parameter)
|
|
333
|
+
is recommended to avoid recalculation
|
|
334
|
+
- The descriptor_feature_count property is cached for performance
|
|
335
|
+
- Memory usage scales with dataset size and number of descriptors calculated
|
|
336
|
+
- Protein sequences must contain only the 20 canonical amino acids (ACDEFGHIKLMNPQRSTVWY, i.e. excluding B, J, O, U, X, Z)
|
|
337
|
+
|
|
338
|
+
References
|
|
339
|
+
==========
|
|
340
|
+
[1] Dong, J., Yao, ZJ., Zhang, L. et al. PyBioMed: a python library for
|
|
341
|
+
various molecular representations of chemicals, proteins and DNAs and
|
|
342
|
+
their interactions. J Cheminform 10, 16 (2018).
|
|
343
|
+
https://doi.org/10.1186/s13321-018-0270-2
|
|
344
|
+
[2] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
|
|
345
|
+
fold class predictions. Nucleic Acids Res, 22, 3616-3619.
|
|
346
|
+
[3] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
|
|
347
|
+
subcellular localization prediction. Bioinformatics, 17, 721-728.
|
|
348
|
+
[4] Broto P, Moreau G, Vandicke C: Molecular structures: perception,
|
|
349
|
+
autocorrelation descriptor and SAR studies. Eur J Med Chem 1984, 19: 71–78.
|
|
350
|
+
[5] Ong, S.A., Lin, H.H., Chen, Y.Z. et al. Efficacy of different protein
|
|
351
|
+
descriptors in predicting protein functional families. BMC Bioinformatics
|
|
352
|
+
8, 300 (2007). https://doi.org/10.1186/1471-2105-8-300
|
|
353
|
+
[6] Inna Dubchak, Ilya Muchnik, Stephen R.Holbrook and Sung-Hou Kim. Prediction
|
|
354
|
+
of protein folding class using global description of amino acid sequence.
|
|
355
|
+
Proc.Natl. Acad.Sci.USA, 1995, 92, 8700-8704.
|
|
356
|
+
[7] Juwen Shen, Jian Zhang, Xiaomin Luo, Weiliang Zhu, Kunqian Yu, Kaixian Chen,
|
|
357
|
+
Yixue Li, Huanliang Jiang. Predicting protein-protein interactions based only
|
|
358
|
+
on sequences information. PNAS. 2007 (104) 4337-4341.
|
|
359
|
+
[8] Kuo-Chen Chou. Prediction of Protein Subcellular Locations by Incorporating
|
|
360
|
+
Quasi-Sequence-Order Effect. Biochemical and Biophysical Research
|
|
361
|
+
Communications 2000, 278, 477-483.
|
|
362
|
+
[9] Kuo-Chen Chou. Prediction of Protein Cellular Attributes Using
|
|
363
|
+
Pseudo-Amino Acid Composition. PROTEINS: Structure, Function, and
|
|
364
|
+
Genetics, 2001, 43: 246-255.
|
|
365
|
+
[10] Kuo-Chen Chou. Using amphiphilic pseudo amino acid composition to predict enzyme
|
|
366
|
+
subfamily classes. Bioinformatics, 2005,21,10-19.
|
|
367
|
+
[11] J. Shen et al., “Predicting protein-protein interactions based only on sequences
|
|
368
|
+
information,” Proc. Natl. Acad. Sci. U. S. A., vol. 104, no. 11, pp. 4337–4341, 2007.
|
|
369
|
+
[12] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid Sequences
|
|
370
|
+
by Artificial Neural Networks and Simulated Molecular Evolution: De Novo Design
|
|
371
|
+
of an Idealized Leader Cleavage Site. Biophys Journal, 1994, 66, 335-344.
|
|
372
|
+
[13] Grantham, R. (1974-09-06). "Amino acid difference formula to help explain protein
|
|
373
|
+
evolution". Science. 185 (4154): 862–864. Bibcode:1974Sci...185..862G.
|
|
374
|
+
doi:10.1126/science.185.4154.862. ISSN 0036-8075. PMID 4843792. S2CID 35388307.
|
|
375
|
+
[14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
|
|
376
|
+
vol. 33, no. 2, pp. 91–101, 2003.
|
|
377
|
+
"""
|
|
378
|
+
def __init__(self,
             config_file: str = "",
             protein_seqs: Optional[Union[pd.Series, str]] = None,
             **kwargs) -> None:
    """
    Read the configuration file, resolve the protein sequences to work on and
    initialise every descriptor attribute to an empty dataframe. If a
    descriptors csv exists at the configured path, its pre-calculated values
    are imported via import_descriptors().

    Parameters
    ==========
    :config_file: str
        path to the JSON configuration file. A '.json' extension is appended
        if none is given; the path is looked up as given and then inside a
        local 'config' folder.
    :protein_seqs: pd.Series or str
        protein sequences to calculate descriptors for. A single string is
        converted to a Series, a single-column DataFrame is reduced to that
        column. If None or an empty string, the sequences are read from the
        dataset csv using the "sequence_col" config parameter.
    **kwargs: dict
        optional 'dataset' and 'descriptors_csv' values which take precedence
        over the corresponding values in the config file.

    Returns
    =======
    None

    Raises
    ======
    :TypeError
        If config_file is not a string.
    :OSError
        If the config file or the dataset file cannot be found.
    :ValueError
        If a DataFrame with more than one column is passed as protein_seqs.
    :DescriptorConfigError
        If the config file cannot be read or parsed as JSON.
    :DescriptorError
        If the dataset csv cannot be read or the sequence column is missing.
    :InvalidSequenceError
        If the sequences fail validation by valid_sequence().
    """
    self.config_file = config_file
    self.protein_seqs = protein_seqs
    self.kwargs = kwargs  #any keyword argument variables of class
    self.config_parameters = {}

    #config file must be a string filepath; None is rejected here with a clear
    #message instead of failing later inside os.path.splitext
    if not isinstance(self.config_file, str):
        raise TypeError(f'JSON config file must be a filepath of type string, got type {type(config_file)}.')

    #append extension if only filename input
    if os.path.splitext(self.config_file)[1] == '':
        self.config_file = self.config_file + '.json'

    #look for the config file at the given path, then inside the 'config' folder
    if os.path.isfile(self.config_file):
        desc_config_filepath = self.config_file
    elif os.path.isfile(os.path.join('config', self.config_file)):
        desc_config_filepath = os.path.join('config', self.config_file)
    else:
        raise OSError(f'JSON config file not found at path: {self.config_file}.')

    #open json file and read config parameters
    try:
        with open(desc_config_filepath) as f:
            self.config_parameters = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError, IOError) as e:
        raise DescriptorConfigError(f'Error parsing config JSON file {desc_config_filepath}: {e}') from e

    #create instance of Map class so parameters in config can be accessed via dot notation
    self.dataset_parameters = Map(self.config_parameters["dataset"])
    self.desc_parameters = Map(self.config_parameters["descriptors"])

    #set dataset and descriptors csv filepath from kwargs, if applicable, or the config file values;
    #the config value is only looked up when no keyword override was given
    if 'dataset' in self.kwargs:
        self.dataset_filepath = self.kwargs.get('dataset')
    else:
        self.dataset_filepath = self.dataset_parameters["dataset"]
    if 'descriptors_csv' in self.kwargs:
        self.descriptors_csv = self.kwargs.get('descriptors_csv')
    else:
        self.descriptors_csv = self.desc_parameters.descriptors_csv

    #only the sequences should be passed in, not all columns in a dataset etc.
    #This is checked before any comparison with "" because comparing a DataFrame
    #to a string is element-wise and cannot be used as a boolean
    if isinstance(self.protein_seqs, pd.DataFrame):
        if len(self.protein_seqs.columns) > 1:
            raise ValueError("The full dataset must not be passed in, only the"
                             " columns containing the protein sequences.")
        if len(self.protein_seqs.columns) == 1:
            #reduce the single sequence column to a Series
            self.protein_seqs = self.protein_seqs.iloc[:, 0]

    #import protein sequences from dataset if not directly specified in protein_seqs input param
    if self.protein_seqs is None or (isinstance(self.protein_seqs, str) and self.protein_seqs == ""):
        if not os.path.isfile(self.dataset_filepath):
            raise OSError(f'Dataset file not found at path: {self.dataset_filepath}.')

        #read in dataset csv from filepath mentioned in config
        try:
            data = pd.read_csv(self.dataset_filepath, sep=",", header=0)
            self.protein_seqs = data[self.dataset_parameters["sequence_col"]]
        except (FileNotFoundError, IOError, KeyError, pd.errors.ParserError) as e:
            raise DescriptorError(f'Error opening dataset file {self.dataset_filepath}: {e}') from e
    elif isinstance(self.protein_seqs, str):
        #if 1 protein sequence (1 string) input then convert to pandas Series object
        self.protein_seqs = pd.Series(self.protein_seqs)

    #remove any gaps from protein sequences
    self.protein_seqs = remove_gaps(self.protein_seqs)

    #validate the input protein sequences, raising InvalidSequenceError if
    #valid_sequence() reports anything; identity test so that a non-None
    #container result is never compared element-wise against None
    invalid_seqs = valid_sequence(self.protein_seqs)
    if invalid_seqs is not None:
        raise InvalidSequenceError(f'Invalid Amino Acids found in protein sequence dataset: {invalid_seqs}.')

    #get the total number of inputted protein sequences
    self.num_seqs = len(self.protein_seqs)

    #initialise all descriptor attributes to empty dataframes
    self.amino_acid_composition = pd.DataFrame()
    self.dipeptide_composition = pd.DataFrame()
    self.tripeptide_composition = pd.DataFrame()
    # new composition descriptors (protpy >= 1.3.0)
    self.gravy = pd.DataFrame()
    self.aromaticity = pd.DataFrame()
    self.instability_index = pd.DataFrame()
    self.isoelectric_point = pd.DataFrame()
    self.molecular_weight = pd.DataFrame()
    self.charge_distribution = pd.DataFrame()
    self.hydrophobic_polar_charged_composition = pd.DataFrame()
    self.secondary_structure_propensity = pd.DataFrame()
    self.kmer_composition = pd.DataFrame()
    self.reduced_alphabet_composition = pd.DataFrame()
    self.motif_composition = pd.DataFrame()
    self.amino_acid_pair_composition = pd.DataFrame()
    self.aliphatic_index = pd.DataFrame()
    self.extinction_coefficient = pd.DataFrame()
    self.boman_index = pd.DataFrame()
    self.aggregation_propensity = pd.DataFrame()
    self.hydrophobic_moment = pd.DataFrame()
    self.shannon_entropy = pd.DataFrame()
    self.moreaubroto_autocorrelation = pd.DataFrame()
    self.moran_autocorrelation = pd.DataFrame()
    self.geary_autocorrelation = pd.DataFrame()
    self.ctd = pd.DataFrame()
    self.ctd_composition = pd.DataFrame()
    self.ctd_transition = pd.DataFrame()
    self.ctd_distribution = pd.DataFrame()
    self.conjoint_triad = pd.DataFrame()
    self.sequence_order_coupling_number = pd.DataFrame()
    self.quasi_sequence_order = pd.DataFrame()
    self.pseudo_amino_acid_composition = pd.DataFrame()
    self.amphiphilic_pseudo_amino_acid_composition = pd.DataFrame()
    self.all_descriptors = pd.DataFrame()

    #list of available protein descriptors; assigned before any instance method
    #is called below so that it already exists while those methods run
    self.valid_descriptors = [
        'amino_acid_composition', 'dipeptide_composition', 'tripeptide_composition',
        'gravy', 'aromaticity', 'instability_index', 'isoelectric_point', 'molecular_weight',
        'charge_distribution', 'hydrophobic_polar_charged_composition',
        'secondary_structure_propensity', 'kmer_composition', 'reduced_alphabet_composition',
        'motif_composition', 'amino_acid_pair_composition', 'aliphatic_index',
        'extinction_coefficient', 'boman_index', 'aggregation_propensity',
        'hydrophobic_moment', 'shannon_entropy',
        'moreaubroto_autocorrelation', 'moran_autocorrelation', 'geary_autocorrelation',
        'ctd', 'ctd_composition', 'ctd_transition', 'ctd_distribution', 'conjoint_triad',
        'sequence_order_coupling_number', 'quasi_sequence_order',
        'pseudo_amino_acid_composition', 'amphiphilic_pseudo_amino_acid_composition'
    ]

    #append extension if just the filename input as descriptors csv
    if self.descriptors_csv and os.path.splitext(self.descriptors_csv)[1] == '':
        self.descriptors_csv = self.descriptors_csv + ".csv"

    #try importing descriptors csv with pre-calculated descriptor values; a
    #blank or None path means there is nothing to import
    if self.descriptors_csv and os.path.isfile(self.descriptors_csv):
        self.import_descriptors(self.descriptors_csv)
        #the imported csv now determines the number of sequences
        self.num_seqs = self.all_descriptors.shape[0]

    #create dictionary of descriptors and their associated groups
    keys = self.all_descriptors_list()
    # 21 Composition (3 original + 18 new) + 3 Autocorrelation + 4 CTD + 1 Conjoint Triad + 2 Sequence Order + 2 Pseudo Composition
    values = (["Composition"] * 21 + ["Autocorrelation"] * 3 + ["CTD"] * 4 +
              ["Conjoint Triad"] + ["Sequence Order"] * 2 + ["Pseudo Composition"] * 2)
    self.descriptor_groups = dict(zip(keys, values))

    #get shape of descriptors
    self.shape = self.all_descriptors.shape
|
|
524
|
+
|
|
525
|
+
def import_descriptors(self, descriptor_filepath: str = "") -> None:
    """
    Import descriptors from descriptors csv, setting the class attributes to their values.
    It is recommended that after calculating the descriptors for a dataset of sequences
    the calculated values are exported to a csv, so they don't need to be recalculated
    each time.

    Every descriptor attribute is filled with a positional (iloc) slice of the csv
    columns, so the csv is expected to hold the descriptors in the order they are
    consumed below: amino acid, dipeptide and tripeptide composition, MoreauBroto,
    Moran and Geary autocorrelation, CTD (composition, transition, distribution),
    conjoint triad, sequence order coupling number, quasi sequence order, pseudo
    amino acid composition and amphiphilic pseudo amino acid composition. The width
    of each slice comes from the module-level feature constants and from
    self.desc_parameters. A slice that runs past the last csv column yields an
    empty/short dataframe rather than an error. The all_descriptors class attribute
    is set to the whole dataframe read from the csv.

    Parameters
    ==========
    :descriptor_filepath: str
        filepath to pre-calculated descriptor csv file.

    Returns
    =======
    None

    Raises
    ======
    :TypeError
        If descriptor_filepath is not a string.
    :OSError
        If no file exists at descriptor_filepath.
    :DescriptorError
        If pandas fails to read or parse the csv; the underlying error is
        attached as the cause.
    """
    #raise type error if filepath parameter isn't string
    if not isinstance(descriptor_filepath, str):
        raise TypeError(f"Filepath input parameter should be type str, got {type(descriptor_filepath)}.")

    #verify descriptors csv exists at filepath
    if not os.path.isfile(descriptor_filepath):
        raise OSError(f'Descriptors csv file does not exist at filepath: {descriptor_filepath}.')

    #import descriptors csv as dataframe, keeping the original error as the explicit cause
    try:
        descriptor_df = pd.read_csv(descriptor_filepath)
    except (FileNotFoundError, IOError, pd.errors.ParserError) as e:
        raise DescriptorError(f'Error reading descriptors csv file {descriptor_filepath}: {e}') from e

    #replacing any +/- infinity or NAN values with 0
    descriptor_df = descriptor_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    #running column offset: every call hands back the next `width` columns and
    #advances the offset, so each descriptor starts where the previous one ended
    offset = 0

    def next_columns(width):
        nonlocal offset
        start, offset = offset, offset + width
        return descriptor_df.iloc[:, start:offset]

    #fixed-width composition descriptors
    self.amino_acid_composition = next_columns(AA_COUNT)
    self.dipeptide_composition = next_columns(DIPEPTIDE_FEATURES)
    self.tripeptide_composition = next_columns(TRIPEPTIDE_FEATURES)

    #dimension of autocorrelation (moreaubroto, moran and geary) descriptors depends on the lag value and number of properties
    for autocorrelation in ("moreaubroto_autocorrelation", "moran_autocorrelation", "geary_autocorrelation"):
        autocorrelation_params = getattr(self.desc_parameters, autocorrelation)
        setattr(self, autocorrelation,
            next_columns(autocorrelation_params["lag"] * len(autocorrelation_params["properties"])))

    #get CTD parameters from config to determine the dimensions of the CTD descriptors
    ctd_property = self.desc_parameters.ctd["property"]
    if not isinstance(ctd_property, list):
        ctd_property = ctd_property.split(',')
    ctd_all_ctd = self.desc_parameters.ctd["all"]

    #all 7 properties are used when "all" is set, otherwise only the configured ones;
    #each property contributes 3 composition + 3 transition + 15 distribution = 21 features
    ctd_property_count = 7 if ctd_all_ctd else len(ctd_property)
    ctd_start = offset
    ctd_composition = next_columns(ctd_property_count * 3)
    ctd_transition = next_columns(ctd_property_count * 3)
    ctd_distribution = next_columns(ctd_property_count * 15)

    #the full CTD descriptor spans its three sub-descriptors
    self.ctd = descriptor_df.iloc[:, ctd_start:offset]
    self.ctd_composition = ctd_composition
    self.ctd_transition = ctd_transition
    self.ctd_distribution = ctd_distribution

    self.conjoint_triad = next_columns(CONJOINT_TRIAD_FEATURES)

    #socn value dependant on value of lag and distance matrix
    socn_lag = self.desc_parameters.sequence_order_coupling_number["lag"]
    socn_distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"]

    #if no distance matrix specified in config then both are used for descriptor calculation, doubling the width
    if socn_distance_matrix is None or socn_distance_matrix == "":
        socn_width = socn_lag * 2
    else:
        socn_width = socn_lag
    self.sequence_order_coupling_number = next_columns(socn_width)

    quasi_seq_order_lag = self.desc_parameters.quasi_sequence_order["lag"]
    quasi_seq_order_dist_matrix = self.desc_parameters.quasi_sequence_order["distance_matrix"]

    #if no distance matrix specified in config then both are used for descriptor calculation, doubling the width
    if quasi_seq_order_dist_matrix is None or quasi_seq_order_dist_matrix == "":
        quasi_seq_order_width = (quasi_seq_order_lag + 20) * 2
    else:
        quasi_seq_order_width = quasi_seq_order_lag + 20
    self.quasi_sequence_order = next_columns(quasi_seq_order_width)

    #paac value dependant on lambda value
    paac_lambda = self.desc_parameters.pseudo_amino_acid_composition["lambda"]
    self.pseudo_amino_acid_composition = next_columns(20 + paac_lambda)

    #apaac value dependant on lambda value, two features per lambda step
    apaac_lambda = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["lambda"]
    self.amphiphilic_pseudo_amino_acid_composition = next_columns(20 + (2 * apaac_lambda))

    self.all_descriptors = descriptor_df.iloc[:, :]
|
|
657
|
+
|
|
658
|
+
def validate_descriptors(self, descriptors: Union[str, List[str]]) -> List[str]:
    """
    Check that every requested descriptor name is present in
    self.valid_descriptors and hand the names back as a list.

    Parameters
    ==========
    :descriptors: str or list of str
        A single descriptor name or a list of descriptor names. A single
        string is wrapped in a one-element list; a list is returned as-is.

    Returns
    =======
    :List[str]
        The validated descriptor names.

    Raises
    ======
    :TypeError
        If descriptors is neither a string nor a list, or if any element
        of the list is not a string.
    :InvalidDescriptorError
        If at least one name is missing from self.valid_descriptors.
    """
    #normalise the input to a list of names
    if isinstance(descriptors, str):
        names = [descriptors]
    elif isinstance(descriptors, list):
        names = descriptors
    else:
        raise TypeError(
            f"Descriptors must be a string or list of strings, got {type(descriptors)}."
        )

    #every element has to be a string before membership is checked
    for name in names:
        if not isinstance(name, str):
            raise TypeError("All descriptor names must be strings.")

    #names that are requested but not known to the class
    unknown = set(names).difference(self.valid_descriptors)
    if unknown:
        raise InvalidDescriptorError(f"Invalid descriptors requested: {unknown}. "
            f"Valid descriptors: {self.valid_descriptors}")

    return names
|
|
695
|
+
|
|
696
|
+
def validate_sequences(self, seqs: Optional[pd.Series] = None) -> bool:
    """
    Run the valid_sequence helper over the sequences and fail loudly if it
    reports anything.

    Parameters
    ==========
    :seqs: pd.Series, optional
        Sequences to validate. When None, self.protein_seqs is validated.

    Returns
    =======
    :bool
        True when valid_sequence reports no invalid amino acids.

    Raises
    ======
    :InvalidSequenceError
        If valid_sequence returns anything other than None; the returned
        value is included in the error message.
    """
    #fall back to the sequences stored on the instance
    if seqs is None:
        seqs = self.protein_seqs

    #helper returns None when there is nothing to report
    bad_residues = valid_sequence(seqs)
    if bad_residues is None:
        return True

    raise InvalidSequenceError(f"Invalid amino acids found: {bad_residues}")
|
|
722
|
+
|
|
723
|
+
@property
def descriptor_feature_count(self) -> Dict[str, int]:
    """
    Get count of features in each descriptor.

    The counts are rebuilt on every access instead of being memoised: which
    entries are present depends on which descriptor dataframes are currently
    non-empty, so a cached result would go stale as soon as another descriptor
    is calculated or imported. A cache on the method would additionally keep the
    instance alive and hand the same mutable dict to every caller. Building the
    dict only involves a handful of shape lookups.

    Returns
    =======
    :Dict[str, int]
        Dictionary mapping descriptor names to feature counts. The three
        composition descriptors and conjoint_triad are always present (from the
        module-level constants); the remaining descriptors are only listed
        when their dataframe is non-empty, and the CTD sub-descriptors are
        listed whenever the combined ctd dataframe is non-empty.
    """
    counts = {
        'amino_acid_composition': AA_COUNT,
        'dipeptide_composition': DIPEPTIDE_FEATURES,
        'tripeptide_composition': TRIPEPTIDE_FEATURES,
    }

    def add_if_calculated(*descriptor_names):
        #record the column count of each named descriptor that holds data
        for descriptor_name in descriptor_names:
            frame = getattr(self, descriptor_name)
            if not frame.empty:
                counts[descriptor_name] = frame.shape[1]

    # Autocorrelation counts depend on lag and properties
    add_if_calculated(
        'moreaubroto_autocorrelation',
        'moran_autocorrelation',
        'geary_autocorrelation',
    )

    # CTD counts, gated on the combined CTD dataframe only
    if not self.ctd.empty:
        counts['ctd'] = self.ctd.shape[1]
        counts['ctd_composition'] = self.ctd_composition.shape[1]
        counts['ctd_transition'] = self.ctd_transition.shape[1]
        counts['ctd_distribution'] = self.ctd_distribution.shape[1]

    counts['conjoint_triad'] = CONJOINT_TRIAD_FEATURES

    # Sequence order and pseudo composition counts
    add_if_calculated(
        'sequence_order_coupling_number',
        'quasi_sequence_order',
        'pseudo_amino_acid_composition',
        'amphiphilic_pseudo_amino_acid_composition',
    )

    return counts
|
|
770
|
+
|
|
771
|
+
def get_amino_acid_composition(self) -> pd.DataFrame:
    """
    Return the Amino Acid Composition (AAComp) of the protein sequences,
    calculated with the protpy package. AAComp describes the fraction of
    each amino acid type within a protein sequence:

    AA_Comp(s) = AA(t)/N(s)

    where AA_Comp(s) is the AAComp of protein sequence s, AA(t) is the
    number of amino acids of type t (where t = 1,2,..,20) and N(s) is the
    length of the sequence s.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :amino_acid_composition: pd.Dataframe
        pandas dataframe of AAComp for the protein sequences, documented as
        shape N x 20, where N is the number of protein sequences and 20 is
        the number of features (one per canonical amino acid).
    """
    #only calculate when nothing has been stored on the instance yet
    if self.amino_acid_composition.empty:
        self.amino_acid_composition = self._calculate_descriptor_batch(
            protpy.amino_acid_composition,
            desc_name="Amino Acid Composition"
        )

    return self.amino_acid_composition
|
|
806
|
+
|
|
807
|
+
def get_dipeptide_composition(self) -> pd.DataFrame:
    """
    Return the Dipeptide Composition (DPComp) of the protein sequences,
    calculated with the protpy package. Dipeptide composition is the
    fraction of each dipeptide type within a protein sequence. Dipeptides
    have length 2 and there are 20 canonical amino acids, giving 20^2
    combinations and therefore a 400-dimensional vector per sequence. Each
    value is the number of dipeptides formed by amino acid types s and t
    (s, t = 1,2,..,20) normalised by the total number of dipeptides in the
    sequence (see protpy for the exact normalisation).

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :dipeptide_composition: pd.Dataframe
        pandas Dataframe of dipeptide composition for the protein sequences,
        documented as shape N x 400, where N is the number of protein
        sequences and 400 is the number of features (20^2).
    """
    #only calculate when nothing has been stored on the instance yet
    if self.dipeptide_composition.empty:
        self.dipeptide_composition = self._calculate_descriptor_batch(
            protpy.dipeptide_composition,
            desc_name="Dipeptide Composition"
        )

    return self.dipeptide_composition
|
|
846
|
+
|
|
847
|
+
def get_tripeptide_composition(self) -> pd.DataFrame:
    """
    Return the Tripeptide Composition (TPComp) of the protein sequences,
    calculated with the protpy package. Tripeptide composition is the
    fraction of each tripeptide type within a protein sequence. Tripeptides
    have length 3 and there are 20 canonical amino acids, giving 20^3
    combinations and therefore an 8000-dimensional vector per sequence.
    Each value is the number of tripeptides formed by amino acid types s,
    t and u (s, t, u = 1,2,..,20) normalised by the total number of
    tripeptides in the sequence (see protpy for the exact normalisation).

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :tripeptide_composition: pd.Dataframe
        pandas Dataframe of tripeptide composition for the protein
        sequences, documented as shape N x 8000, where N is the number of
        protein sequences and 8000 is the number of features (20^3).
    """
    #only calculate when nothing has been stored on the instance yet
    if self.tripeptide_composition.empty:
        self.tripeptide_composition = self._calculate_descriptor_batch(
            protpy.tripeptide_composition,
            desc_name="Tripeptide Composition"
        )

    return self.tripeptide_composition
|
|
886
|
+
|
|
887
|
+
def get_gravy(self) -> pd.DataFrame:
    """
    Return the Grand Average of Hydropathy (GRAVY) of the protein sequences,
    calculated with the protpy package. GRAVY is the mean of Kyte-Doolittle
    hydropathy values across all residues; positive values indicate overall
    hydrophobicity and negative values overall hydrophilicity.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :gravy: pd.DataFrame
        Dataframe of GRAVY values, documented as shape N x 1 where N is the
        number of sequences.
    """
    # compute GRAVY for all sequences only if no result is stored yet
    if self.gravy.empty:
        self.gravy = self._calculate_descriptor_batch(
            protpy.gravy,
            desc_name="GRAVY"
        )

    return self.gravy
|
|
913
|
+
|
|
914
|
+
def get_aromaticity(self) -> pd.DataFrame:
    """
    Return the Aromaticity of the protein sequences, calculated with the
    protpy package. Aromaticity is the fraction of aromatic residues
    (F, W, Y, H) in the sequence.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :aromaticity: pd.DataFrame
        Dataframe of Aromaticity values, documented as shape N x 1 where N
        is the number of sequences.
    """
    # compute aromaticity for all sequences only if no result is stored yet
    if self.aromaticity.empty:
        self.aromaticity = self._calculate_descriptor_batch(
            protpy.aromaticity,
            desc_name="Aromaticity"
        )

    return self.aromaticity
|
|
938
|
+
|
|
939
|
+
def get_instability_index(self) -> pd.DataFrame:
    """
    Return the Instability Index of the protein sequences, calculated with
    the protpy package. The index is based on dipeptide instability weight
    values (DIWV); values below 40 indicate a stable protein, 40 or above
    indicates instability.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :instability_index: pd.DataFrame
        Dataframe of InstabilityIndex values, documented as shape N x 1.
    """
    # compute the instability index for all sequences only if no result is stored yet
    if self.instability_index.empty:
        self.instability_index = self._calculate_descriptor_batch(
            protpy.instability_index,
            desc_name="Instability Index"
        )

    return self.instability_index
|
|
964
|
+
|
|
965
|
+
def get_isoelectric_point(self) -> pd.DataFrame:
    """
    Return the Isoelectric Point of the protein sequences, calculated with
    the protpy package. The isoelectric point is the estimated pH at which
    the protein carries no net charge, calculated iteratively using
    standard pKa values for ionisable residues.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :isoelectric_point: pd.DataFrame
        Dataframe of IsoelectricPoint values, documented as shape N x 1.
    """
    # compute the isoelectric point for all sequences only if no result is stored yet
    if self.isoelectric_point.empty:
        self.isoelectric_point = self._calculate_descriptor_batch(
            protpy.isoelectric_point,
            desc_name="Isoelectric Point"
        )

    return self.isoelectric_point
|
|
990
|
+
|
|
991
|
+
def get_molecular_weight(self) -> pd.DataFrame:
    """
    Return the Molecular Weight of the protein sequences, calculated with
    the protpy package. This is the average molecular weight calculated
    from residue masses, corrected for water lost at each peptide bond.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :molecular_weight: pd.DataFrame
        Dataframe of MolecularWeight values (Da), documented as shape N x 1.
    """
    # compute the molecular weight for all sequences only if no result is stored yet
    if self.molecular_weight.empty:
        self.molecular_weight = self._calculate_descriptor_batch(
            protpy.molecular_weight,
            desc_name="Molecular Weight"
        )

    return self.molecular_weight
|
|
1016
|
+
|
|
1017
|
+
def get_charge_distribution(self) -> pd.DataFrame:
    """
    Return the Charge Distribution of the protein sequences, calculated
    with the protpy package. It holds the positive, negative and net charge
    contributions of ionisable residues at a given pH, using the
    Henderson-Hasselbalch equation.

    The pH is read from the 'ph' key of the charge_distribution entry of
    self.desc_parameters; if that entry is missing or empty, or has no
    'ph' key, 7.4 is used. The result is stored on the instance; once the
    stored dataframe is non-empty it is returned directly without
    recalculating (and without re-reading the config).

    Parameters
    ==========
    None

    Returns
    =======
    :charge_distribution: pd.DataFrame
        Dataframe of charge values, documented as shape N x 3
        (PositiveCharge, NegativeCharge, NetCharge).
    """
    # compute only if no result is stored yet
    if self.charge_distribution.empty:
        # pH from config, with 7.4 as the fallback for a missing/empty entry or key
        config = getattr(self.desc_parameters, 'charge_distribution', {})
        ph = (config or {}).get('ph', 7.4)

        self.charge_distribution = self._calculate_descriptor_batch(
            protpy.charge_distribution,
            desc_name="Charge Distribution",
            ph=ph
        )

    return self.charge_distribution
|
|
1047
|
+
|
|
1048
|
+
def get_hydrophobic_polar_charged_composition(self) -> pd.DataFrame:
    """
    Return the Hydrophobic/Polar/Charged Composition (HPC) of the protein
    sequences, calculated with the protpy package. HPC is the percentage of
    residues belonging to each of three physicochemical groups:
    hydrophobic (A, C, F, I, L, M, V, W, Y), polar (G, N, Q, S, T) and
    charged (D, E, H, K, R).

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :hydrophobic_polar_charged_composition: pd.DataFrame
        Dataframe of HPC values, documented as shape N x 3
        (Hydrophobic, Polar, Charged).
    """
    # compute HPC composition for all sequences only if no result is stored yet
    if self.hydrophobic_polar_charged_composition.empty:
        self.hydrophobic_polar_charged_composition = self._calculate_descriptor_batch(
            protpy.hydrophobic_polar_charged_composition,
            desc_name="Hydrophobic/Polar/Charged Composition"
        )

    return self.hydrophobic_polar_charged_composition
|
|
1074
|
+
|
|
1075
|
+
def get_secondary_structure_propensity(self) -> pd.DataFrame:
    """
    Return the Secondary Structure Propensity (SSP) of the protein
    sequences, calculated with the protpy package. SSP holds the average
    Chou-Fasman propensity values for alpha-helix, beta-sheet and random
    coil conformations across all residues.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :secondary_structure_propensity: pd.DataFrame
        Dataframe of SSP values, documented as shape N x 3
        (Helix, Sheet, Coil).
    """
    # compute secondary structure propensity only if no result is stored yet
    if self.secondary_structure_propensity.empty:
        self.secondary_structure_propensity = self._calculate_descriptor_batch(
            protpy.secondary_structure_propensity,
            desc_name="Secondary Structure Propensity"
        )

    return self.secondary_structure_propensity
|
|
1100
|
+
|
|
1101
|
+
def get_kmer_composition(self) -> pd.DataFrame:
    """
    Return the k-mer Composition of the protein sequences, calculated with
    the protpy package. It is the frequency of all possible k-length
    residue subsequences, expressed as a percentage of total k-mers.

    The k-mer length is read from the 'k' key of the kmer_composition entry
    of self.desc_parameters; if that entry is missing or empty, or has no
    'k' key, 2 (dipeptide) is used. The result is stored on the instance;
    once the stored dataframe is non-empty it is returned directly without
    recalculating (and without re-reading the config).

    Parameters
    ==========
    None

    Returns
    =======
    :kmer_composition: pd.DataFrame
        Dataframe of k-mer composition values, documented as shape
        N x 20^k (e.g. N x 400 for k=2).
    """
    # compute only if no result is stored yet
    if self.kmer_composition.empty:
        # k-mer length from config, with 2 as the fallback for a missing/empty entry or key
        config = getattr(self.desc_parameters, 'kmer_composition', {})
        k = (config or {}).get('k', 2)

        self.kmer_composition = self._calculate_descriptor_batch(
            protpy.kmer_composition,
            desc_name="k-mer Composition",
            k=k
        )

    return self.kmer_composition
|
|
1131
|
+
|
|
1132
|
+
def get_reduced_alphabet_composition(self) -> pd.DataFrame:
    """
    Return the Reduced Alphabet Composition of the protein sequences,
    calculated with the protpy package. It is the amino acid composition
    after mapping residues to a reduced alphabet of physicochemical groups.
    Supported alphabet sizes: 2, 3, 4, 6.

    The alphabet size is read from the 'alphabet_size' key of the
    reduced_alphabet_composition entry of self.desc_parameters; if that
    entry is missing or empty, or has no 'alphabet_size' key, 6 is used.
    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating (and without
    re-reading the config).

    Parameters
    ==========
    None

    Returns
    =======
    :reduced_alphabet_composition: pd.DataFrame
        Dataframe of reduced composition values, documented as shape
        N x alphabet_size.
    """
    # compute only if no result is stored yet
    if self.reduced_alphabet_composition.empty:
        # alphabet size from config, with 6 groups as the fallback
        config = getattr(self.desc_parameters, 'reduced_alphabet_composition', {})
        alphabet_size = (config or {}).get('alphabet_size', 6)

        self.reduced_alphabet_composition = self._calculate_descriptor_batch(
            protpy.reduced_alphabet_composition,
            desc_name="Reduced Alphabet Composition",
            alphabet_size=alphabet_size
        )

    return self.reduced_alphabet_composition
|
|
1162
|
+
|
|
1163
|
+
def get_motif_composition(self) -> pd.DataFrame:
    """
    Return the Motif Composition of the protein sequences, calculated with
    the protpy package. It counts occurrences (including overlapping) of
    biological sequence motifs matched via regular expressions. 8 built-in
    motifs are used by default; a custom dict of name->pattern mappings can
    be supplied via config.

    Custom motifs are read from the 'motifs' key of the motif_composition
    entry of self.desc_parameters. A missing or empty entry, a missing key,
    or any falsy value (e.g. an empty list/dict) is passed on as None. The
    result is stored on the instance; once the stored dataframe is non-empty
    it is returned directly without recalculating (and without re-reading
    the config).

    Parameters
    ==========
    None

    Returns
    =======
    :motif_composition: pd.DataFrame
        Dataframe of motif counts, documented as shape N x len(motifs).
    """
    # compute only if no result is stored yet
    if self.motif_composition.empty:
        config = getattr(self.desc_parameters, 'motif_composition', {})
        # collapse every falsy value to None, which is meant to make protpy
        # fall back to its built-in default motifs
        motifs = (config.get('motifs', None) if config else None) or None

        self.motif_composition = self._calculate_descriptor_batch(
            protpy.motif_composition,
            desc_name="Motif Composition",
            motifs=motifs
        )

    return self.motif_composition
|
|
1197
|
+
|
|
1198
|
+
def get_amino_acid_pair_composition(self) -> pd.DataFrame:
    """
    Return the Amino Acid Pair Composition of the protein sequences,
    calculated with the protpy package. It is the frequency of all 400
    residue-pair combinations, with column names annotated by the
    physicochemical class of each residue.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :amino_acid_pair_composition: pd.DataFrame
        Dataframe of pair composition values, documented as shape N x 400.
    """
    # compute amino acid pair composition only if no result is stored yet
    if self.amino_acid_pair_composition.empty:
        self.amino_acid_pair_composition = self._calculate_descriptor_batch(
            protpy.amino_acid_pair_composition,
            desc_name="Amino Acid Pair Composition"
        )

    return self.amino_acid_pair_composition
|
|
1223
|
+
|
|
1224
|
+
def get_aliphatic_index(self) -> pd.DataFrame:
    """
    Return the Aliphatic Index of the protein sequences, calculated with
    the protpy package. The index measures the relative volume occupied by
    aliphatic side chains (Ala, Val, Ile, Leu); higher values indicate
    greater thermostability.

    The result is stored on the instance; once the stored dataframe is
    non-empty it is returned directly without recalculating.

    Parameters
    ==========
    None

    Returns
    =======
    :aliphatic_index: pd.DataFrame
        Dataframe of AliphaticIndex values, documented as shape N x 1.
    """
    # compute the aliphatic index for all sequences only if no result is stored yet
    if self.aliphatic_index.empty:
        self.aliphatic_index = self._calculate_descriptor_batch(
            protpy.aliphatic_index,
            desc_name="Aliphatic Index"
        )

    return self.aliphatic_index
|
|
1249
|
+
|
|
1250
|
+
def get_extinction_coefficient(self) -> pd.DataFrame:
    """
    Get the Extinction Coefficient descriptor for the protein sequences,
    computed via the protpy package. This is the molar extinction coefficient
    at 280 nm derived from the counts of Trp (W), Tyr (Y) and Cys (C)
    residues, reported for both the reduced and the oxidized state.

    Parameters
    ==========
    None

    Returns
    =======
    :extinction_coefficient: pd.DataFrame
        Dataframe of extinction coefficient values, shape N x 2
        (ExtCoeff_Reduced, ExtCoeff_Oxidized).
    """
    # only compute on first use; afterwards the stored dataframe is reused
    if self.extinction_coefficient.empty:
        self.extinction_coefficient = self._calculate_descriptor_batch(
            protpy.extinction_coefficient,
            desc_name="Extinction Coefficient",
        )
    return self.extinction_coefficient
|
|
1276
|
+
|
|
1277
|
+
def get_boman_index(self) -> pd.DataFrame:
    """
    Get the Boman Index descriptor for the protein sequences, computed via
    the protpy package. It is the sum of amino acid solubility values divided
    by the sequence length and is used to predict the potential for
    protein-protein interactions.

    Parameters
    ==========
    None

    Returns
    =======
    :boman_index: pd.DataFrame
        Dataframe of BomanIndex values, shape N x 1.
    """
    # only compute on first use; afterwards the stored dataframe is reused
    if self.boman_index.empty:
        self.boman_index = self._calculate_descriptor_batch(
            protpy.boman_index,
            desc_name="Boman Index",
        )
    return self.boman_index
|
|
1302
|
+
|
|
1303
|
+
def get_aggregation_propensity(self) -> pd.DataFrame:
    """
    Get the Aggregation Propensity descriptor for the protein sequences,
    computed via the protpy package. Aggregation-prone regions are estimated
    with a sliding window that combines Kyte-Doolittle hydrophobicity and
    charge neutrality; the output is the number of qualifying windows and the
    fraction of the sequence they cover.

    The window size and both thresholds are read from the
    "aggregation_propensity" entry of the descriptor parameters. Defaults of
    window=5, hydrophobicity_threshold=2.0 and charge_threshold=1 are used
    for anything that is not configured.

    Parameters
    ==========
    None

    Returns
    =======
    :aggregation_propensity: pd.DataFrame
        Dataframe of aggregation values, shape N x 2
        (AggregProneRegions, AggregProneFraction).
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.aggregation_propensity.empty:
        return self.aggregation_propensity

    # a missing or empty config entry falls back to the defaults below
    settings = getattr(self.desc_parameters, 'aggregation_propensity', {}) or {}
    batch_kwargs = {
        "window": settings.get('window', 5),
        "hydrophobicity_threshold": settings.get('hydrophobicity_threshold', 2.0),
        "charge_threshold": settings.get('charge_threshold', 1),
    }

    # run the descriptor over all sequences and store the result
    self.aggregation_propensity = self._calculate_descriptor_batch(
        protpy.aggregation_propensity,
        desc_name="Aggregation Propensity",
        **batch_kwargs,
    )
    return self.aggregation_propensity
|
|
1339
|
+
|
|
1340
|
+
def get_hydrophobic_moment(self) -> pd.DataFrame:
    """
    Get the Hydrophobic Moment descriptor for the protein sequences, computed
    via the protpy package. The mean and maximum hydrophobic moment are taken
    across sliding windows using the Eisenberg hydrophobicity scale and a
    helical-wheel projection, capturing the amphipathicity of putative helix
    segments.

    The window size and helical angle are read from the "hydrophobic_moment"
    entry of the descriptor parameters. Defaults of window=11 and angle=100
    are used for anything that is not configured.

    Parameters
    ==========
    None

    Returns
    =======
    :hydrophobic_moment: pd.DataFrame
        Dataframe of hydrophobic moment values, shape N x 2
        (HydrophobicMoment_Mean, HydrophobicMoment_Max).
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.hydrophobic_moment.empty:
        return self.hydrophobic_moment

    # a missing or empty config entry falls back to the defaults below
    settings = getattr(self.desc_parameters, 'hydrophobic_moment', {}) or {}
    batch_kwargs = {
        "window": settings.get('window', 11),
        "angle": settings.get('angle', 100),
    }

    # run the descriptor over all sequences and store the result
    self.hydrophobic_moment = self._calculate_descriptor_batch(
        protpy.hydrophobic_moment,
        desc_name="Hydrophobic Moment",
        **batch_kwargs,
    )
    return self.hydrophobic_moment
|
|
1374
|
+
|
|
1375
|
+
def get_shannon_entropy(self) -> pd.DataFrame:
    """
    Get the Shannon Entropy descriptor for the protein sequences, computed
    via the protpy package. Shannon entropy is an information-theoretic
    measure of amino acid diversity, H = -sum(p_i * log2(p_i)). A value of 0
    corresponds to a completely repetitive sequence, while the theoretical
    maximum of ~4.322 bits corresponds to a perfectly uniform distribution
    over all 20 canonical amino acids.

    Parameters
    ==========
    None

    Returns
    =======
    :shannon_entropy: pd.DataFrame
        Dataframe of ShannonEntropy values, shape N x 1.
    """
    # only compute on first use; afterwards the stored dataframe is reused
    if self.shannon_entropy.empty:
        self.shannon_entropy = self._calculate_descriptor_batch(
            protpy.shannon_entropy,
            desc_name="Shannon Entropy",
        )
    return self.shannon_entropy
|
|
1402
|
+
|
|
1403
|
+
def get_moreaubroto_autocorrelation(self) -> pd.DataFrame:
    """
    Get the MoreauBroto Autocorrelation (MBAuto) descriptor for the protein
    sequences, computed via the custom-built protpy package. Autocorrelation
    descriptors are a class of topological descriptors, also known as
    molecular connectivity indices, that describe the level of correlation
    between two objects (protein or peptide sequences) in terms of their
    specific structural or physicochemical properties, which are defined
    based on the distribution of amino acid properties along the sequence.

    By default, 8 amino acid properties are used for deriving the
    descriptors. The derivation and a detailed explanation of this type of
    descriptor are outlined in [4]. MBAuto uses the property values
    themselves as the basis for measurement. The number of generated features
    is lag * number of properties, and the values can be normalized through
    the "normalize" parameter in the config file. With the default 8
    properties and default lag of 30, 240 features are generated. The default
    8 properties are:

    AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales.
    AccNo. BHAR880101 - Average Flexibility Indices.
    AccNo. CHAM820101 - Polarizability Parameter.
    AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole.
    AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide.
    AccNo. BIGC670101 - Residue Volume.
    AccNo. CHAM810101 - Steric Parameter.
    AccNo. DAYM780201 - Relative Mutability.

    Parameters
    ==========
    None

    Returns
    =======
    :moreaubroto_autocorrelation: pd.DataFrame
        pandas DataFrame of MBAuto values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is lag * number of properties. By default the shape is N x 240
        (30 features per property - using 8 properties, with lag=30).
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.moreaubroto_autocorrelation.empty:
        return self.moreaubroto_autocorrelation

    # descriptor-specific settings from the config file
    config = self.desc_parameters.moreaubroto_autocorrelation
    lag = config["lag"]
    properties = config["properties"]
    normalize = config["normalize"]

    # run the descriptor over all sequences via the shared batch helper
    result = self._calculate_descriptor_batch(
        protpy.moreaubroto_autocorrelation,
        desc_name="MoreauBroto Autocorrelation",
        lag=lag,
        properties=properties,
        normalize=normalize,
    )
    self.moreaubroto_autocorrelation = result

    return self.moreaubroto_autocorrelation
|
|
1463
|
+
|
|
1464
|
+
def get_moran_autocorrelation(self) -> pd.DataFrame:
    """
    Get the Moran Autocorrelation (MAuto) descriptor for the protein
    sequences, computed via the custom-built protpy package. MAuto utilizes
    property deviations from the average values.
    **refer to MBAuto docstring for autocorrelation description.

    Parameters
    ==========
    None

    Returns
    =======
    :moran_autocorrelation: pd.DataFrame
        pandas DataFrame of MAuto values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is lag * number of properties. By default the shape is N x 240
        (30 features per property - using 8 properties, with lag=30).
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.moran_autocorrelation.empty:
        return self.moran_autocorrelation

    # descriptor-specific settings from the config file
    config = self.desc_parameters.moran_autocorrelation
    lag = config["lag"]
    properties = config["properties"]
    normalize = config["normalize"]

    # run the descriptor over all sequences via the shared batch helper
    result = self._calculate_descriptor_batch(
        protpy.moran_autocorrelation,
        desc_name="Moran Autocorrelation",
        lag=lag,
        properties=properties,
        normalize=normalize,
    )
    self.moran_autocorrelation = result

    return self.moran_autocorrelation
|
|
1503
|
+
|
|
1504
|
+
def get_geary_autocorrelation(self) -> pd.DataFrame:
    """
    Get the Geary Autocorrelation (GAuto) descriptor for the protein
    sequences, computed via the custom-built protpy package. GAuto utilizes
    the square-difference of property values instead of vector-products (of
    property values or deviations).
    **refer to MBAuto docstring for autocorrelation description.

    Parameters
    ==========
    None

    Returns
    =======
    :geary_autocorrelation: pd.DataFrame
        pandas DataFrame of GAuto values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is lag * number of properties. By default the shape is N x 240
        (30 features per property - using 8 properties, with lag=30).
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.geary_autocorrelation.empty:
        return self.geary_autocorrelation

    # descriptor-specific settings from the config file
    config = self.desc_parameters.geary_autocorrelation
    lag = config["lag"]
    properties = config["properties"]
    normalize = config["normalize"]

    # run the descriptor over all sequences via the shared batch helper
    result = self._calculate_descriptor_batch(
        protpy.geary_autocorrelation,
        desc_name="Geary Autocorrelation",
        lag=lag,
        properties=properties,
        normalize=normalize,
    )
    self.geary_autocorrelation = result

    return self.geary_autocorrelation
|
|
1544
|
+
|
|
1545
|
+
def get_ctd_composition(self) -> pd.DataFrame:
    """
    Get the Composition (C_CTD) physicochemical/structural descriptor of the
    protein sequences, taken from the calculated CTD descriptor. Composition
    is the number of amino acids of a particular property divided by the
    total number of amino acids.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_composition: pd.DataFrame
        pandas DataFrame of C_CTD values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is (number of physicochemical properties * 3), 3 features being
        calculated per property. By default the "hydrophobicity" property is
        used, generating an output of N x 3.
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.ctd_composition.empty:
        return self.ctd_composition

    # the composition columns are sliced out of the full CTD dataframe
    if self.ctd.empty:
        self.ctd = self.get_ctd()

    # properties may be configured as a list or a comma-separated string
    props = self.desc_parameters.ctd["property"]
    if not isinstance(props, list):
        props = props.split(',')
    use_all_props = self.desc_parameters.ctd["all"]

    # composition columns come first: 21 when every property is used,
    # otherwise 3 per configured property
    width = 21 if use_all_props else 3 * len(props)
    self.ctd_composition = self.ctd.iloc[:, 0:width]

    return self.ctd_composition
|
|
1592
|
+
|
|
1593
|
+
def get_ctd_transition(self) -> pd.DataFrame:
    """
    Get the Transition (T_CTD) physicochemical/structural descriptor of the
    protein sequences, taken from the calculated CTD descriptor. Transition
    is the number of transitions from a particular property to a different
    property divided by (total number of amino acids - 1).

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_transition: pd.DataFrame
        pandas DataFrame of T_CTD values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is (number of physicochemical properties * 3), 3 features being
        calculated per property. By default the "hydrophobicity" property is
        used, generating an output of N x 3.
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.ctd_transition.empty:
        return self.ctd_transition

    # the transition columns are sliced out of the full CTD dataframe
    if self.ctd.empty:
        self.ctd = self.get_ctd()

    # properties may be configured as a list or a comma-separated string
    props = self.desc_parameters.ctd["property"]
    if not isinstance(props, list):
        props = props.split(',')
    use_all_props = self.desc_parameters.ctd["all"]

    # transition columns directly follow the composition columns and occupy
    # the same width: 21 when every property is used, else 3 per property
    width = 21 if use_all_props else 3 * len(props)
    self.ctd_transition = self.ctd.iloc[:, width:width * 2]

    return self.ctd_transition
|
|
1640
|
+
|
|
1641
|
+
def get_ctd_distribution(self) -> pd.DataFrame:
    """
    Get the Distribution (D_CTD) physicochemical/structural descriptor of the
    protein sequences, taken from the calculated CTD descriptor. Distribution
    is the chain length within which the first, 25%, 50%, 75% and 100% of the
    amino acids of a particular property are located.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_distribution: pd.DataFrame
        pandas DataFrame of D_CTD values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and M
        is (number of physicochemical properties * 15), 15 features being
        calculated per property. By default the "hydrophobicity" property is
        used, generating an output of N x 15.
    """
    # reuse the stored dataframe when the descriptor was computed earlier
    if not self.ctd_distribution.empty:
        return self.ctd_distribution

    # the distribution columns are sliced out of the full CTD dataframe
    if self.ctd.empty:
        self.ctd = self.get_ctd()

    # properties may be configured as a list or a comma-separated string
    props = self.desc_parameters.ctd["property"]
    if not isinstance(props, list):
        props = props.split(',')
    use_all_props = self.desc_parameters.ctd["all"]

    # distribution columns are everything after the composition and
    # transition columns: offset 42 when every property is used, otherwise
    # 2 * (3 per configured property)
    offset = 42 if use_all_props else 2 * (3 * len(props))
    self.ctd_distribution = self.ctd.iloc[:, offset:]

    return self.ctd_distribution
|
|
1688
|
+
|
|
1689
|
+
def get_ctd(self) -> pd.DataFrame:
    """
    Calculate all CTD (Composition, Transition, Distribution)
    physicochemical/structural descriptors of the protein sequences using
    the custom-built protpy package.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd: pd.DataFrame
        pandas DataFrame of CTD values for the protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and
        M is (number of physicochemical properties * 21), with 21 being the
        number of features calculated for each of the CTD descriptors per
        property. Using all properties generates an output of N x 147; by
        default the "hydrophobicity" property is used, generating an output
        of N x 21. An empty dataframe is returned when there are no
        sequences.
    """
    #if attribute already calculated & not empty then return it
    if not self.ctd.empty:
        return self.ctd

    #get descriptor-specific parameters from config file
    ctd_property = self.desc_parameters.ctd["property"]
    all_ctd = self.desc_parameters.ctd["all"]

    #calculate descriptor values per sequence, collecting the per-sequence
    #frames so they can be concatenated once rather than on every iteration
    ctd_frames = [
        protpy.ctd_(seq, property=ctd_property, all_ctd=all_ctd)
        for seq in self.protein_seqs
    ]

    #pd.concat raises on an empty list, so keep the empty-dataframe result
    #for the no-sequences case
    if ctd_frames:
        self.ctd = pd.concat(ctd_frames).reset_index(drop=True)
    else:
        self.ctd = pd.DataFrame()

    return self.ctd
|
|
1729
|
+
|
|
1730
|
+
def get_conjoint_triad(self) -> pd.DataFrame:
    """
    Calculate Conjoint Triad (CTriad) of the protein sequences using the
    custom-built protpy package. The descriptor mainly considers neighbour
    relationships in protein sequences by encoding each protein sequence
    using the triad (continuous three amino acids) frequency distribution
    extracted from a 7-letter reduced alphabet [11]. CTriad calculates 343
    different features (7x7x7), with the output being of shape N x 343 where
    N is the number of sequences.

    Parameters
    ==========
    None

    Returns
    =======
    :conjoint_triad: pd.DataFrame
        pandas DataFrame of CTriad descriptor values for all protein
        sequences. Dataframe is of shape N x 343, where N is the number of
        protein sequences and 343 is the number of features calculated from
        the descriptor for a sequence. An empty dataframe is returned when
        there are no sequences.
    """
    #if attribute already calculated & not empty then return it
    if not self.conjoint_triad.empty:
        return self.conjoint_triad

    #calculate descriptor values per sequence, collecting the per-sequence
    #frames so they can be concatenated once rather than on every iteration
    triad_frames = [protpy.conjoint_triad(seq) for seq in self.protein_seqs]

    #pd.concat raises on an empty list, so keep the empty-dataframe result
    #for the no-sequences case
    if triad_frames:
        self.conjoint_triad = pd.concat(triad_frames).reset_index(drop=True)
    else:
        self.conjoint_triad = pd.DataFrame()

    return self.conjoint_triad
|
|
1765
|
+
|
|
1766
|
+
def get_sequence_order_coupling_number(self) -> pd.DataFrame:
    """
    Calculate Sequence Order Coupling Number (SOCN) features for the protein
    sequences using the custom-built protpy package. SOCN computes the
    dissimilarity between amino acid pairs. The distance between amino acid
    pairs is determined by d, which varies between 1 and lag. For each d, it
    computes the sum of the dissimilarities of all amino acid pairs. The
    number of output features can be calculated as N * 2, where N = lag; by
    default this value is 30, which generates an output of M x 60 where M is
    the number of protein sequences.

    When no distance matrix is set in the config (empty string or None), the
    descriptor is calculated using both matrices via
    protpy.sequence_order_coupling_number_all; otherwise only the configured
    matrix is used.

    Parameters
    ==========
    None

    Returns
    =======
    :sequence_order_coupling_number: pd.DataFrame
        DataFrame of SOCN descriptor values for all protein sequences. Output
        is of shape N x M, where N is the number of protein sequences and
        M is the number of features calculated from the descriptor
        (calculated as N * 2 where N = lag). An empty dataframe is returned
        when there are no sequences.
    """
    #if attribute already calculated & not empty then return it
    if not self.sequence_order_coupling_number.empty:
        return self.sequence_order_coupling_number

    #get descriptor-specific parameters from config file
    lag = self.desc_parameters.sequence_order_coupling_number["lag"]
    distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"]

    #the matrix choice is the same for every sequence, so decide it once
    use_all_matrices = distance_matrix is None or distance_matrix == ""

    #calculate descriptor values per sequence, collecting the per-sequence
    #frames so they can be concatenated once rather than on every iteration
    socn_frames = []
    for seq in self.protein_seqs:
        if use_all_matrices:
            socn_seq = protpy.sequence_order_coupling_number_all(seq, lag=lag)
        else:
            socn_seq = protpy.sequence_order_coupling_number(
                seq, lag=lag, distance_matrix=distance_matrix)
        socn_frames.append(socn_seq)

    #pd.concat raises on an empty list, so keep the empty-dataframe result
    #for the no-sequences case
    if socn_frames:
        self.sequence_order_coupling_number = pd.concat(socn_frames).reset_index(drop=True)
    else:
        self.sequence_order_coupling_number = pd.DataFrame()

    return self.sequence_order_coupling_number
|
|
1813
|
+
|
|
1814
|
+
def get_quasi_sequence_order(self) -> pd.DataFrame:
    """
    Calculate Quasi Sequence Order features for the protein sequences using
    the custom-built protpy package. The quasi-sequence-order descriptors
    were proposed by K.C. Chou, et.al. [10]. They are derived from the
    distance matrix between the 20 amino acids. By default, the
    Schneider-Wrede physicochemical distance matrix is used. Also utilised
    in the descriptor calculation is the Grantham chemical distance matrix.
    Both of these matrices are used by Grantham et. al. in the calculation
    of the descriptor [13]. 100 values are calculated per sequence, thus
    generating an output of N x 100, where N is the number of protein
    sequences.

    When no distance matrix is set in the config (empty string or None), the
    descriptor is calculated using both matrices via
    protpy.quasi_sequence_order_all; otherwise only the configured matrix is
    used.

    Parameters
    ==========
    None

    Returns
    =======
    :quasi_sequence_order: pd.DataFrame
        DataFrame of quasi-sequence-order descriptor values for the protein
        sequences, with output shape N x 100 where N is the number of
        sequences and 100 the number of calculated features. An empty
        dataframe is returned when there are no sequences.
    """
    #if attribute already calculated & not empty then return it
    if not self.quasi_sequence_order.empty:
        return self.quasi_sequence_order

    #get descriptor-specific parameters from config file
    lag = self.desc_parameters.quasi_sequence_order["lag"]
    weight = self.desc_parameters.quasi_sequence_order["weight"]
    distance_matrix = self.desc_parameters.quasi_sequence_order["distance_matrix"]

    #the matrix choice is the same for every sequence, so decide it once
    use_all_matrices = distance_matrix is None or distance_matrix == ""

    #calculate descriptor values per sequence, collecting the per-sequence
    #frames so they can be concatenated once rather than on every iteration
    qso_frames = []
    for seq in self.protein_seqs:
        if use_all_matrices:
            qso_seq = protpy.quasi_sequence_order_all(seq, lag=lag, weight=weight)
        else:
            qso_seq = protpy.quasi_sequence_order(
                seq, lag=lag, weight=weight, distance_matrix=distance_matrix)
        qso_frames.append(qso_seq)

    #pd.concat raises on an empty list, so keep the empty-dataframe result
    #for the no-sequences case
    if qso_frames:
        self.quasi_sequence_order = pd.concat(qso_frames).reset_index(drop=True)
    else:
        self.quasi_sequence_order = pd.DataFrame()

    return self.quasi_sequence_order
|
|
1864
|
+
|
|
1865
|
+
def get_pseudo_amino_acid_composition(self) -> pd.DataFrame:
    """
    Calculate Pseudo Amino Acid Composition (PAAComp) descriptor using the
    custom-built protpy package. PAAComp combines the vanilla amino acid
    composition descriptor with additional local features, such as
    correlation between residues of a certain distance, as amino acid
    composition doesn't take into account sequence order info. The pseudo
    components of the descriptor are a series of rank-different correlation
    factors [10]. The first 20 components are a weighted sum of the amino
    acid composition and 30 are physicochemical square correlations as
    dictated by the lambda and properties parameters. This generates an
    output of [(20 + λ), 1] = 50 x 1 when using the default lambda of 30. By
    default, the physicochemical properties used are hydrophobicity and
    hydrophilicity, with a lambda of 30 and weight of 0.05.

    Parameters
    ==========
    None

    Returns
    =======
    :pseudo_amino_acid_composition: pd.DataFrame
        DataFrame of pseudo amino acid composition descriptor values for the
        protein sequences, of output shape N x (20 + λ), where N is the
        number of protein sequences. With the default lambda of 30, the
        output shape is N x 50. An empty dataframe is returned when there
        are no sequences.
    """
    #if attribute already calculated & not empty then return it
    if not self.pseudo_amino_acid_composition.empty:
        return self.pseudo_amino_acid_composition

    #get descriptor-specific parameters from config file
    lamda = self.desc_parameters.pseudo_amino_acid_composition["lambda"]
    weight = self.desc_parameters.pseudo_amino_acid_composition["weight"]
    properties = self.desc_parameters.pseudo_amino_acid_composition["properties"]

    #calculate descriptor values per sequence, collecting the per-sequence
    #frames so they can be concatenated once rather than on every iteration;
    #tqdm loop to visualise progress as descriptor can take some time to execute
    paacomp_frames = [
        protpy.pseudo_amino_acid_composition(seq, lamda=lamda,
            weight=weight, properties=properties)
        for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
            desc="PAAComp", mininterval=30, ncols=90)
    ]

    #pd.concat raises on an empty list, so keep the empty-dataframe result
    #for the no-sequences case
    if paacomp_frames:
        self.pseudo_amino_acid_composition = pd.concat(paacomp_frames).reset_index(drop=True)
    else:
        self.pseudo_amino_acid_composition = pd.DataFrame()

    return self.pseudo_amino_acid_composition
|
|
1912
|
+
|
|
1913
|
+
def get_amphiphilic_pseudo_amino_acid_composition(self) -> pd.DataFrame:
    """
    Calculate Amphiphilic Pseudo Amino Acid Composition (APAAComp) of protein sequences
    using custom-built protpy package. APAAComp has the same form as the amino acid
    composition, but contains much more information that is related to the sequence
    order of a protein and the distribution of the hydrophobic and hydrophilic amino
    acids along its chain. The first 20 numbers in the descriptor are the components
    of the conventional amino acid composition; the next 2*lambda numbers are a set of
    correlation factors that reflect different hydrophobicity and hydrophilicity
    distribution patterns along a protein chain.

    The result is cached on the instance: if the
    amphiphilic_pseudo_amino_acid_composition attribute is already a non-empty
    dataframe it is returned as-is and nothing is recalculated.

    Parameters
    ==========
    None

    Returns
    =======
    :amphiphilic_pseudo_amino_acid_composition_df: pd.Dataframe
        Dataframe of Amphiphilic pseudo amino acid composition descriptor values for
        the protein sequences of output shape N x 80, where N is the number of
        protein sequences and 80 is calculated as (20 + 2*lambda).
    """
    #if attribute already calculated & not empty then return it
    if not self.amphiphilic_pseudo_amino_acid_composition.empty:
        return self.amphiphilic_pseudo_amino_acid_composition

    #get descriptor-specific parameters from config file
    lamda = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["lambda"]
    weight = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["weight"]

    #calculate descriptor value for each sequence, collecting the per-sequence
    #dataframes in a list so that only one concat is needed (concatenating inside
    #the loop copies the growing dataframe on every iteration); tqdm loop to
    #visualise progress as descriptor can take some time to execute
    per_sequence_dfs = [
        protpy.amphiphilic_pseudo_amino_acid_composition(seq, lamda=lamda, weight=weight)
        for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
                        desc="APAAComp", mininterval=30, ncols=90)
    ]

    #pd.concat raises on an empty list, fall back to an empty dataframe when
    #there are no sequences to process
    if per_sequence_dfs:
        amphiphilic_pseudo_amino_acid_composition_df = pd.concat(per_sequence_dfs)
    else:
        amphiphilic_pseudo_amino_acid_composition_df = pd.DataFrame()

    self.amphiphilic_pseudo_amino_acid_composition = amphiphilic_pseudo_amino_acid_composition_df.reset_index(drop=True)

    return self.amphiphilic_pseudo_amino_acid_composition
|
|
1958
|
+
|
|
1959
|
+
def get_all_descriptors(self, export: bool = False, descriptors_export_filename: str = "") -> pd.DataFrame:
    """
    Calculate all individual descriptor values, concatenating each descriptor
    Dataframe into one storing all descriptors. The number of descriptor
    features calculated is dependant on several additional meta parameters of
    some descriptors, including the number of properties and max lag for the
    Autocorrelation, SOCN and QSO and the number of properties and lamda for
    PAAComp and the lambda for APAAComp.

    To export all descriptors to a csv set export=True when calling the function,
    this saves having to recalculate all the descriptor values when using them
    in multiple encoding processes, and the descriptors can be imported using the
    import_descriptors function. By default, the function will save the output
    csv to the value at the "descriptors_csv" parameter in the config file,
    although the name for this exported csv can be set by the
    descriptors_export_filename input parameter.

    Parameters
    ==========
    :export: bool (default=False)
        if true then all calculated descriptors from the protpy package will be
        exported to a CSV. This allows for pre-calculated descriptors for a
        dataset to be easily imported and not have to be recalculated again.
    :descriptors_export_filename: str
        filepath/filename for the exported csv of all the calculated descriptor
        values if input parameter export=True. A ".csv" extension is appended
        when the filename has no extension.

    Returns
    =======
    :all_descriptor_df: pd.DataFrame
        concatenated dataframe of all individual descriptors. Using the default
        attributes and their associated values, the output will be of the shape
        N x 10572, where N is the number of protein sequences and 10572 is the
        number of descriptor features.
    """
    #imported locally so the function does not depend on os being re-exported
    #through the wildcard import from .utils
    import os

    print('############################### Exporting all descriptors ################################\n')

    #start time counter
    start = time.time()

    #descriptors calculated by this function, each has an attribute of the same
    #name and a get_<name>() method that calculates it
    calculated_descriptors = (
        "amino_acid_composition", "dipeptide_composition", "tripeptide_composition",
        "moreaubroto_autocorrelation", "moran_autocorrelation", "geary_autocorrelation",
        "ctd", "ctd_composition", "ctd_transition", "ctd_distribution",
        "conjoint_triad", "sequence_order_coupling_number", "quasi_sequence_order",
        "pseudo_amino_acid_composition", "amphiphilic_pseudo_amino_acid_composition"
    )

    #iterate over all descriptors, calculating each using their respective function and the protpy package
    for descr in tqdm(self.all_descriptors_list(), unit=" descriptor", position=0,
                      desc="Descriptors", mininterval=30, ncols=90):

        #if descriptor attribute DF is empty then call its respective get_descriptor function
        if descr in calculated_descriptors and getattr(self, descr).empty:
            setattr(self, descr, getattr(self, "get_" + descr)())

    #stop time counter, calculate elapsed time
    end = time.time()
    elapsed = end - start

    print(f'\nElapsed time for calculating all descriptors: {elapsed/60:.2f} minutes.')
    print('\n##########################################################################################')

    #append all calculated descriptors to list, the combined ctd attribute is left
    #out as its composition, transition and distribution parts are added individually
    all_desc = [
        self.amino_acid_composition, self.dipeptide_composition, self.tripeptide_composition,
        self.moreaubroto_autocorrelation, self.moran_autocorrelation,
        self.geary_autocorrelation, self.ctd_composition, self.ctd_transition,
        self.ctd_distribution, self.conjoint_triad, self.sequence_order_coupling_number,
        self.quasi_sequence_order, self.pseudo_amino_acid_composition, self.amphiphilic_pseudo_amino_acid_composition
    ]

    #concatenate individual descriptor dataframe attributes
    all_descriptor_df = pd.concat(all_desc, axis=1)
    self.all_descriptors = all_descriptor_df

    #export pre-calculated descriptor values to a csv, use default name if parameter empty
    if export:
        if descriptors_export_filename == "":
            if self.desc_config.descriptors_csv is None or self.desc_config.descriptors_csv == "":
                self.desc_config.descriptors_csv = "descriptors_output.csv"
            self.all_descriptors.to_csv(self.desc_config.descriptors_csv, index=False)
        else:
            #append extension if not present on filename - export to csv
            if os.path.splitext(os.path.basename(descriptors_export_filename))[1] == "":
                descriptors_export_filename = descriptors_export_filename + ".csv"
            self.all_descriptors.to_csv(descriptors_export_filename, index=False)

    return all_descriptor_df
|
|
2082
|
+
|
|
2083
|
+
def get_descriptor_encoding(self, descriptor: str) -> Optional[pd.DataFrame]:
    """
    Get the protein descriptor values of a specified input descriptor. If the
    sought descriptor has already been calculated then its attribute is returned,
    else the descriptor is calculated using its get_descriptor function.

    Parameters
    ==========
    :descriptor: str
        name of descriptor to return. Method can accept the approximate name
        of the descriptor, e.g. 'amino_comp'/'aa_composition' etc will return
        the 'amino_acid_composition' descriptor. This functionality is realised
        using the difflib library and its built-in get_close_matches function.

    Returns
    =======
    :desc_encoding: pd.DataFrame or None
        dataframe of matching descriptor attribute. None returned if the closest
        match is not one of the descriptors handled by this function.

    Raises
    ======
    :TypeError
        if the input descriptor is not a string.
    :ValueError
        if no close match for the input descriptor is found in the
        valid_descriptors attribute.
    """
    #input descriptor parameter should be a string
    if not isinstance(descriptor, str):
        raise TypeError('Input parameter {} is not of correct datatype string, got {}.'.
            format(descriptor, type(descriptor)))

    #remove any whitespace from input parameter, replace spaces with underscores and lowercase
    descriptor = descriptor.strip().replace(' ', '_').lower()

    #validate input descriptor is a valid available descriptor, get its closest match
    desc_matches = get_close_matches(descriptor, self.valid_descriptors, cutoff=0.6)
    if not desc_matches:
        raise ValueError(f"Could not find a match for the input descriptor {descriptor} in"
            f" list of available valid descriptors:\n{self.valid_descriptors}.")
    desc = desc_matches[0] #set desc to closest descriptor match found

    #descriptors handled by this function, each has an attribute of the same name
    #and a get_<name>() method that calculates and stores it
    supported_descriptors = (
        'amino_acid_composition', 'dipeptide_composition', 'tripeptide_composition',
        'gravy', 'aromaticity', 'instability_index', 'isoelectric_point',
        'molecular_weight', 'charge_distribution', 'hydrophobic_polar_charged_composition',
        'secondary_structure_propensity', 'kmer_composition', 'reduced_alphabet_composition',
        'motif_composition', 'amino_acid_pair_composition', 'aliphatic_index',
        'extinction_coefficient', 'boman_index', 'aggregation_propensity',
        'hydrophobic_moment', 'shannon_entropy', 'moreaubroto_autocorrelation',
        'moran_autocorrelation', 'geary_autocorrelation', 'ctd', 'ctd_composition',
        'ctd_transition', 'ctd_distribution', 'conjoint_triad',
        'sequence_order_coupling_number', 'quasi_sequence_order',
        'pseudo_amino_acid_composition', 'amphiphilic_pseudo_amino_acid_composition'
    )

    #no matching descriptor found
    if desc not in supported_descriptors:
        return None

    #if sought descriptor attribute dataframe is empty, call the descriptor's
    #   get_descriptor() function, then return the descriptor attribute
    if getattr(self, desc).empty:
        getattr(self, "get_" + desc)()

    return getattr(self, desc)
|
|
2289
|
+
|
|
2290
|
+
def all_descriptors_list(self, desc_combo: int = 1) -> Union[List[str], List[Tuple[str, ...]]]:
    """
    Return the names of the available descriptor attributes, or every
    combination of 2 or 3 of those names. The names are taken from the
    instance attributes whose name starts with an underscore, skipping any
    that start with '_all_desc', with the leading underscore removed. With
    there being 33 descriptors, 528 and 5456 combinations of 2 and 3
    descriptors will be returned if desc_combo=2 or desc_combo=3, respectively.

    Parameters
    ==========
    :desc_combo: int (default=1)
        size of the descriptor combinations to return. A value of 2 or 3
        returns all combinations of 2 or 3 descriptor names, any other value
        returns the plain list of descriptor names.

    Returns
    =======
    :all_descriptors: List[str] or List[Tuple[str, ...]]
        list of available descriptor attributes, or list of tuples of descriptor combinations.
    """
    #collect the descriptor attribute names, dropping their leading underscore
    names = []
    for attr in self.__dict__.keys():
        if not attr.startswith('_') or attr.startswith('_all_desc'):
            continue
        names.append(attr[1:])

    #pairs or triples of descriptors were requested
    if desc_combo == 2:
        return list(itertools.combinations(names, 2))
    if desc_combo == 3:
        return list(itertools.combinations(names, 3))

    #any other value falls back to the plain list of names
    return names
|
|
2323
|
+
|
|
2324
|
+
def _calculate_descriptor_batch(self,
                                descriptor_func: Callable,
                                desc_name: str = "",
                                **kwargs) -> pd.DataFrame:
    """
    Generic helper method to calculate descriptors for all sequences, preventing code repetition.

    Parameters
    ==========
    :descriptor_func: Callable
        Function to calculate descriptor (e.g., protpy.amino_acid_composition).
        It is called once per sequence as descriptor_func(seq, **kwargs) and its
        results are concatenated with pd.concat.
    :desc_name: str
        Name of descriptor for progress tracking. A tqdm progress bar is only
        shown when this is non-empty.
    :kwargs: dict
        Additional keyword arguments to pass to descriptor function

    Returns
    =======
    :pd.DataFrame
        Dataframe with calculated descriptor values for all sequences, with its
        index reset. An empty dataframe is returned when there are no sequences.
    """
    iterator = tqdm(self.protein_seqs, desc=f"Computing {desc_name}") if desc_name else self.protein_seqs

    # accumulate results in a list to avoid O(n²) repeated concat
    desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]

    # pd.concat raises ValueError on an empty list, so return an empty
    # dataframe when there were no sequences to process
    if not desc_list:
        return pd.DataFrame()

    return pd.concat(desc_list, ignore_index=False).reset_index(drop=True)
|
|
2351
|
+
|
|
2352
|
+
###################### Getters & Setters ######################
|
|
2353
|
+
|
|
2354
|
+
@property
def all_desc(self):
    """ Return the value held in the private _all_desc attribute """
    return self._all_desc

@all_desc.setter
def all_desc(self, val):
    """ Store val in the private _all_desc attribute """
    self._all_desc = val
|
|
2361
|
+
|
|
2362
|
+
@property
def amino_acid_composition(self):
    """ Return the value held in the private _amino_acid_composition attribute """
    return self._amino_acid_composition

@amino_acid_composition.setter
def amino_acid_composition(self, val):
    """ Store val in the private _amino_acid_composition attribute """
    self._amino_acid_composition = val
|
|
2369
|
+
|
|
2370
|
+
@property
def dipeptide_composition(self):
    """ Return the value held in the private _dipeptide_composition attribute """
    return self._dipeptide_composition

@dipeptide_composition.setter
def dipeptide_composition(self, val):
    """ Store val in the private _dipeptide_composition attribute """
    self._dipeptide_composition = val
|
|
2377
|
+
|
|
2378
|
+
@property
def tripeptide_composition(self):
    """ Return the value held in the private _tripeptide_composition attribute """
    return self._tripeptide_composition

@tripeptide_composition.setter
def tripeptide_composition(self, val):
    """ Store val in the private _tripeptide_composition attribute """
    self._tripeptide_composition = val
|
|
2385
|
+
|
|
2386
|
+
@property
def gravy(self):
    """ Return the value held in the private _gravy attribute """
    return self._gravy

@gravy.setter
def gravy(self, val):
    """ Store val in the private _gravy attribute """
    self._gravy = val
|
|
2393
|
+
|
|
2394
|
+
@property
def aromaticity(self):
    """ Return the value held in the private _aromaticity attribute """
    return self._aromaticity

@aromaticity.setter
def aromaticity(self, val):
    """ Store val in the private _aromaticity attribute """
    self._aromaticity = val
|
|
2401
|
+
|
|
2402
|
+
@property
def instability_index(self):
    """ Return the value held in the private _instability_index attribute """
    return self._instability_index

@instability_index.setter
def instability_index(self, val):
    """ Store val in the private _instability_index attribute """
    self._instability_index = val
|
|
2409
|
+
|
|
2410
|
+
@property
def isoelectric_point(self):
    """ Return the value held in the private _isoelectric_point attribute """
    return self._isoelectric_point

@isoelectric_point.setter
def isoelectric_point(self, val):
    """ Store val in the private _isoelectric_point attribute """
    self._isoelectric_point = val
|
|
2417
|
+
|
|
2418
|
+
@property
def molecular_weight(self):
    """ Return the value held in the private _molecular_weight attribute """
    return self._molecular_weight

@molecular_weight.setter
def molecular_weight(self, val):
    """ Store val in the private _molecular_weight attribute """
    self._molecular_weight = val
|
|
2425
|
+
|
|
2426
|
+
@property
def charge_distribution(self):
    """ Return the value held in the private _charge_distribution attribute """
    return self._charge_distribution

@charge_distribution.setter
def charge_distribution(self, val):
    """ Store val in the private _charge_distribution attribute """
    self._charge_distribution = val
|
|
2433
|
+
|
|
2434
|
+
@property
def hydrophobic_polar_charged_composition(self):
    """ Return the value held in the private _hydrophobic_polar_charged_composition attribute """
    return self._hydrophobic_polar_charged_composition

@hydrophobic_polar_charged_composition.setter
def hydrophobic_polar_charged_composition(self, val):
    """ Store val in the private _hydrophobic_polar_charged_composition attribute """
    self._hydrophobic_polar_charged_composition = val
|
|
2441
|
+
|
|
2442
|
+
@property
def secondary_structure_propensity(self):
    """ Return the value held in the private _secondary_structure_propensity attribute """
    return self._secondary_structure_propensity

@secondary_structure_propensity.setter
def secondary_structure_propensity(self, val):
    """ Store val in the private _secondary_structure_propensity attribute """
    self._secondary_structure_propensity = val
|
|
2449
|
+
|
|
2450
|
+
@property
def kmer_composition(self):
    """ Return the value held in the private _kmer_composition attribute """
    return self._kmer_composition

@kmer_composition.setter
def kmer_composition(self, val):
    """ Store val in the private _kmer_composition attribute """
    self._kmer_composition = val
|
|
2457
|
+
|
|
2458
|
+
@property
def reduced_alphabet_composition(self):
    """ Return the value held in the private _reduced_alphabet_composition attribute """
    return self._reduced_alphabet_composition

@reduced_alphabet_composition.setter
def reduced_alphabet_composition(self, val):
    """ Store val in the private _reduced_alphabet_composition attribute """
    self._reduced_alphabet_composition = val
|
|
2465
|
+
|
|
2466
|
+
@property
def motif_composition(self):
    """ Return the value held in the private _motif_composition attribute """
    return self._motif_composition

@motif_composition.setter
def motif_composition(self, val):
    """ Store val in the private _motif_composition attribute """
    self._motif_composition = val
|
|
2473
|
+
|
|
2474
|
+
@property
def amino_acid_pair_composition(self):
    """ Return the value held in the private _amino_acid_pair_composition attribute """
    return self._amino_acid_pair_composition

@amino_acid_pair_composition.setter
def amino_acid_pair_composition(self, val):
    """ Store val in the private _amino_acid_pair_composition attribute """
    self._amino_acid_pair_composition = val
|
|
2481
|
+
|
|
2482
|
+
@property
def aliphatic_index(self):
    """ Return the value held in the private _aliphatic_index attribute """
    return self._aliphatic_index

@aliphatic_index.setter
def aliphatic_index(self, val):
    """ Store val in the private _aliphatic_index attribute """
    self._aliphatic_index = val
|
|
2489
|
+
|
|
2490
|
+
@property
def extinction_coefficient(self):
    """ Return the value held in the private _extinction_coefficient attribute """
    return self._extinction_coefficient

@extinction_coefficient.setter
def extinction_coefficient(self, val):
    """ Store val in the private _extinction_coefficient attribute """
    self._extinction_coefficient = val
|
|
2497
|
+
|
|
2498
|
+
@property
def boman_index(self):
    """ Return the value held in the private _boman_index attribute """
    return self._boman_index

@boman_index.setter
def boman_index(self, val):
    """ Store val in the private _boman_index attribute """
    self._boman_index = val
|
|
2505
|
+
|
|
2506
|
+
@property
def aggregation_propensity(self):
    """ Return the value held in the private _aggregation_propensity attribute """
    return self._aggregation_propensity

@aggregation_propensity.setter
def aggregation_propensity(self, val):
    """ Store val in the private _aggregation_propensity attribute """
    self._aggregation_propensity = val
|
|
2513
|
+
|
|
2514
|
+
@property
def hydrophobic_moment(self):
    """ Return the value held in the private _hydrophobic_moment attribute """
    return self._hydrophobic_moment

@hydrophobic_moment.setter
def hydrophobic_moment(self, val):
    """ Store val in the private _hydrophobic_moment attribute """
    self._hydrophobic_moment = val
|
|
2521
|
+
|
|
2522
|
+
@property
def shannon_entropy(self):
    """ Return the value held in the private _shannon_entropy attribute """
    return self._shannon_entropy

@shannon_entropy.setter
def shannon_entropy(self, val):
    """ Store val in the private _shannon_entropy attribute """
    self._shannon_entropy = val
|
|
2529
|
+
|
|
2530
|
+
@property
def moreaubroto_autocorrelation(self):
    """ Return the value held in the private _moreaubroto_autocorrelation attribute """
    return self._moreaubroto_autocorrelation

@moreaubroto_autocorrelation.setter
def moreaubroto_autocorrelation(self, val):
    """ Store val in the private _moreaubroto_autocorrelation attribute """
    self._moreaubroto_autocorrelation = val
|
|
2537
|
+
|
|
2538
|
+
@property
def moran_autocorrelation(self):
    """ Return the value held in the private _moran_autocorrelation attribute """
    return self._moran_autocorrelation

@moran_autocorrelation.setter
def moran_autocorrelation(self, val):
    """ Store val in the private _moran_autocorrelation attribute """
    self._moran_autocorrelation = val
|
|
2545
|
+
|
|
2546
|
+
@property
def geary_autocorrelation(self):
    """ Return the value held in the private _geary_autocorrelation attribute """
    return self._geary_autocorrelation

@geary_autocorrelation.setter
def geary_autocorrelation(self, val):
    """ Store val in the private _geary_autocorrelation attribute """
    self._geary_autocorrelation = val
|
|
2553
|
+
|
|
2554
|
+
@property
def ctd(self):
    """ Return the value held in the private _ctd attribute """
    return self._ctd

@ctd.setter
def ctd(self, val):
    """ Store val in the private _ctd attribute """
    self._ctd = val
|
|
2561
|
+
|
|
2562
|
+
@property
def ctd_composition(self):
    """ Return the value held in the private _ctd_composition attribute """
    return self._ctd_composition

@ctd_composition.setter
def ctd_composition(self, val):
    """ Store val in the private _ctd_composition attribute """
    self._ctd_composition = val
|
|
2569
|
+
|
|
2570
|
+
@property
def ctd_transition(self):
    """ Return the value held in the private _ctd_transition attribute """
    return self._ctd_transition

@ctd_transition.setter
def ctd_transition(self, val):
    """ Store val in the private _ctd_transition attribute """
    self._ctd_transition = val
|
|
2577
|
+
|
|
2578
|
+
@property
def ctd_distribution(self):
    """ Return the value held in the private _ctd_distribution attribute """
    return self._ctd_distribution

@ctd_distribution.setter
def ctd_distribution(self, val):
    """ Store val in the private _ctd_distribution attribute """
    self._ctd_distribution = val
|
|
2585
|
+
|
|
2586
|
+
@property
def conjoint_triad(self):
    """ Return the value held in the private _conjoint_triad attribute """
    return self._conjoint_triad

@conjoint_triad.setter
def conjoint_triad(self, val):
    """ Store val in the private _conjoint_triad attribute """
    self._conjoint_triad = val
|
|
2593
|
+
|
|
2594
|
+
@property
def sequence_order_coupling_number(self):
    """ Return the value held in the private _sequence_order_coupling_number attribute """
    return self._sequence_order_coupling_number

@sequence_order_coupling_number.setter
def sequence_order_coupling_number(self, val):
    """ Store val in the private _sequence_order_coupling_number attribute """
    self._sequence_order_coupling_number = val
|
|
2601
|
+
|
|
2602
|
+
@property
def quasi_sequence_order(self):
    """ Return the value held in the private _quasi_sequence_order attribute """
    return self._quasi_sequence_order

@quasi_sequence_order.setter
def quasi_sequence_order(self, val):
    """ Store val in the private _quasi_sequence_order attribute """
    self._quasi_sequence_order = val
|
|
2609
|
+
|
|
2610
|
+
@property
def pseudo_amino_acid_composition(self):
    """ Return the value currently stored in ``_pseudo_amino_acid_composition``. """
    return self._pseudo_amino_acid_composition

@pseudo_amino_acid_composition.setter
def pseudo_amino_acid_composition(self, value):
    """ Store ``value`` in ``_pseudo_amino_acid_composition``. """
    self._pseudo_amino_acid_composition = value
|
|
2617
|
+
|
|
2618
|
+
@property
def amphiphilic_pseudo_amino_acid_composition(self):
    """ Return the value currently stored in ``_amphiphilic_pseudo_amino_acid_composition``. """
    return self._amphiphilic_pseudo_amino_acid_composition

@amphiphilic_pseudo_amino_acid_composition.setter
def amphiphilic_pseudo_amino_acid_composition(self, value):
    """ Store ``value`` in ``_amphiphilic_pseudo_amino_acid_composition``. """
    self._amphiphilic_pseudo_amino_acid_composition = value
|
|
2625
|
+
|
|
2626
|
+
@property
def all_descriptors(self):
    """ Return the value currently stored in ``_all_descriptors``. """
    return self._all_descriptors

@all_descriptors.setter
def all_descriptors(self, value):
    """ Store ``value`` in ``_all_descriptors``. """
    self._all_descriptors = value

@all_descriptors.deleter
def all_descriptors(self):
    """
    Delete all descriptor attribute dataframes.

    Removes ``_all_descriptors`` and every individual private descriptor
    attribute from the instance, in the order listed below. An
    AttributeError from a missing attribute propagates and leaves the
    attributes after it in the list untouched.
    """
    private_names = (
        '_all_descriptors',
        '_amino_acid_composition',
        '_dipeptide_composition',
        '_tripeptide_composition',
        '_gravy',
        '_aromaticity',
        '_instability_index',
        '_isoelectric_point',
        '_molecular_weight',
        '_charge_distribution',
        '_hydrophobic_polar_charged_composition',
        '_secondary_structure_propensity',
        '_kmer_composition',
        '_reduced_alphabet_composition',
        '_motif_composition',
        '_amino_acid_pair_composition',
        '_aliphatic_index',
        '_extinction_coefficient',
        '_boman_index',
        '_aggregation_propensity',
        '_hydrophobic_moment',
        '_shannon_entropy',
        '_moreaubroto_autocorrelation',
        '_moran_autocorrelation',
        '_geary_autocorrelation',
        '_ctd',
        '_ctd_transition',
        '_ctd_composition',
        '_ctd_distribution',
        '_conjoint_triad',
        '_sequence_order_coupling_number',
        '_quasi_sequence_order',
        '_pseudo_amino_acid_composition',
        '_amphiphilic_pseudo_amino_acid_composition',
    )
    for private_name in private_names:
        delattr(self, private_name)
|
|
2671
|
+
|
|
2672
|
+
def __str__(self) -> str:
    """
    Build a multi-line text summary of the descriptor attributes.

    The first line is ``self.shape``; every following line is a
    ``<label>: <shape>`` pair read from the ``.shape`` of the matching
    descriptor attribute. The CTD composition, transition and distribution
    attributes are not listed separately; only the combined ``ctd`` is.

    NOTE(review): ``self.shape`` is not defined in this part of the file
    (only ``__shape__`` is) - confirm it is provided elsewhere on the class.

    Returns
    =======
    :str
        Summary string as described above.
    """
    return f'''{self.shape}
Amino Acid Composition: {self.amino_acid_composition.shape}
Dipeptide Composition: {self.dipeptide_composition.shape}
Tripeptide Composition: {self.tripeptide_composition.shape}
GRAVY: {self.gravy.shape}
Aromaticity: {self.aromaticity.shape}
Instability Index: {self.instability_index.shape}
Isoelectric Point: {self.isoelectric_point.shape}
Molecular Weight: {self.molecular_weight.shape}
Charge Distribution: {self.charge_distribution.shape}
Hydrophobic/Polar/Charged Composition: {self.hydrophobic_polar_charged_composition.shape}
Secondary Structure Propensity: {self.secondary_structure_propensity.shape}
k-mer Composition: {self.kmer_composition.shape}
Reduced Alphabet Composition: {self.reduced_alphabet_composition.shape}
Motif Composition: {self.motif_composition.shape}
Amino Acid Pair Composition: {self.amino_acid_pair_composition.shape}
Aliphatic Index: {self.aliphatic_index.shape}
Extinction Coefficient: {self.extinction_coefficient.shape}
Boman Index: {self.boman_index.shape}
Aggregation Propensity: {self.aggregation_propensity.shape}
Hydrophobic Moment: {self.hydrophobic_moment.shape}
Shannon Entropy: {self.shannon_entropy.shape}
MoreauBroto Autocorrelation: {self.moreaubroto_autocorrelation.shape}
Moran Autocorrelation: {self.moran_autocorrelation.shape}
Geary Autocorrelation: {self.geary_autocorrelation.shape}
CTD: {self.ctd.shape}
Conjoint Triad: {self.conjoint_triad.shape}
Sequence Order Coupling Number: {self.sequence_order_coupling_number.shape}
Quasi Sequence Order: {self.quasi_sequence_order.shape}
Pseudo Amino Acid Composition: {self.pseudo_amino_acid_composition.shape}
Amphiphilic Pseudo Amino Acid Composition: {self.amphiphilic_pseudo_amino_acid_composition.shape}'''
|
|
2704
|
+
|
|
2705
|
+
def get_all_descriptors(self,
    descriptors: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Compute a set of descriptors and join their results column-wise.

    For every selected descriptor name the matching ``get_<name>`` method
    is looked up on the instance and called with no arguments. The results
    are concatenated along axis 1, stored in ``self.all_descriptors`` and
    returned.

    Parameters
    ==========
    :descriptors: list of str, optional
        Names of the descriptors to calculate. When None,
        ``self.valid_descriptors`` is used; otherwise the list is passed
        through ``self.validate_descriptors`` and its return value is used.

    Returns
    =======
    :pd.DataFrame
        Concatenation (axis=1) of all calculated descriptor results.
    """
    selected = (self.valid_descriptors if descriptors is None
                else self.validate_descriptors(descriptors))

    # keyed by name, so a repeated name keeps only its latest result
    computed = {
        name: getattr(self, f'get_{name}')()
        for name in tqdm(selected, desc="Computing all descriptors")
    }

    self.all_descriptors = pd.concat(computed.values(), axis=1)
    return self.all_descriptors
|
|
2732
|
+
|
|
2733
|
+
def get_descriptor_info(self, descriptor_name: str) -> Dict[str, Any]:
    """
    Get metadata and information about a specific descriptor.

    Parameters
    ==========
    :descriptor_name: str
        Name of the descriptor. It is passed to ``self.validate_descriptors``
        first; any exception raised there propagates to the caller.

    Returns
    =======
    :Dict[str, Any]
        Dictionary with the keys ``name``, ``group`` (from
        ``self.descriptor_groups``, default 'Unknown'), ``feature_count``
        (from ``self.descriptor_feature_count``, default 0) and
        ``parameters`` (a copy of the descriptor's entry on
        ``self.desc_parameters``, or an empty dict when there is none).
    """
    self.validate_descriptors(descriptor_name)

    descriptor_info = {
        'name': descriptor_name,
        'group': self.descriptor_groups.get(descriptor_name, 'Unknown'),
        'feature_count': self.descriptor_feature_count.get(descriptor_name, 0),
        'parameters': {},
    }

    # Add parameters if available. Both branches hand back a copy so that
    # callers editing the returned info cannot alter the stored parameters
    # (vars() alone would expose the object's live __dict__).
    parameters = getattr(self.desc_parameters, descriptor_name, None)
    if isinstance(parameters, dict):
        descriptor_info['parameters'] = dict(parameters)
    elif hasattr(parameters, '__dict__'):
        descriptor_info['parameters'] = dict(vars(parameters))

    return descriptor_info
|
|
2765
|
+
|
|
2766
|
+
def reset_descriptors(self) -> None:
    """
    Reset all descriptor attributes to empty DataFrames.

    Every descriptor attribute, and finally ``all_descriptors``, is
    assigned its own new ``pd.DataFrame()``. No other attribute is touched
    by this method.

    Parameters
    ==========
    None

    Returns
    =======
    None
    """
    descriptor_attributes = (
        'amino_acid_composition',
        'dipeptide_composition',
        'tripeptide_composition',
        'gravy',
        'aromaticity',
        'instability_index',
        'isoelectric_point',
        'molecular_weight',
        'charge_distribution',
        'hydrophobic_polar_charged_composition',
        'secondary_structure_propensity',
        'kmer_composition',
        'reduced_alphabet_composition',
        'motif_composition',
        'amino_acid_pair_composition',
        'aliphatic_index',
        'extinction_coefficient',
        'boman_index',
        'aggregation_propensity',
        'hydrophobic_moment',
        'shannon_entropy',
        'moreaubroto_autocorrelation',
        'moran_autocorrelation',
        'geary_autocorrelation',
        'ctd',
        'ctd_composition',
        'ctd_transition',
        'ctd_distribution',
        'conjoint_triad',
        'sequence_order_coupling_number',
        'quasi_sequence_order',
        'pseudo_amino_acid_composition',
        'amphiphilic_pseudo_amino_acid_composition',
        'all_descriptors',
    )
    # a fresh DataFrame per attribute, so no two attributes share an object
    for attribute in descriptor_attributes:
        setattr(self, attribute, pd.DataFrame())
|
|
2813
|
+
|
|
2814
|
+
def clear_cache(self) -> None:
    """
    Clear cached descriptor metadata to free memory.
    Useful after major descriptor calculations or when memory is constrained.

    The ``descriptor_feature_count`` attribute is resolved on the class
    rather than on the instance: instance access returns the attribute's
    value, which exposes neither ``fget`` nor ``cache_clear``, so checking
    it there could never reach the cache. If the class attribute (or its
    ``fget``, when it is a property) offers ``cache_clear`` it is called;
    otherwise the method does nothing.

    Parameters
    ==========
    None

    Returns
    =======
    None
    """
    # presumably a property wrapping an lru_cache'd getter - its definition
    # is outside this part of the file, so every step is looked up defensively
    class_attr = getattr(type(self), 'descriptor_feature_count', None)
    cached_callable = getattr(class_attr, 'fget', class_attr)
    if hasattr(cached_callable, 'cache_clear'):
        cached_callable.cache_clear()
|
|
2829
|
+
|
|
2830
|
+
def get_descriptor_columns(self, descriptor: str) -> List[str]:
    """
    Return the column names of an already calculated descriptor.

    Parameters
    ==========
    :descriptor: str
        Name of the descriptor (e.g., 'amino_acid_composition'); also the
        name of the instance attribute that holds its values.

    Returns
    =======
    :List[str]
        Column names of the descriptor's stored values.

    Raises
    ======
    :ValueError
        If the attribute is missing, None or empty, i.e. the descriptor
        has not been calculated yet.

    Any exception raised by ``self.validate_descriptors`` for the given
    name propagates unchanged.
    """
    # reject unknown names before touching any attribute
    self.validate_descriptors(descriptor)

    frame = getattr(self, descriptor, None)
    is_calculated = frame is not None and not frame.empty
    if not is_calculated:
        raise ValueError(f"Descriptor '{descriptor}' has not been calculated yet. "
            f"Call get_{descriptor}() first.")

    return frame.columns.tolist()
|
|
2862
|
+
|
|
2863
|
+
def __repr__(self) -> str:
    """ Return the ``str()`` form of the instance wrapped as ``<Descriptor: ...>``. """
    summary = str(self)
    return '<Descriptor: ' + summary + '>'
|
|
2865
|
+
|
|
2866
|
+
def __len__(self) -> int:
    """ Return ``len()`` of the ``all_descriptors`` attribute. """
    combined = self.all_descriptors
    return len(combined)
|
|
2868
|
+
|
|
2869
|
+
def __shape__(self) -> Tuple[int, int]:
    """
    Return the ``shape`` of the ``all_descriptors`` attribute.

    Note: ``__shape__`` is not a special method Python calls implicitly;
    it has to be invoked by name.
    """
    combined = self.all_descriptors
    return combined.shape
|
|
2871
|
+
|
|
2872
|
+
def __sizeof__(self) -> int:
    """ Get size of all_descriptors object that stores all descriptor values. """
    combined = self.all_descriptors
    return combined.__sizeof__()
|
|
2875
|
+
|
|
2876
|
+
class DescriptorError(Exception):
    """Base exception for descriptor operations."""
|
|
2879
|
+
|
|
2880
|
+
|
|
2881
|
+
class InvalidSequenceError(DescriptorError):
    """Raised when sequence contains invalid amino acids."""
|
|
2884
|
+
|
|
2885
|
+
|
|
2886
|
+
class DescriptorConfigError(DescriptorError):
    """Raised when config file is invalid or malformed."""
|
|
2889
|
+
|
|
2890
|
+
|
|
2891
|
+
class InvalidDescriptorError(DescriptorError):
    """Raised when requesting non-existent descriptor."""
|