PySAR 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pySAR/descriptors.py ADDED
@@ -0,0 +1,2893 @@
1
+ ################################################################################
2
+ ################# Descriptors #################
3
+ ################################################################################
4
+
5
+ from typing import Union, List, Optional, Dict, Any, Callable, Tuple
6
+ from enum import Enum
7
+ import pandas as pd
8
+ import numpy as np
9
+ from difflib import get_close_matches
10
+ import json
11
+ from json import JSONDecodeError
12
+ import itertools
13
+ import time
14
+ from tqdm import tqdm
15
+ from functools import lru_cache
16
+
17
+ from .utils import *
18
+ import protpy as protpy
19
+
20
# Feature-count constants for the fixed-width composition descriptors; they are
# used to work out each descriptor's column range when slicing a descriptors csv.
AA_COUNT = 20                                           # amino acid composition features
DIPEPTIDE_FEATURES = AA_COUNT * AA_COUNT                # 400
TRIPEPTIDE_FEATURES = AA_COUNT * AA_COUNT * AA_COUNT    # 8000
CONJOINT_TRIAD_FEATURES = 343
25
+
26
class DescriptorType(Enum):
    """
    Enumeration of available protein descriptor types.

    Each member's value is the snake_case descriptor name, i.e. the lower-cased
    member name. Members are declared in the same order as the names in the
    Descriptors class' list of valid descriptors.
    """
    # composition descriptors
    AMINO_ACID_COMPOSITION = 'amino_acid_composition'
    DIPEPTIDE_COMPOSITION = 'dipeptide_composition'
    TRIPEPTIDE_COMPOSITION = 'tripeptide_composition'

    # single-sequence physicochemical and composition-style descriptors
    GRAVY = 'gravy'
    AROMATICITY = 'aromaticity'
    INSTABILITY_INDEX = 'instability_index'
    ISOELECTRIC_POINT = 'isoelectric_point'
    MOLECULAR_WEIGHT = 'molecular_weight'
    CHARGE_DISTRIBUTION = 'charge_distribution'
    HYDROPHOBIC_POLAR_CHARGED_COMPOSITION = 'hydrophobic_polar_charged_composition'
    SECONDARY_STRUCTURE_PROPENSITY = 'secondary_structure_propensity'
    KMER_COMPOSITION = 'kmer_composition'
    REDUCED_ALPHABET_COMPOSITION = 'reduced_alphabet_composition'
    MOTIF_COMPOSITION = 'motif_composition'
    AMINO_ACID_PAIR_COMPOSITION = 'amino_acid_pair_composition'
    ALIPHATIC_INDEX = 'aliphatic_index'
    EXTINCTION_COEFFICIENT = 'extinction_coefficient'
    BOMAN_INDEX = 'boman_index'
    AGGREGATION_PROPENSITY = 'aggregation_propensity'
    HYDROPHOBIC_MOMENT = 'hydrophobic_moment'
    SHANNON_ENTROPY = 'shannon_entropy'

    # autocorrelation descriptors
    MOREAUBROTO_AUTOCORRELATION = 'moreaubroto_autocorrelation'
    MORAN_AUTOCORRELATION = 'moran_autocorrelation'
    GEARY_AUTOCORRELATION = 'geary_autocorrelation'

    # composition-transition-distribution (CTD) descriptors
    CTD = 'ctd'
    CTD_COMPOSITION = 'ctd_composition'
    CTD_TRANSITION = 'ctd_transition'
    CTD_DISTRIBUTION = 'ctd_distribution'

    # conjoint triad descriptor
    CONJOINT_TRIAD = 'conjoint_triad'

    # sequence order descriptors
    SEQUENCE_ORDER_COUPLING_NUMBER = 'sequence_order_coupling_number'
    QUASI_SEQUENCE_ORDER = 'quasi_sequence_order'

    # pseudo amino acid composition descriptors
    PSEUDO_AMINO_ACID_COMPOSITION = 'pseudo_amino_acid_composition'
    AMPHIPHILIC_PSEUDO_AMINO_ACID_COMPOSITION = 'amphiphilic_pseudo_amino_acid_composition'
61
+
62
+ class Descriptors():
63
+ """
64
+ Class for calculating a wide variety of protein physicochemical, biochemical and structural
65
+ descriptors. These descriptors have been used in a wide variety of Bioinformatics
66
+ applications including: protein structural and functional class prediction,
67
+ protein-protein interactions, subcellular location, secondary structure prediction, among
68
+ many more. They represent the different structural, functional & interaction profiles of
69
+ proteins by exploring the features in the groups of composition, correlation and distribution
70
+ of the constituent residues and their biochemical and physicochemical properties.
71
+
72
+ A custom-built software package was created to generate these descriptors - protpy, which
73
+ is also open-source and available here: https://github.com/amckenna41/protpy. The package
74
+ takes 1 or more protein sequences, returning the respective descriptor values in a Pandas
75
+ DataFrame. protpy and this class allows calculation of the following descriptors: Amino
76
+ Acid Composition (AAComp), Dipeptide Composition (DPComp), Tripeptide Composition (TPComp),
77
+ MoreauBroto Autocorrelation (MBAuto), Moran Autocorrelation (MAuto), Geary Autocorrelation
78
+ (GAuto), Composition (CTD_C), Transition (CTD_T), Distribution (CTD_D), CTD, Conjoint Triad
79
+ (CTriad), Sequence Order Coupling Number (SOCN), Quasi Sequence Order (QSO), Pseudo Amino Acid
80
+ Composition - type 1 (PAAcomp), Amphiphilic Pseudo Amino Acid Composition - type 2 (APAAComp),
81
+ GRAVY, Aromaticity, Instability Index, Isoelectric Point, Molecular Weight, Charge Distribution,
82
+ Hydrophobic/Polar/Charged Composition (HPC), Secondary Structure Propensity (SSP), k-mer
83
+ Composition, Reduced Alphabet Composition, Motif Composition, Amino Acid Pair Composition,
84
+ Aliphatic Index, Extinction Coefficient, Boman Index, Aggregation Propensity, Hydrophobic
85
+ Moment, and Shannon Entropy.
86
+
87
+ Similar to other classes in pySAR, this class works via configuration files which contain
88
+ the values for all the potential parameters, if applicable, of each descriptor. By default,
89
+ the class will look for a descriptors csv which is a file of the pre-calculated descriptor
90
+ values for the specified dataset, if this file doesn't exist, or the parameter value is blank,
91
+ then each descriptor will have to be calculated using its respective function.
92
+
93
+ During initialization, input sequences are normalized by removing gaps and then validated
94
+ against canonical amino acids before descriptor generation begins.
95
+
96
+ This class is also designed to feed descriptor feature matrices directly into downstream
97
+ Encoding and PySAR workflows for model training and evaluation.
98
+
99
+ It is recommended that with every new dataset, the Descriptors class should be instantiated
100
+ with the "all_desc" parameter set to 1 in the config file. This will calculate all the descriptor
101
+ values for the dataset of protein sequences, storing the result in a csv file, meaning that
102
+ this file can be used for future use and the descriptors will not have to be recalculated each
103
+ time. This csv file will be saved to the path and filename according to the "descriptors_csv"
104
+ parameter in the config file.
105
+
106
+ Parameters
107
+ ==========
108
+ :config_file: str
109
+ path to configuration file which will contain the various parameter values for all
110
+ descriptors. If invalid value input then error will be raised.
111
+ :protein_seqs: pd.Series or str
112
+ protein sequences to calculate descriptors for. A single sequence string is converted
113
+ internally to a pandas Series. If None or empty, sequences are loaded from the dataset
114
+ path in the configuration.
115
+ **kwargs: dict
116
+ keyword argument names and values for the dataset filename/path and the descriptors
117
+ csv path parameters. The keywords should be the same name and form of those in the
118
+ configuration file. The keyword values input take precedence over those in the config files.
119
+
120
+ Attributes
121
+ ==========
122
+ :amino_acid_composition: pd.DataFrame
123
+ Amino acid composition descriptor (20 features)
124
+ :dipeptide_composition: pd.DataFrame
125
+ Dipeptide composition descriptor (400 features)
126
+ :tripeptide_composition: pd.DataFrame
127
+ Tripeptide composition descriptor (8000 features)
128
+ :moreaubroto_autocorrelation: pd.DataFrame
129
+ Moreaubroto autocorrelation descriptor (240 features)
130
+ :moran_autocorrelation: pd.DataFrame
131
+ Moran autocorrelation descriptor (240 features)
132
+ :geary_autocorrelation: pd.DataFrame
133
+ Geary autocorrelation descriptor (240 features)
134
+ :ctd: pd.DataFrame
135
+ Composition-Transition-Distribution descriptor
136
+ :conjoint_triad: pd.DataFrame
137
+ Conjoint triad descriptor (343 features)
138
+ :pseudo_amino_acid_composition: pd.DataFrame
139
+ Pseudo amino acid composition descriptor
140
+ :amphiphilic_pseudo_amino_acid_composition: pd.DataFrame
141
+ Amphiphilic pseudo amino acid composition descriptor
142
+ :gravy: pd.DataFrame
143
+ GRAVY (Grand Average of Hydropathy) descriptor (1 feature)
144
+ :aromaticity: pd.DataFrame
145
+ Aromaticity descriptor (1 feature)
146
+ :instability_index: pd.DataFrame
147
+ Instability Index descriptor (1 feature)
148
+ :isoelectric_point: pd.DataFrame
149
+ Isoelectric Point descriptor (1 feature)
150
+ :molecular_weight: pd.DataFrame
151
+ Molecular Weight descriptor (1 feature)
152
+ :charge_distribution: pd.DataFrame
153
+ Charge Distribution descriptor (3 features)
154
+ :hydrophobic_polar_charged_composition: pd.DataFrame
155
+ Hydrophobic/Polar/Charged Composition descriptor (3 features)
156
+ :secondary_structure_propensity: pd.DataFrame
157
+ Secondary Structure Propensity descriptor (3 features)
158
+ :kmer_composition: pd.DataFrame
159
+ k-mer Composition descriptor (20^k features, default 400)
160
+ :reduced_alphabet_composition: pd.DataFrame
161
+ Reduced Alphabet Composition descriptor (alphabet_size features, default 6)
162
+ :motif_composition: pd.DataFrame
163
+ Motif Composition descriptor (8 features by default)
164
+ :amino_acid_pair_composition: pd.DataFrame
165
+ Amino Acid Pair Composition descriptor (400 features)
166
+ :aliphatic_index: pd.DataFrame
167
+ Aliphatic Index descriptor (1 feature)
168
+ :extinction_coefficient: pd.DataFrame
169
+ Extinction Coefficient descriptor (2 features)
170
+ :boman_index: pd.DataFrame
171
+ Boman Index descriptor (1 feature)
172
+ :aggregation_propensity: pd.DataFrame
173
+ Aggregation Propensity descriptor (2 features)
174
+ :hydrophobic_moment: pd.DataFrame
175
+ Hydrophobic Moment descriptor (2 features)
176
+ :shannon_entropy: pd.DataFrame
177
+ Shannon Entropy descriptor (1 feature)
178
+ :all_descriptors: pd.DataFrame
179
+ Concatenated dataframe of all calculated descriptors
180
+ :valid_descriptors: list
181
+ List of all available descriptor names
182
+ :descriptor_groups: dict
183
+ Mapping of descriptor names to their functional groups
184
+ :num_seqs: int
185
+ Total number of input protein sequences
186
+ :protein_seqs: pd.Series
187
+ Loaded protein sequences with gaps removed
188
+
189
+ Methods
190
+ =======
191
+ import_descriptors()
192
+ Import pre-calculated descriptors from CSV file
193
+ get_amino_acid_composition()
194
+ Calculate amino acid composition for all sequences
195
+ get_dipeptide_composition()
196
+ Calculate dipeptide composition for all sequences
197
+ get_tripeptide_composition()
198
+ Calculate tripeptide composition for all sequences
199
+ get_moreaubroto_autocorrelation()
200
+ Calculate Moreau-Broto autocorrelation descriptor
201
+ get_moran_autocorrelation()
202
+ Calculate Moran autocorrelation descriptor
203
+ get_geary_autocorrelation()
204
+ Calculate Geary autocorrelation descriptor
205
+ get_ctd()
206
+ Calculate CTD descriptor
207
+ get_ctd_composition()
208
+ Calculate CTD composition descriptor
209
+ get_ctd_transition()
210
+ Calculate CTD transition descriptor
211
+ get_ctd_distribution()
212
+ Calculate CTD distribution descriptor
213
+ get_conjoint_triad()
214
+ Calculate conjoint triad descriptor
215
+ get_sequence_order_coupling_number()
216
+ Calculate sequence order coupling number descriptor
217
+ get_quasi_sequence_order()
218
+ Calculate quasi sequence order descriptor
219
+ get_pseudo_amino_acid_composition()
220
+ Calculate pseudo amino acid composition descriptor
221
+ get_amphiphilic_pseudo_amino_acid_composition()
222
+ Calculate amphiphilic pseudo amino acid composition descriptor
223
+ get_gravy()
224
+ Calculate GRAVY (Grand Average of Hydropathy) descriptor
225
+ get_aromaticity()
226
+ Calculate Aromaticity descriptor
227
+ get_instability_index()
228
+ Calculate Instability Index descriptor
229
+ get_isoelectric_point()
230
+ Calculate Isoelectric Point descriptor
231
+ get_molecular_weight()
232
+ Calculate Molecular Weight descriptor
233
+ get_charge_distribution()
234
+ Calculate Charge Distribution descriptor
235
+ get_hydrophobic_polar_charged_composition()
236
+ Calculate Hydrophobic/Polar/Charged Composition descriptor
237
+ get_secondary_structure_propensity()
238
+ Calculate Secondary Structure Propensity descriptor
239
+ get_kmer_composition()
240
+ Calculate k-mer Composition descriptor
241
+ get_reduced_alphabet_composition()
242
+ Calculate Reduced Alphabet Composition descriptor
243
+ get_motif_composition()
244
+ Calculate Motif Composition descriptor
245
+ get_amino_acid_pair_composition()
246
+ Calculate Amino Acid Pair Composition descriptor
247
+ get_aliphatic_index()
248
+ Calculate Aliphatic Index descriptor
249
+ get_extinction_coefficient()
250
+ Calculate Extinction Coefficient descriptor
251
+ get_boman_index()
252
+ Calculate Boman Index descriptor
253
+ get_aggregation_propensity()
254
+ Calculate Aggregation Propensity descriptor
255
+ get_hydrophobic_moment()
256
+ Calculate Hydrophobic Moment descriptor
257
+ get_shannon_entropy()
258
+ Calculate Shannon Entropy descriptor
259
+ get_all_descriptors()
260
+ Calculate all descriptors and return a concatenated dataframe
261
+ get_descriptor_encoding()
262
+ Resolve a descriptor name and return its encoding dataframe
263
+ all_descriptors_list()
264
+ Return descriptor names or combinations of descriptor names
265
+ validate_descriptors()
266
+ Validate descriptor names exist in valid descriptors list
267
+ validate_sequences()
268
+ Validate sequences contain only canonical amino acids
269
+ get_descriptor_info()
270
+ Get metadata about a specific descriptor
271
+ reset_descriptors()
272
+ Clear all descriptor DataFrames to empty state
273
+ clear_cache()
274
+ Free memory from cached descriptor metadata
275
+ get_descriptor_columns()
276
+ Get column names for a calculated descriptor
277
+ __str__()
278
+ Return a human-readable string summary of descriptor shapes
279
+ __repr__()
280
+ Return the object representation string
281
+ __len__()
282
+ Return number of rows in all_descriptors
283
+ __shape__()
284
+ Return shape of all_descriptors
285
+ __sizeof__()
286
+ Return memory footprint of all_descriptors
287
+
288
+ Raises
289
+ ======
290
+ :TypeError
291
+ If config_file is not a string or protein sequences are invalid type
292
+ :OSError
293
+ If config file or dataset file not found at specified path
294
+ :InvalidSequenceError
295
+ If protein sequences contain non-canonical amino acids
296
+ :InvalidDescriptorError
297
+ If requesting a non-existent descriptor
298
+ :DescriptorConfigError
299
+ If configuration JSON file is invalid or malformed
300
+
301
+ Examples
302
+ ========
303
+ >>> from pySAR.descriptors import Descriptors
304
+ >>> desc = Descriptors(config_file='config/thermostability.json')
305
+ >>>
306
+ >>> # Calculate single descriptor
307
+ >>> aa_comp = desc.get_amino_acid_composition()
308
+ >>>
309
+ >>> # Calculate multiple descriptors
310
+ >>> desc.get_dipeptide_composition()
311
+ >>> desc.get_moran_autocorrelation()
312
+ >>>
313
+ >>> # Get all descriptors at once
314
+ >>> all_desc = desc.get_all_descriptors()
315
+ >>> all_desc.shape
316
+ (261, 10572)
317
+ >>>
318
+ >>> # Get descriptor information
319
+ >>> info = desc.get_descriptor_info('amino_acid_composition')
320
+ >>> info['feature_count']
321
+ 20
322
+ >>>
323
+ >>> # Get columns for a descriptor
324
+ >>> columns = desc.get_descriptor_columns('dipeptide_composition')
325
+ >>> len(columns)
326
+ 400
327
+
328
+ Notes
329
+ =====
330
+ - Tripeptide and pseudo-amino acid composition descriptors are computationally expensive
331
+ and may take significant time to calculate on large datasets
332
+ - Pre-calculating all descriptors and exporting to CSV (via 'all_desc' config parameter)
333
+ is recommended to avoid recalculation
334
+ - The descriptor_feature_count property is cached for performance
335
+ - Memory usage scales with dataset size and number of descriptors calculated
336
+ - Protein sequences must contain only the 20 standard amino acids (ACDEFGHIKLMNPQRSTVWY; B, J, O, U, X and Z are not allowed)
337
+
338
+ References
339
+ ==========
340
+ [1] Dong, J., Yao, ZJ., Zhang, L. et al. PyBioMed: a python library for
341
+ various molecular representations of chemicals, proteins and DNAs and
342
+ their interactions. J Cheminform 10, 16 (2018).
343
+ https://doi.org/10.1186/s13321-018-0270-2
344
+ [2] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
345
+ fold class predictions. Nucleic Acids Res, 22, 3616-3619.
346
+ [3] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
347
+ subcellular localization prediction. Bioinformatics, 17, 721-728.
348
+ [4] Broto P, Moreau G, Vandicke C: Molecular structures: perception,
349
+ autocorrelation descriptor and SAR studies. Eur J Med Chem 1984, 19: 71–78.
350
+ [5] Ong, S.A., Lin, H.H., Chen, Y.Z. et al. Efficacy of different protein
351
+ descriptors in predicting protein functional families. BMC Bioinformatics
352
+ 8, 300 (2007). https://doi.org/10.1186/1471-2105-8-300
353
+ [6] Inna Dubchak, Ilya Muchink, Stephen R.Holbrook and Sung-Hou Kim. Prediction
354
+ of protein folding class using global description of amino acid sequence.
355
+ Proc.Natl. Acad.Sci.USA, 1995, 92, 8700-8704.
356
+ [7] Juwen Shen, Jian Zhang, Xiaomin Luo, Weiliang Zhu, Kunqian Yu, Kaixian Chen,
357
+ Yixue Li, Huanliang Jiang. Predicting protein-protein interactions based only
358
+ on sequences information. PNAS. 2007 (104) 4337-4341.
359
+ [8] Kuo-Chen Chou. Prediction of Protein Subcellular Locations by Incorporating
360
+ Quasi-Sequence-Order Effect. Biochemical and Biophysical Research
361
+ Communications 2000, 278, 477-483.
362
+ [9] Kuo-Chen Chou. Prediction of Protein Cellular Attributes Using
363
+ Pseudo-Amino Acid Composition. PROTEINS: Structure, Function, and
364
+ Genetics, 2001, 43: 246-255.
365
+ [10] Kuo-Chen Chou. Using amphiphilic pseudo amino acid composition to predict enzyme
366
+ subfamily classes. Bioinformatics, 2005,21,10-19.
367
+ [11] J. Shen et al., “Predicting protein-protein interactions based only on sequences
368
+ information,” Proc. Natl. Acad. Sci. U. S. A., vol. 104, no. 11, pp. 4337–4341, 2007.
369
+ [12] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid Sequences
370
+ by Artificial Neural Networks and Simulated Molecular Evolution: De Novo Design
371
+ of an Idealized Leader Cleavage Site. Biophys Journal, 1994, 66, 335-344.
372
+ [13] Grantham, R. (1974-09-06). "Amino acid difference formula to help explain protein
373
+ evolution". Science. 185 (4154): 862–864. Bibcode:1974Sci...185..862G.
374
+ doi:10.1126/science.185.4154.862. ISSN 0036-8075. PMID 4843792. S2CID 35388307.
375
+ [14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
376
+ vol. 33, no. 2, pp. 91–101, 2003.
377
+ """
378
def __init__(self,
             config_file: str = "",
             protein_seqs: Optional[Union[pd.Series, str]] = None,
             **kwargs) -> None:
    """
    Read the configuration file, resolve the input protein sequences and initialise
    every descriptor attribute, importing pre-calculated values if a descriptors csv
    exists.

    Parameters
    ==========
    :config_file: str
        path or filename of the JSON configuration file. ".json" is appended if no
        extension is given, and the "config" directory is searched if the file is not
        found at the given path.
    :protein_seqs: pd.Series or str
        protein sequences to calculate descriptors for. A single sequence string is
        converted to a pandas Series. If None or an empty string, the sequences are
        read from the dataset csv using the "sequence_col" column named in the config.
    **kwargs: dict
        "dataset" and "descriptors_csv" keywords, if present, take precedence over
        the equivalent values in the configuration file.

    Returns
    =======
    None

    Raises
    ======
    :TypeError
        If config_file is not a string.
    :OSError
        If the config file or the dataset file is not found.
    :DescriptorConfigError
        If the config file cannot be opened or parsed as JSON.
    :DescriptorError
        If the dataset csv cannot be read or lacks the sequence column.
    :ValueError
        If a DataFrame with more than one column is passed as protein_seqs.
    :InvalidSequenceError
        If valid_sequence reports invalid sequences.
    """
    self.config_file = config_file
    self.protein_seqs = protein_seqs
    self.kwargs = kwargs #any keyword argument variables of class
    self.config_parameters = {}

    desc_config_filepath = ""

    #only strings are usable below (None would fail inside os.path.splitext with an
    #opaque TypeError), so reject everything else here with the descriptive message
    if not (isinstance(self.config_file, str)):
        raise TypeError(f'JSON config file must be a filepath of type string, got type {type(config_file)}.')

    #append extension if only filename input
    if (os.path.splitext(self.config_file)[1] == ''):
        self.config_file = self.config_file + '.json'

    #look for the config at the given path first, then inside the config directory
    if (os.path.isfile(self.config_file)):
        desc_config_filepath = self.config_file
    elif (os.path.isfile(os.path.join('config', self.config_file))):
        desc_config_filepath = os.path.join('config', self.config_file)
    else:
        raise OSError(f'JSON config file not found at path: {self.config_file}.')

    #open json file and read config parameters
    try:
        with open(desc_config_filepath) as f:
            self.config_parameters = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError, IOError) as e:
        raise DescriptorConfigError(f'Error parsing config JSON file {desc_config_filepath}: {e}') from e

    #create instance of Map class so parameters in config can be accessed via dot notation
    self.dataset_parameters = Map(self.config_parameters["dataset"])
    self.desc_parameters = Map(self.config_parameters["descriptors"])

    #set dataset and descriptors csv filepath from kwargs, if applicable, or the config file
    #values; the config value is only looked up when the keyword is absent
    self.dataset_filepath = self.kwargs.get('dataset') if 'dataset' in self.kwargs else self.dataset_parameters["dataset"]
    self.descriptors_csv = self.kwargs.get('descriptors_csv') if 'descriptors_csv' in self.kwargs else self.desc_parameters.descriptors_csv

    #import protein sequences from dataset if not directly specified in protein_seqs input param
    if not (isinstance(self.protein_seqs, pd.Series)):
        #the empty-string comparison is restricted to strings: comparing a DataFrame or
        #array with "" is elementwise and cannot be used as a condition, which would stop
        #the DataFrame check further down from ever being reached
        no_seqs_input = (self.protein_seqs is None or
            (isinstance(self.protein_seqs, str) and self.protein_seqs == ""))

        if (no_seqs_input):
            #open dataset and read protein seqs if protein_seqs is empty/None
            if not (os.path.isfile(self.dataset_filepath)):
                raise OSError(f'Dataset file not found at path: {self.dataset_filepath}.')

            #read in dataset csv from filepath mentioned in config
            try:
                data = pd.read_csv(self.dataset_filepath, sep=",", header=0)
                self.protein_seqs = data[self.dataset_parameters["sequence_col"]]
            except (FileNotFoundError, IOError, KeyError, pd.errors.ParserError) as e:
                raise DescriptorError(f'Error opening dataset file {self.dataset_filepath}: {e}') from e
        elif (isinstance(self.protein_seqs, str)):
            #if 1 protein sequence (1 string) input then convert to pandas Series object
            self.protein_seqs = pd.Series(self.protein_seqs)

    #only the sequences should be passed in, not all columns in a dataset etc.
    if (isinstance(self.protein_seqs, pd.DataFrame) and
            len(self.protein_seqs.columns) > 1):
        raise ValueError("The full dataset must not be passed in, only the"
            " columns containing the protein sequences.")

    #remove any gaps from protein sequences
    self.protein_seqs = remove_gaps(self.protein_seqs)

    #validate the input protein sequences, anything other than None returned means
    #invalid sequences were found
    invalid_seqs = valid_sequence(self.protein_seqs)
    if (invalid_seqs is not None):
        raise InvalidSequenceError(f'Invalid Amino Acids found in protein sequence dataset: {invalid_seqs}.')

    #get the total number of inputted protein sequences
    self.num_seqs = len(self.protein_seqs)

    #initialise all descriptor attributes to empty dataframes
    self.amino_acid_composition = pd.DataFrame()
    self.dipeptide_composition = pd.DataFrame()
    self.tripeptide_composition = pd.DataFrame()
    # new composition descriptors (protpy >= 1.3.0)
    self.gravy = pd.DataFrame()
    self.aromaticity = pd.DataFrame()
    self.instability_index = pd.DataFrame()
    self.isoelectric_point = pd.DataFrame()
    self.molecular_weight = pd.DataFrame()
    self.charge_distribution = pd.DataFrame()
    self.hydrophobic_polar_charged_composition = pd.DataFrame()
    self.secondary_structure_propensity = pd.DataFrame()
    self.kmer_composition = pd.DataFrame()
    self.reduced_alphabet_composition = pd.DataFrame()
    self.motif_composition = pd.DataFrame()
    self.amino_acid_pair_composition = pd.DataFrame()
    self.aliphatic_index = pd.DataFrame()
    self.extinction_coefficient = pd.DataFrame()
    self.boman_index = pd.DataFrame()
    self.aggregation_propensity = pd.DataFrame()
    self.hydrophobic_moment = pd.DataFrame()
    self.shannon_entropy = pd.DataFrame()
    self.moreaubroto_autocorrelation = pd.DataFrame()
    self.moran_autocorrelation = pd.DataFrame()
    self.geary_autocorrelation = pd.DataFrame()
    self.ctd = pd.DataFrame()
    self.ctd_composition = pd.DataFrame()
    self.ctd_transition = pd.DataFrame()
    self.ctd_distribution = pd.DataFrame()
    self.conjoint_triad = pd.DataFrame()
    self.sequence_order_coupling_number = pd.DataFrame()
    self.quasi_sequence_order = pd.DataFrame()
    self.pseudo_amino_acid_composition = pd.DataFrame()
    self.amphiphilic_pseudo_amino_acid_composition = pd.DataFrame()
    self.all_descriptors = pd.DataFrame()

    #a blank or None descriptors csv means no pre-calculated descriptors to import
    descriptors_csv_set = (self.descriptors_csv != '' and self.descriptors_csv is not None)

    #append extension if just the filename input as descriptors csv
    if (descriptors_csv_set and (os.path.splitext(self.descriptors_csv)[1] == '')):
        self.descriptors_csv = self.descriptors_csv + ".csv"

    #try importing descriptors csv with pre-calculated descriptor values, skipping the
    #file check when no csv is set as os.path.isfile can raise a TypeError for None
    if (descriptors_csv_set and os.path.isfile(self.descriptors_csv)):
        self.import_descriptors(self.descriptors_csv)
        #number of sequences is taken from the rows of the imported descriptors
        self.num_seqs = self.all_descriptors.shape[0]

    #create dictionary of descriptors and their associated groups
    keys = self.all_descriptors_list()
    # 21 Composition (3 original + 18 new) + 3 Autocorrelation + 4 CTD + 1 Conjoint Triad + 2 Sequence Order + 2 Pseudo Composition
    values = (["Composition"] * 21 + ["Autocorrelation"] * 3 + ["CTD"] * 4 +
        ["Conjoint Triad"] + ["Sequence Order"] * 2 + ["Pseudo Composition"] * 2)
    self.descriptor_groups = dict(zip(keys, values))

    #get shape of descriptors
    self.shape = self.all_descriptors.shape

    #list of available protein descriptors
    self.valid_descriptors = [
        'amino_acid_composition', 'dipeptide_composition', 'tripeptide_composition',
        'gravy', 'aromaticity', 'instability_index', 'isoelectric_point', 'molecular_weight',
        'charge_distribution', 'hydrophobic_polar_charged_composition',
        'secondary_structure_propensity', 'kmer_composition', 'reduced_alphabet_composition',
        'motif_composition', 'amino_acid_pair_composition', 'aliphatic_index',
        'extinction_coefficient', 'boman_index', 'aggregation_propensity',
        'hydrophobic_moment', 'shannon_entropy',
        'moreaubroto_autocorrelation', 'moran_autocorrelation', 'geary_autocorrelation',
        'ctd', 'ctd_composition', 'ctd_transition', 'ctd_distribution', 'conjoint_triad',
        'sequence_order_coupling_number', 'quasi_sequence_order',
        'pseudo_amino_acid_composition', 'amphiphilic_pseudo_amino_acid_composition'
    ]
524
+
525
+ def import_descriptors(self, descriptor_filepath: str = "") -> None:
526
+ """
527
+ Import descriptors from descriptors csv, setting the class attributes to their values.
528
+ It is recommended that after calculating the descriptors for a dataset of sequences
529
+ that the calculated values are exported to a csv; this means they don't need to be
530
+ recalculated each time. The all_descriptors class attribute is a dataframe of all
531
+ concatenated descriptors from the csv.
532
+
533
+ Parameters
534
+ ==========
535
+ :descriptor_filepath: str
536
+ filepath to pre-calculated descriptor csv file.
537
+
538
+ Returns
539
+ =======
540
+ None
541
+ """
542
+ #raise type error if filepath parameter isn't string
543
+ if not (isinstance(descriptor_filepath, str)):
544
+ raise TypeError(f"Filepath input parameter should be type str, got {type(descriptor_filepath)}.")
545
+
546
+ #verify descriptors csv exists at filepath
547
+ if not (os.path.isfile(descriptor_filepath)):
548
+ raise OSError(f'Descriptors csv file does not exist at filepath: {descriptor_filepath}.')
549
+
550
+ #import descriptors csv as dataframe
551
+ try:
552
+ descriptor_df = pd.read_csv(descriptor_filepath)
553
+ except (FileNotFoundError, IOError, pd.errors.ParserError) as e:
554
+ raise DescriptorError(f'Error reading descriptors csv file {descriptor_filepath}: {e}')
555
+
556
+ #replacing any +/- infinity or NAN values with 0
557
+ descriptor_df = descriptor_df.replace([np.inf, -np.inf], np.nan).fillna(0)
558
+
559
+ '''
560
+ calculate dimension of each descriptor in the csv according to the properties of each
561
+ descriptor, pull each descriptor value from the csv according to its dimension,
562
+ setting the values to the class instance variables
563
+ '''
564
+ amino_acid_composition_dim = (0, AA_COUNT)
565
+ self.amino_acid_composition = descriptor_df.iloc[:,amino_acid_composition_dim[0]:amino_acid_composition_dim[1]]
566
+
567
+ dipeptide_composition_dim = (AA_COUNT, AA_COUNT + DIPEPTIDE_FEATURES)
568
+ self.dipeptide_composition = descriptor_df.iloc[:,dipeptide_composition_dim[0]:dipeptide_composition_dim[1]]
569
+
570
+ tripeptide_composition_dim = (AA_COUNT + DIPEPTIDE_FEATURES, AA_COUNT + DIPEPTIDE_FEATURES + TRIPEPTIDE_FEATURES)
571
+ self.tripeptide_composition = descriptor_df.iloc[:,tripeptide_composition_dim[0]:tripeptide_composition_dim[1]]
572
+
573
+ #dimension of autocorrelation (moreaubroto, moran and geary) descriptors depends on the lag value and number of properties
574
+ _comp_offset = AA_COUNT + DIPEPTIDE_FEATURES + TRIPEPTIDE_FEATURES
575
+ moreaubroto_dim = (_comp_offset,
576
+ _comp_offset + (self.desc_parameters.moreaubroto_autocorrelation["lag"] * len(self.desc_parameters.moreaubroto_autocorrelation["properties"])))
577
+ self.moreaubroto_autocorrelation = descriptor_df.iloc[:,moreaubroto_dim[0]:moreaubroto_dim[1]]
578
+
579
+ moran_auto_dim = (moreaubroto_dim[1], moreaubroto_dim[1] +
580
+ (self.desc_parameters.moran_autocorrelation["lag"] * len(self.desc_parameters.moran_autocorrelation["properties"])))
581
+ self.moran_autocorrelation = descriptor_df.iloc[:,moran_auto_dim[0]: moran_auto_dim[1]]
582
+
583
+ geary_auto_dim = (moran_auto_dim[1], moran_auto_dim[1] +
584
+ (self.desc_parameters.geary_autocorrelation["lag"] * len(self.desc_parameters.geary_autocorrelation["properties"])))
585
+ self.geary_autocorrelation = descriptor_df.iloc[:,geary_auto_dim[0]:geary_auto_dim[1]]
586
+
587
+ #get CTD parameters from config to determine the dimensions of the CTD descriptors
588
+ ctd_property = self.desc_parameters.ctd["property"]
589
+ if not (isinstance(ctd_property, list)):
590
+ ctd_property = ctd_property.split(',')
591
+ ctd_all_ctd = self.desc_parameters.ctd["all"]
592
+
593
+ #if using all properties in CTD calculation, 147 features generated, 21 features per 7 properties
594
+ if (ctd_all_ctd):
595
+ ctd_dim = (geary_auto_dim[1], geary_auto_dim[1]+147) #21 CTD features per 7 properties = 147
596
+ ctd_comp_dim = (geary_auto_dim[1], geary_auto_dim[1] + 21) #3 CTD_Comp features per 7 properties = 21
597
+ ctd_trans_dim = (ctd_comp_dim[1], ctd_comp_dim[1] + 21) #3 CTD_Trans features per 7 properties = 21
598
+ ctd_distr_dim = (ctd_trans_dim[1], ctd_trans_dim[1] + 105) #15 CTD_Distr features per 7 properties = 105
599
+ #only using a pre-determined list of physicochemical properties, 21 features per property
600
+ else:
601
+ ctd_comp_dim = (geary_auto_dim[1], geary_auto_dim[1] + (len(ctd_property) * 3)) #3 CTD_Comp features per property
602
+ ctd_trans_dim = (ctd_comp_dim[1], ctd_comp_dim[1] + (len(ctd_property) * 3)) #3 CTD_Trans features per property
603
+ ctd_distr_dim = (ctd_trans_dim[1], ctd_trans_dim[1] + (len(ctd_property) * 15)) #15 CTD_Distr features per property
604
+ ctd_dim = (geary_auto_dim[1], ctd_distr_dim[1]) #21 CTD features per property
605
+
606
+ self.ctd = descriptor_df.iloc[:,ctd_dim[0]:ctd_dim[1]]
607
+
608
+ self.ctd_composition = descriptor_df.iloc[:,ctd_comp_dim[0]:ctd_comp_dim[1]]
609
+
610
+ self.ctd_transition = descriptor_df.iloc[:,ctd_trans_dim[0]:ctd_trans_dim[1]]
611
+
612
+ self.ctd_distribution = descriptor_df.iloc[:,ctd_distr_dim[0]:ctd_distr_dim[1]]
613
+
614
+ conjoint_triad_dim = (ctd_distr_dim[1], ctd_distr_dim[1] + CONJOINT_TRIAD_FEATURES)
615
+
616
+ self.conjoint_triad = descriptor_df.iloc[:,conjoint_triad_dim[0]:conjoint_triad_dim[1]]
617
+
618
+ #socn value dependant on value of lag and distance matrix
619
+ socn_lag = self.desc_parameters.sequence_order_coupling_number["lag"]
620
+ socn_distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"]
621
+
622
+ #if no distance matrix speciifed in config then both are used for descriptor calculation
623
+ if (socn_distance_matrix == "" or socn_distance_matrix == None):
624
+ socn_dim = (conjoint_triad_dim[1], conjoint_triad_dim[1] + (socn_lag * 2))
625
+ #distance matrix specified in config
626
+ else:
627
+ socn_dim = (conjoint_triad_dim[1], conjoint_triad_dim[1] + socn_lag)
628
+
629
+ self.sequence_order_coupling_number = descriptor_df.iloc[:,socn_dim[0]:socn_dim[1]]
630
+
631
+ quasi_seq_order_lag = self.desc_parameters.quasi_sequence_order["lag"]
632
+ quasi_seq_order_dist_matrix = self.desc_parameters.quasi_sequence_order["distance_matrix"]
633
+
634
+ #if no distance matrix speciifed in config then both are used for descriptor calculation
635
+ if (quasi_seq_order_dist_matrix == "" or quasi_seq_order_dist_matrix == None):
636
+ quasi_seq_order_dim = (socn_dim[1], socn_dim[1] + ((quasi_seq_order_lag+20) * 2))
637
+ #distance matrix specified in config
638
+ else:
639
+ quasi_seq_order_dim = (socn_dim[1], socn_dim[1] + (quasi_seq_order_lag+20))
640
+
641
+ self.quasi_sequence_order = descriptor_df.iloc[:,quasi_seq_order_dim[0]:quasi_seq_order_dim[1]]
642
+
643
+ #paac value dependant on lambda value
644
+ paac_lambda = self.desc_parameters.pseudo_amino_acid_composition["lambda"]
645
+
646
+ pseudo_amino_acid_composition_dim = (quasi_seq_order_dim[1], quasi_seq_order_dim[1] + (20 + paac_lambda))
647
+ self.pseudo_amino_acid_composition = descriptor_df.iloc[:,pseudo_amino_acid_composition_dim[0]:pseudo_amino_acid_composition_dim[1]]
648
+
649
+ apaac_lambda = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["lambda"]
650
+
651
+ amphiphilic_pseudo_amino_acid_composition_dim = (pseudo_amino_acid_composition_dim[1],
652
+ pseudo_amino_acid_composition_dim[1] + (20 + (2*apaac_lambda)))
653
+ self.amphiphilic_pseudo_amino_acid_composition = descriptor_df.iloc[:,amphiphilic_pseudo_amino_acid_composition_dim[0]:
654
+ amphiphilic_pseudo_amino_acid_composition_dim[1]]
655
+
656
+ self.all_descriptors = descriptor_df.iloc[:,:]
657
+
658
def validate_descriptors(self, descriptors: Union[str, List[str]]) -> List[str]:
    """
    Check the requested descriptor name(s) against self.valid_descriptors.

    Parameters
    ==========
    :descriptors: str or list of str
        a single descriptor name, or a list of descriptor names, to check.

    Returns
    =======
    :List[str]
        the checked names. A single string is returned wrapped in a
        one-element list; a list input is returned as the same list object.

    Raises
    ======
    :TypeError
        if the input is neither a str nor a list, or if any element of the
        list is not a str.
    :InvalidDescriptorError
        if at least one name is not present in self.valid_descriptors.
    """
    #anything other than a str or a list is rejected up front
    if not isinstance(descriptors, (str, list)):
        raise TypeError(
            f"Descriptors must be a string or list of strings, got {type(descriptors)}."
        )

    #wrap a lone name so the remaining checks share one code path
    if isinstance(descriptors, str):
        descriptors = [descriptors]

    #a single non-string entry invalidates the whole request
    if any(not isinstance(name, str) for name in descriptors):
        raise TypeError("All descriptor names must be strings.")

    #names that were requested but are unknown to this instance
    unknown = set(descriptors) - set(self.valid_descriptors)
    if not unknown:
        return descriptors

    raise InvalidDescriptorError(f"Invalid descriptors requested: {unknown}. "
        f"Valid descriptors: {self.valid_descriptors}")
695
+
696
def validate_sequences(self, seqs: Optional[pd.Series] = None) -> bool:
    """
    Confirm that a set of sequences passes the valid_sequence check.

    Parameters
    ==========
    :seqs: pd.Series, optional
        sequences to check. When omitted (None), self.protein_seqs is
        checked instead.

    Returns
    =======
    :bool
        True when valid_sequence reports nothing (returns None).

    Raises
    ======
    :InvalidSequenceError
        when valid_sequence returns anything other than None; the returned
        value is embedded in the error message.
    """
    #fall back to the sequences held on the instance
    if seqs is None:
        seqs = self.protein_seqs

    #helper returns None when it finds nothing to report
    offending = valid_sequence(seqs)
    if offending is None:
        return True

    raise InvalidSequenceError(f"Invalid amino acids found: {offending}")
722
+
723
@property
def descriptor_feature_count(self) -> Dict[str, int]:
    """
    Get the number of features (columns) in each descriptor.

    The mapping is rebuilt on every access rather than memoised. The values
    depend on which descriptor dataframes are currently populated on the
    instance, so a cached result would go stale as soon as another
    descriptor is calculated. Caching on self would also require the
    instance to be hashable and would hand the same mutable dict to every
    caller. Building the mapping only reads dataframe shapes.

    Returns
    =======
    :Dict[str, int]
        dictionary mapping descriptor names to feature counts. The three
        composition descriptors and conjoint_triad are always present with
        their fixed sizes; the remaining descriptors are only included once
        their dataframe is non-empty.
    """
    #fixed-size descriptors are always reported
    counts = {
        'amino_acid_composition': AA_COUNT,
        'dipeptide_composition': DIPEPTIDE_FEATURES,
        'tripeptide_composition': TRIPEPTIDE_FEATURES,
    }

    def _record_if_computed(name: str) -> None:
        """Add the column count of self.<name> when that dataframe is populated."""
        frame = getattr(self, name)
        if not frame.empty:
            counts[name] = frame.shape[1]

    # Autocorrelation counts depend on lag and properties
    for name in ('moreaubroto_autocorrelation', 'moran_autocorrelation',
                 'geary_autocorrelation'):
        _record_if_computed(name)

    # CTD counts - the three sub-descriptors are reported whenever ctd itself is populated
    if not self.ctd.empty:
        for name in ('ctd', 'ctd_composition', 'ctd_transition', 'ctd_distribution'):
            counts[name] = getattr(self, name).shape[1]

    counts['conjoint_triad'] = CONJOINT_TRIAD_FEATURES

    # Sequence order and pseudo composition counts
    for name in ('sequence_order_coupling_number', 'quasi_sequence_order',
                 'pseudo_amino_acid_composition',
                 'amphiphilic_pseudo_amino_acid_composition'):
        _record_if_computed(name)

    return counts
770
+
771
def get_amino_acid_composition(self) -> pd.DataFrame:
    """
    Return the Amino Acid Composition (AAComp) descriptor, calculating it
    with the protpy package on first use. AAComp is the fraction of each
    amino acid type within a protein sequence:

    AA_Comp(s) = AA(t)/N(s)

    where AA(t) is the number of amino acids of type t (t = 1,2,..,20) in
    sequence s and N(s) is the length of sequence s.

    Parameters
    ==========
    None

    Returns
    =======
    :amino_acid_composition: pd.DataFrame
        dataframe of AAComp values of shape N x 20, where N is the number
        of protein sequences and 20 is the number of canonical amino acids.
    """
    #only calculate when nothing is stored yet; otherwise reuse the stored dataframe
    if self.amino_acid_composition.empty:
        self.amino_acid_composition = self._calculate_descriptor_batch(
            protpy.amino_acid_composition, desc_name="Amino Acid Composition")

    return self.amino_acid_composition
806
+
807
def get_dipeptide_composition(self) -> pd.DataFrame:
    """
    Return the Dipeptide Composition (DPComp) descriptor, calculating it
    with the protpy package on first use. DPComp is the fraction of each
    dipeptide type within a protein sequence. With 20 canonical amino acids
    there are 20^2 possible dipeptides, giving a 400-dimensional vector:

    DPComp(s,t) = AA(s,t) / (N - 1)

    where AA(s,t) is the number of dipeptides made of amino acid types s
    and t (s, t = 1,2,..,20) and N - 1 is the total number of dipeptides in
    a sequence of N residues.

    Parameters
    ==========
    None

    Returns
    =======
    :dipeptide_composition: pd.DataFrame
        dataframe of DPComp values of shape N x 400, where N is the number
        of protein sequences and 400 = 20^2 dipeptide types.
    """
    #only calculate when nothing is stored yet; otherwise reuse the stored dataframe
    if self.dipeptide_composition.empty:
        self.dipeptide_composition = self._calculate_descriptor_batch(
            protpy.dipeptide_composition, desc_name="Dipeptide Composition")

    return self.dipeptide_composition
846
+
847
def get_tripeptide_composition(self) -> pd.DataFrame:
    """
    Return the Tripeptide Composition (TPComp) descriptor, calculating it
    with the protpy package on first use. TPComp describes each tripeptide
    type within a protein sequence. With 20 canonical amino acids there are
    20^3 possible tripeptides, giving an 8000-dimensional vector in which
    TPComp(s,t,u) relates AA(s,t,u), the number of tripeptides made of amino
    acid types s, t and u (s, t, u = 1,2,..,20), to the total number of
    tripeptides in the sequence.

    NOTE(review): the exact normalisation is performed inside protpy and is
    not visible here - confirm against the protpy documentation.

    Parameters
    ==========
    None

    Returns
    =======
    :tripeptide_composition: pd.DataFrame
        dataframe of TPComp values of shape N x 8000, where N is the number
        of protein sequences and 8000 = 20^3 tripeptide types.
    """
    #only calculate when nothing is stored yet; otherwise reuse the stored dataframe
    if self.tripeptide_composition.empty:
        self.tripeptide_composition = self._calculate_descriptor_batch(
            protpy.tripeptide_composition, desc_name="Tripeptide Composition")

    return self.tripeptide_composition
886
+
887
def get_gravy(self) -> pd.DataFrame:
    """
    Return the Grand Average of Hydropathy (GRAVY) descriptor, calculating
    it with the protpy package on first use. GRAVY is the mean of the
    Kyte-Doolittle hydropathy values over all residues; positive values
    indicate overall hydrophobicity, negative values overall hydrophilicity.

    Parameters
    ==========
    None

    Returns
    =======
    :gravy: pd.DataFrame
        dataframe of GRAVY values, shape N x 1 where N is the number of
        sequences.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.gravy.empty:
        self.gravy = self._calculate_descriptor_batch(protpy.gravy, desc_name="GRAVY")

    return self.gravy
913
+
914
def get_aromaticity(self) -> pd.DataFrame:
    """
    Return the Aromaticity descriptor, calculating it with the protpy
    package on first use. Aromaticity is the fraction of aromatic residues
    in the sequence (listed as F, W, Y, H in this module's documentation;
    the residue set itself is defined inside protpy).

    Parameters
    ==========
    None

    Returns
    =======
    :aromaticity: pd.DataFrame
        dataframe of Aromaticity values, shape N x 1 where N is the number
        of sequences.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.aromaticity.empty:
        self.aromaticity = self._calculate_descriptor_batch(
            protpy.aromaticity, desc_name="Aromaticity")

    return self.aromaticity
938
+
939
def get_instability_index(self) -> pd.DataFrame:
    """
    Return the Instability Index descriptor, calculating it with the protpy
    package on first use. The index is based on dipeptide instability
    weight values (DIWV); values below 40 indicate a stable protein, values
    of 40 or above indicate instability.

    Parameters
    ==========
    None

    Returns
    =======
    :instability_index: pd.DataFrame
        dataframe of InstabilityIndex values, shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.instability_index.empty:
        self.instability_index = self._calculate_descriptor_batch(
            protpy.instability_index, desc_name="Instability Index")

    return self.instability_index
964
+
965
def get_isoelectric_point(self) -> pd.DataFrame:
    """
    Return the Isoelectric Point descriptor, calculating it with the protpy
    package on first use. The isoelectric point is the estimated pH at
    which the protein carries no net charge, found iteratively from
    standard pKa values of the ionisable residues.

    Parameters
    ==========
    None

    Returns
    =======
    :isoelectric_point: pd.DataFrame
        dataframe of IsoelectricPoint values, shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.isoelectric_point.empty:
        self.isoelectric_point = self._calculate_descriptor_batch(
            protpy.isoelectric_point, desc_name="Isoelectric Point")

    return self.isoelectric_point
990
+
991
def get_molecular_weight(self) -> pd.DataFrame:
    """
    Return the Molecular Weight descriptor, calculating it with the protpy
    package on first use. This is the average molecular weight derived from
    residue masses, corrected for the water lost at each peptide bond.

    Parameters
    ==========
    None

    Returns
    =======
    :molecular_weight: pd.DataFrame
        dataframe of MolecularWeight values (Da), shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.molecular_weight.empty:
        self.molecular_weight = self._calculate_descriptor_batch(
            protpy.molecular_weight, desc_name="Molecular Weight")

    return self.molecular_weight
1016
+
1017
def get_charge_distribution(self) -> pd.DataFrame:
    """
    Return the Charge Distribution descriptor, calculating it with the
    protpy package on first use. It gives the positive, negative and net
    charge contributions of the ionisable residues at a given pH, using the
    Henderson-Hasselbalch equation.

    The pH is read from the 'ph' key of the charge_distribution section of
    the descriptor parameters; 7.4 is used when the section or the key is
    missing.

    Parameters
    ==========
    None

    Returns
    =======
    :charge_distribution: pd.DataFrame
        dataframe of charge values, shape N x 3 (PositiveCharge,
        NegativeCharge, NetCharge).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.charge_distribution.empty:
        # a missing or empty config section is treated as "no overrides"
        settings = getattr(self.desc_parameters, 'charge_distribution', {})
        ph = (settings or {}).get('ph', 7.4)

        self.charge_distribution = self._calculate_descriptor_batch(
            protpy.charge_distribution, desc_name="Charge Distribution", ph=ph)

    return self.charge_distribution
1047
+
1048
def get_hydrophobic_polar_charged_composition(self) -> pd.DataFrame:
    """
    Return the Hydrophobic/Polar/Charged Composition (HPC) descriptor,
    calculating it with the protpy package on first use. HPC is the
    percentage of residues in each of three physicochemical groups:
    hydrophobic (A, C, F, I, L, M, V, W, Y), polar (G, N, Q, S, T) and
    charged (D, E, H, K, R).

    Parameters
    ==========
    None

    Returns
    =======
    :hydrophobic_polar_charged_composition: pd.DataFrame
        dataframe of HPC values, shape N x 3 (Hydrophobic, Polar, Charged).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.hydrophobic_polar_charged_composition.empty:
        self.hydrophobic_polar_charged_composition = self._calculate_descriptor_batch(
            protpy.hydrophobic_polar_charged_composition,
            desc_name="Hydrophobic/Polar/Charged Composition")

    return self.hydrophobic_polar_charged_composition
1074
+
1075
def get_secondary_structure_propensity(self) -> pd.DataFrame:
    """
    Return the Secondary Structure Propensity (SSP) descriptor, calculating
    it with the protpy package on first use. SSP is the average Chou-Fasman
    propensity for alpha-helix, beta-sheet and random coil conformations
    over all residues.

    Parameters
    ==========
    None

    Returns
    =======
    :secondary_structure_propensity: pd.DataFrame
        dataframe of SSP values, shape N x 3 (Helix, Sheet, Coil).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.secondary_structure_propensity.empty:
        self.secondary_structure_propensity = self._calculate_descriptor_batch(
            protpy.secondary_structure_propensity,
            desc_name="Secondary Structure Propensity")

    return self.secondary_structure_propensity
1100
+
1101
def get_kmer_composition(self) -> pd.DataFrame:
    """
    Return the k-mer Composition descriptor, calculating it with the protpy
    package on first use. It is the frequency of every possible residue
    subsequence of length k, expressed as a percentage of the total number
    of k-mers.

    The value of k is read from the 'k' key of the kmer_composition section
    of the descriptor parameters; 2 (dipeptides) is used when the section
    or the key is missing.

    Parameters
    ==========
    None

    Returns
    =======
    :kmer_composition: pd.DataFrame
        dataframe of k-mer composition values, shape N x 20^k (e.g. N x 400
        for k=2).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.kmer_composition.empty:
        # a missing or empty config section is treated as "no overrides"
        settings = getattr(self.desc_parameters, 'kmer_composition', {})
        k = (settings or {}).get('k', 2)

        self.kmer_composition = self._calculate_descriptor_batch(
            protpy.kmer_composition, desc_name="k-mer Composition", k=k)

    return self.kmer_composition
1131
+
1132
def get_reduced_alphabet_composition(self) -> pd.DataFrame:
    """
    Return the Reduced Alphabet Composition descriptor, calculating it with
    the protpy package on first use. It is the amino acid composition after
    residues are mapped onto a reduced alphabet of physicochemical groups.
    Supported alphabet sizes: 2, 3, 4, 6.

    The size is read from the 'alphabet_size' key of the
    reduced_alphabet_composition section of the descriptor parameters; 6 is
    used when the section or the key is missing.

    Parameters
    ==========
    None

    Returns
    =======
    :reduced_alphabet_composition: pd.DataFrame
        dataframe of reduced composition values, shape N x alphabet_size.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.reduced_alphabet_composition.empty:
        # a missing or empty config section is treated as "no overrides"
        settings = getattr(self.desc_parameters, 'reduced_alphabet_composition', {})
        alphabet_size = (settings or {}).get('alphabet_size', 6)

        self.reduced_alphabet_composition = self._calculate_descriptor_batch(
            protpy.reduced_alphabet_composition,
            desc_name="Reduced Alphabet Composition",
            alphabet_size=alphabet_size)

    return self.reduced_alphabet_composition
1162
+
1163
def get_motif_composition(self) -> pd.DataFrame:
    """
    Return the Motif Composition descriptor, calculating it with the protpy
    package on first use. It counts occurrences (including overlapping
    ones) of biological sequence motifs matched with regular expressions.
    Eight built-in motifs are used by default; a custom name->pattern
    mapping can be supplied through the 'motifs' key of the
    motif_composition section of the descriptor parameters.

    Parameters
    ==========
    None

    Returns
    =======
    :motif_composition: pd.DataFrame
        dataframe of motif counts, shape N x len(motifs).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.motif_composition.empty:
        settings = getattr(self.desc_parameters, 'motif_composition', {})
        # any falsy value (missing key, empty list/dict) collapses to None,
        # which is passed on so protpy falls back to its built-in motifs
        motifs = (settings or {}).get('motifs', None) or None

        self.motif_composition = self._calculate_descriptor_batch(
            protpy.motif_composition, desc_name="Motif Composition", motifs=motifs)

    return self.motif_composition
1197
+
1198
def get_amino_acid_pair_composition(self) -> pd.DataFrame:
    """
    Return the Amino Acid Pair Composition descriptor, calculating it with
    the protpy package on first use. It is the frequency of all 400
    residue-pair combinations, with column names annotated by the
    physicochemical class of each residue.

    Parameters
    ==========
    None

    Returns
    =======
    :amino_acid_pair_composition: pd.DataFrame
        dataframe of pair composition values, shape N x 400.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.amino_acid_pair_composition.empty:
        self.amino_acid_pair_composition = self._calculate_descriptor_batch(
            protpy.amino_acid_pair_composition,
            desc_name="Amino Acid Pair Composition")

    return self.amino_acid_pair_composition
1223
+
1224
def get_aliphatic_index(self) -> pd.DataFrame:
    """
    Return the Aliphatic Index descriptor, calculating it with the protpy
    package on first use. The index measures the relative volume occupied
    by aliphatic side chains (Ala, Val, Ile, Leu); higher values indicate
    greater thermostability.

    Parameters
    ==========
    None

    Returns
    =======
    :aliphatic_index: pd.DataFrame
        dataframe of AliphaticIndex values, shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.aliphatic_index.empty:
        self.aliphatic_index = self._calculate_descriptor_batch(
            protpy.aliphatic_index, desc_name="Aliphatic Index")

    return self.aliphatic_index
1249
+
1250
def get_extinction_coefficient(self) -> pd.DataFrame:
    """
    Return the Extinction Coefficient descriptor, calculating it with the
    protpy package on first use. It is the molar extinction coefficient at
    280 nm derived from the number of Trp (W), Tyr (Y) and Cys (C)
    residues, reported for the reduced and oxidized states.

    Parameters
    ==========
    None

    Returns
    =======
    :extinction_coefficient: pd.DataFrame
        dataframe of extinction coefficient values, shape N x 2
        (ExtCoeff_Reduced, ExtCoeff_Oxidized).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.extinction_coefficient.empty:
        self.extinction_coefficient = self._calculate_descriptor_batch(
            protpy.extinction_coefficient, desc_name="Extinction Coefficient")

    return self.extinction_coefficient
1276
+
1277
def get_boman_index(self) -> pd.DataFrame:
    """
    Return the Boman Index descriptor, calculating it with the protpy
    package on first use. The index is the sum of amino acid solubility
    values divided by the sequence length, and predicts the potential for
    protein-protein interactions.

    Parameters
    ==========
    None

    Returns
    =======
    :boman_index: pd.DataFrame
        dataframe of BomanIndex values, shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.boman_index.empty:
        self.boman_index = self._calculate_descriptor_batch(
            protpy.boman_index, desc_name="Boman Index")

    return self.boman_index
1302
+
1303
def get_aggregation_propensity(self) -> pd.DataFrame:
    """
    Return the Aggregation Propensity descriptor, calculating it with the
    protpy package on first use. Aggregation-prone regions are estimated
    with a sliding window that combines Kyte-Doolittle hydrophobicity and
    charge neutrality; the result is the number of qualifying windows and
    the fraction of the sequence they cover.

    The 'window', 'hydrophobicity_threshold' and 'charge_threshold' keys
    are read from the aggregation_propensity section of the descriptor
    parameters; 5, 2.0 and 1 respectively are used for any key that is
    missing, or when the whole section is missing.

    Parameters
    ==========
    None

    Returns
    =======
    :aggregation_propensity: pd.DataFrame
        dataframe of aggregation values, shape N x 2
        (AggregProneRegions, AggregProneFraction).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.aggregation_propensity.empty:
        # a missing or empty config section is treated as "no overrides"
        settings = getattr(self.desc_parameters, 'aggregation_propensity', {}) or {}

        self.aggregation_propensity = self._calculate_descriptor_batch(
            protpy.aggregation_propensity,
            desc_name="Aggregation Propensity",
            window=settings.get('window', 5),
            hydrophobicity_threshold=settings.get('hydrophobicity_threshold', 2.0),
            charge_threshold=settings.get('charge_threshold', 1))

    return self.aggregation_propensity
1339
+
1340
def get_hydrophobic_moment(self) -> pd.DataFrame:
    """
    Return the Hydrophobic Moment descriptor, calculating it with the
    protpy package on first use. It gives the mean and maximum hydrophobic
    moment across sliding windows, using the Eisenberg hydrophobicity scale
    and a helical-wheel projection, and so captures the amphipathicity of
    putative helix segments.

    The 'window' and 'angle' keys are read from the hydrophobic_moment
    section of the descriptor parameters; 11 and 100 respectively are used
    for any key that is missing, or when the whole section is missing.

    Parameters
    ==========
    None

    Returns
    =======
    :hydrophobic_moment: pd.DataFrame
        dataframe of hydrophobic moment values, shape N x 2
        (HydrophobicMoment_Mean, HydrophobicMoment_Max).
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.hydrophobic_moment.empty:
        # a missing or empty config section is treated as "no overrides"
        settings = getattr(self.desc_parameters, 'hydrophobic_moment', {}) or {}

        self.hydrophobic_moment = self._calculate_descriptor_batch(
            protpy.hydrophobic_moment,
            desc_name="Hydrophobic Moment",
            window=settings.get('window', 11),
            angle=settings.get('angle', 100))

    return self.hydrophobic_moment
1374
+
1375
def get_shannon_entropy(self) -> pd.DataFrame:
    """
    Return the Shannon Entropy descriptor, calculating it with the protpy
    package on first use. Shannon entropy is an information-theoretic
    measure of amino acid diversity, H = -sum(p_i * log2(p_i)). A value of
    0 means a completely repetitive sequence; the theoretical maximum of
    ~4.322 bits corresponds to a perfectly uniform distribution over the 20
    canonical amino acids.

    Parameters
    ==========
    None

    Returns
    =======
    :shannon_entropy: pd.DataFrame
        dataframe of ShannonEntropy values, shape N x 1.
    """
    # compute once; later calls reuse the dataframe stored on the instance
    if self.shannon_entropy.empty:
        self.shannon_entropy = self._calculate_descriptor_batch(
            protpy.shannon_entropy, desc_name="Shannon Entropy")

    return self.shannon_entropy
1402
+
1403
def get_moreaubroto_autocorrelation(self) -> pd.DataFrame:
    """
    Return the MoreauBroto Autocorrelation (MBAuto) descriptor, calculating
    it with the protpy package on first use. Autocorrelation descriptors
    are a class of topological descriptors, also known as molecular
    connectivity indices, that describe the level of correlation between
    two objects (protein or peptide sequences) in terms of a specific
    structural or physicochemical property, defined from the distribution
    of amino acid properties along the sequence.

    MBAuto uses the property values themselves as the basis for
    measurement; derivations and detailed explanations are outlined in [4].
    The number of features generated is lag * number of properties, and the
    values are normalized when the "normalize" parameter is set in the
    config file. The lag, properties and normalize settings are all read
    from the moreaubroto_autocorrelation section of the descriptor
    parameters. With the default 8 properties and the default lag of 30,
    240 features are generated. The default 8 properties are:

    AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales.
    AccNo. BHAR880101 - Average Flexibility Indices.
    AccNo. CHAM820101 - Polarizability Parameter.
    AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole.
    AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide.
    AccNo. BIGC670101 - Residue Volume.
    AccNo. CHAM810101 - Steric Parameter.
    AccNo. DAYM780201 - Relative Mutability.

    Parameters
    ==========
    None

    Returns
    =======
    :moreaubroto_autocorrelation: pd.DataFrame
        dataframe of MBAuto values of shape N x M, where N is the number of
        protein sequences and M = lag * number of properties. By default
        the shape is N x 240 (30 features per property, 8 properties,
        lag=30).
    """
    #only calculate when nothing is stored yet; otherwise reuse the stored dataframe
    if self.moreaubroto_autocorrelation.empty:
        #descriptor-specific settings from the config file
        settings = self.desc_parameters.moreaubroto_autocorrelation
        lag = settings["lag"]
        properties = settings["properties"]
        normalize = settings["normalize"]

        self.moreaubroto_autocorrelation = self._calculate_descriptor_batch(
            protpy.moreaubroto_autocorrelation,
            desc_name="MoreauBroto Autocorrelation",
            lag=lag, properties=properties, normalize=normalize)

    return self.moreaubroto_autocorrelation
1463
+
1464
def get_moran_autocorrelation(self) -> pd.DataFrame:
    """
    Compute the Moran autocorrelation (MAuto) descriptor for the protein
    sequences through the protpy package. MAuto works on the deviations of
    the property values from their average values.
    **see the MBAuto docstring for the general autocorrelation description.

    Parameters
    ==========
    None

    Returns
    =======
    :moran_autocorrelation: pd.DataFrame
        pandas DataFrame of MAuto values. Shape is N x M, with N the number
        of protein sequences and M = lag * number of properties. With the
        defaults (lag=30, 8 properties) the shape is N x 240.
    """
    #only compute when nothing has been cached on the instance yet
    if self.moran_autocorrelation.empty:
        #descriptor-specific settings taken from the config parameters
        settings = self.desc_parameters.moran_autocorrelation
        batch_kwargs = {
            "lag": settings["lag"],
            "properties": settings["properties"],
            "normalize": settings["normalize"],
        }

        #delegate the per-sequence calculation to the shared batch helper
        self.moran_autocorrelation = self._calculate_descriptor_batch(
            protpy.moran_autocorrelation,
            desc_name="Moran Autocorrelation",
            **batch_kwargs
        )

    return self.moran_autocorrelation
1503
+
1504
def get_geary_autocorrelation(self) -> pd.DataFrame:
    """
    Compute the Geary autocorrelation (GAuto) descriptor for the protein
    sequences through the protpy package. GAuto is based on the
    square-difference of property values rather than on vector-products
    (of property values or deviations).
    **see the MBAuto docstring for the general autocorrelation description.

    Parameters
    ==========
    None

    Returns
    =======
    :geary_autocorrelation: pd.DataFrame
        pandas DataFrame of GAuto values. Shape is N x M, with N the number
        of protein sequences and M = lag * number of properties. With the
        defaults (lag=30, 8 properties) the shape is N x 240.
    """
    #only compute when nothing has been cached on the instance yet
    if self.geary_autocorrelation.empty:
        #descriptor-specific settings taken from the config parameters
        settings = self.desc_parameters.geary_autocorrelation
        batch_kwargs = {
            "lag": settings["lag"],
            "properties": settings["properties"],
            "normalize": settings["normalize"],
        }

        #delegate the per-sequence calculation to the shared batch helper
        self.geary_autocorrelation = self._calculate_descriptor_batch(
            protpy.geary_autocorrelation,
            desc_name="Geary Autocorrelation",
            **batch_kwargs
        )

    return self.geary_autocorrelation
1544
+
1545
def get_ctd_composition(self) -> pd.DataFrame:
    """
    Extract the Composition (C_CTD) physicochemical/structural descriptor
    from the full CTD descriptor. Composition is the number of amino acids
    of a particular property divided by the total number of amino acids.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_composition: pd.DataFrame
        pandas DataFrame of C_CTD values. Shape is N x M, with N the number
        of protein sequences and M = (number of physicochemical
        properties * 3), 3 features per property. With the default
        "hydrophobicity" property the output is N x 3.
    """
    if self.ctd_composition.empty:
        #the composition columns are sliced out of the full CTD dataframe,
        #so make sure that has been calculated first
        if self.ctd.empty:
            self.ctd = self.get_ctd()

        #properties may be configured as a list or a comma separated string
        selected_props = self.desc_parameters.ctd["property"]
        if not isinstance(selected_props, list):
            selected_props = selected_props.split(',')
        use_all = self.desc_parameters.ctd["all"]

        #composition occupies the leading columns: 21 when all properties
        #are used, otherwise 3 per configured property
        n_cols = 21 if use_all else 3 * len(selected_props)
        self.ctd_composition = self.ctd.iloc[:, 0:n_cols]

    return self.ctd_composition
1592
+
1593
def get_ctd_transition(self) -> pd.DataFrame:
    """
    Extract the Transition (T_CTD) physicochemical/structural descriptor
    from the full CTD descriptor. Transition is the number of transitions
    from a particular property to a different property divided by
    (total number of amino acids - 1).

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_transition: pd.DataFrame
        pandas DataFrame of T_CTD values. Shape is N x M, with N the number
        of protein sequences and M = (number of physicochemical
        properties * 3), 3 features per property. With the default
        "hydrophobicity" property the output is N x 3.
    """
    if self.ctd_transition.empty:
        #the transition columns are sliced out of the full CTD dataframe,
        #so make sure that has been calculated first
        if self.ctd.empty:
            self.ctd = self.get_ctd()

        #properties may be configured as a list or a comma separated string
        selected_props = self.desc_parameters.ctd["property"]
        if not isinstance(selected_props, list):
            selected_props = selected_props.split(',')
        use_all = self.desc_parameters.ctd["all"]

        #transition columns directly follow the composition columns and
        #span the same width (3 features per property)
        if use_all:
            first, last = 21, 42
        else:
            first = 3 * len(selected_props)
            last = 3 * len(selected_props) * 2
        self.ctd_transition = self.ctd.iloc[:, first:last]

    return self.ctd_transition
1640
+
1641
def get_ctd_distribution(self) -> pd.DataFrame:
    """
    Extract the Distribution (D_CTD) physicochemical/structural descriptor
    from the full CTD descriptor. Distribution is the chain length within
    which the first, 25%, 50%, 75% and 100% of the amino acids of a
    particular property are located.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd_distribution: pd.DataFrame
        pandas DataFrame of D_CTD values. Shape is N x M, with N the number
        of protein sequences and M = (number of physicochemical
        properties * 15), 15 features per property. With the default
        "hydrophobicity" property the output is N x 15.
    """
    if self.ctd_distribution.empty:
        #the distribution columns are sliced out of the full CTD dataframe,
        #so make sure that has been calculated first
        if self.ctd.empty:
            self.ctd = self.get_ctd()

        #properties may be configured as a list or a comma separated string
        selected_props = self.desc_parameters.ctd["property"]
        if not isinstance(selected_props, list):
            selected_props = selected_props.split(',')
        use_all = self.desc_parameters.ctd["all"]

        #distribution is everything after the composition and transition
        #columns (3 features per property each)
        offset = 42 if use_all else 2 * (3 * len(selected_props))
        self.ctd_distribution = self.ctd.iloc[:, offset:]

    return self.ctd_distribution
1688
+
1689
def get_ctd(self) -> pd.DataFrame:
    """
    Calculate all CTD (Composition, Transition, Distribution)
    physicochemical/structural descriptors of the protein sequences using
    the custom-built protpy package.

    Parameters
    ==========
    None

    Returns
    =======
    :ctd: pd.DataFrame
        pandas DataFrame of CTD values for the protein sequences. Output
        will be of the shape N x M, where N is the number of protein
        sequences and M is (number of physicochemical properties * 21),
        with 21 being the number of features calculated for each of the
        CTD descriptors per property. Using all properties will generate
        an output of N x 147, by default the "hydrophobicity" property is
        used, generating an output of N x 21.
    """
    #if attribute already calculated & not empty then return it
    if not self.ctd.empty:
        return self.ctd

    #get descriptor-specific parameters from config file
    ctd_property = self.desc_parameters.ctd["property"]
    all_ctd = self.desc_parameters.ctd["all"]

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration
    per_sequence = [
        protpy.ctd_(seq, property=ctd_property, all_ctd=all_ctd)
        for seq in self.protein_seqs
    ]

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    ctd_df = pd.concat(per_sequence) if per_sequence else pd.DataFrame()

    self.ctd = ctd_df.reset_index(drop=True)

    return self.ctd
1729
+
1730
def get_conjoint_triad(self) -> pd.DataFrame:
    """
    Calculate Conjoint Triad (CTriad) of protein sequences using the
    custom-built protpy package. The descriptor mainly considers neighbour
    relationships in protein sequences by encoding each protein sequence
    using the triad (continuous three amino acids) frequency distribution
    extracted from a 7-letter reduced alphabet [11]. CTriad calculates 343
    different features (7x7x7), with the output being of shape N x 343
    where N is the number of sequences.

    Parameters
    ==========
    None

    Returns
    =======
    :conjoint_triad: pd.DataFrame
        pandas DataFrame of CTriad descriptor values for all protein
        sequences. DataFrame will be of the shape N x 343, where N is the
        number of protein sequences and 343 is the number of features
        calculated from the descriptor for a sequence.
    """
    #if attribute already calculated & not empty then return it
    if not self.conjoint_triad.empty:
        return self.conjoint_triad

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration
    per_sequence = [protpy.conjoint_triad(seq) for seq in self.protein_seqs]

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    conjoint_triad_df = pd.concat(per_sequence) if per_sequence else pd.DataFrame()

    self.conjoint_triad = conjoint_triad_df.reset_index(drop=True)

    return self.conjoint_triad
1765
+
1766
def get_sequence_order_coupling_number(self) -> pd.DataFrame:
    """
    Calculate Sequence Order Coupling Number (SOCN) features for the input
    protein sequences using the custom-built protpy package. SOCN computes
    the dissimilarity between amino acid pairs. The distance between amino
    acid pairs is determined by d which varies between 1 to lag. For each
    d, it computes the sum of the dissimilarities of all amino acid pairs.
    The number of output features can be calculated as N * 2, where
    N = lag, by default this value is 30 which generates an output of
    M x 60 where M is the number of protein sequences.

    Parameters
    ==========
    None

    Returns
    =======
    :sequence_order_coupling_number_df: pd.DataFrame
        DataFrame of SOCN descriptor values for all protein sequences.
        Output will be of the shape N x M, where N is the number of protein
        sequences and M is the number of features calculated from the
        descriptor (calculated as N * 2 where N = lag).
    """
    #if attribute already calculated & not empty then return it
    if not self.sequence_order_coupling_number.empty:
        return self.sequence_order_coupling_number

    #get descriptor-specific parameters from config file
    lag = self.desc_parameters.sequence_order_coupling_number["lag"]
    distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"]

    #if no distance matrix present in config then SOCN is calculated using
    #both matrices; the check does not depend on the sequence so it is
    #evaluated once, outside the loop
    use_all_matrices = distance_matrix is None or distance_matrix == ""

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration
    per_sequence = []
    for seq in self.protein_seqs:
        if use_all_matrices:
            socn_seq = protpy.sequence_order_coupling_number_all(seq, lag=lag)
        else:
            socn_seq = protpy.sequence_order_coupling_number(
                seq, lag=lag, distance_matrix=distance_matrix)
        per_sequence.append(socn_seq)

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    sequence_order_coupling_number_df = (
        pd.concat(per_sequence) if per_sequence else pd.DataFrame()
    )

    self.sequence_order_coupling_number = sequence_order_coupling_number_df.reset_index(drop=True)

    return self.sequence_order_coupling_number
1813
+
1814
def get_quasi_sequence_order(self) -> pd.DataFrame:
    """
    Calculate Quasi Sequence Order features for the protein sequences using
    the custom-built protpy package. The quasi-sequence-order descriptors
    were proposed by K.C. Chou, et.al. [10]. They are derived from the
    distance matrix between the 20 amino acids. By default, the
    Scheider-Wrede physicochemical distance matrix was used. Also utilised
    in the descriptor calculation is the Grantham chemical distance matrix.
    Both of these matrices are used by Grantham et. al. in the calculation
    of the descriptor [13]. 100 values are calculated per sequence, thus
    generating an output of N x 100, where N is the number of protein
    sequences.

    Parameters
    ==========
    None

    Returns
    =======
    :quasi_sequence_order_df: pd.DataFrame
        DataFrame of quasi-sequence-order descriptor values for the
        protein sequences, with output shape N x 100 where N is the number
        of sequences and 100 the number of calculated features.
    """
    #if attribute already calculated & not empty then return it
    if not self.quasi_sequence_order.empty:
        return self.quasi_sequence_order

    #get descriptor-specific parameters from config file
    lag = self.desc_parameters.quasi_sequence_order["lag"]
    weight = self.desc_parameters.quasi_sequence_order["weight"]
    distance_matrix = self.desc_parameters.quasi_sequence_order["distance_matrix"]

    #if no distance matrix present in config then quasi seq order is
    #calculated using both matrices; the check does not depend on the
    #sequence so it is evaluated once, outside the loop
    use_all_matrices = distance_matrix is None or distance_matrix == ""

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration
    per_sequence = []
    for seq in self.protein_seqs:
        if use_all_matrices:
            qso_seq = protpy.quasi_sequence_order_all(seq, lag=lag, weight=weight)
        else:
            qso_seq = protpy.quasi_sequence_order(
                seq, lag=lag, weight=weight, distance_matrix=distance_matrix)
        per_sequence.append(qso_seq)

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    quasi_sequence_order_df = pd.concat(per_sequence) if per_sequence else pd.DataFrame()

    self.quasi_sequence_order = quasi_sequence_order_df.reset_index(drop=True)

    return self.quasi_sequence_order
1864
+
1865
def get_pseudo_amino_acid_composition(self) -> pd.DataFrame:
    """
    Calculate Pseudo Amino Acid Composition (PAAComp) descriptor using the
    custom-built protpy package. PAAComp combines the vanilla amino acid
    composition descriptor with additional local features, such as
    correlation between residues of a certain distance, as amino acid
    composition doesn't take into account sequence order info. The pseudo
    components of the descriptor are a series of rank-different correlation
    factors [10]. The first 20 components are a weighted sum of the amino
    acid composition and 30 are physicochemical square correlations as
    dictated by the lambda and properties parameters. This generates an
    output of (20 + lambda) = 50 features per sequence when using the
    default lambda of 30. By default, the physicochemical properties used
    are hydrophobicity and hydrophillicity, with a lambda of 30 and weight
    of 0.05.

    Parameters
    ==========
    None

    Returns
    =======
    :pseudo_amino_acid_composition_df: pd.DataFrame
        DataFrame of pseudo amino acid composition descriptor values for
        the protein sequences of output shape N x (20 + lambda), where N is
        the number of protein sequences. With default lambda of 30, the
        output shape will be N x 50.
    """
    #if attribute already calculated & not empty then return it
    if not self.pseudo_amino_acid_composition.empty:
        return self.pseudo_amino_acid_composition

    #get descriptor-specific parameters from config file
    lamda = self.desc_parameters.pseudo_amino_acid_composition["lambda"]
    weight = self.desc_parameters.pseudo_amino_acid_composition["weight"]
    properties = self.desc_parameters.pseudo_amino_acid_composition["properties"]

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration.
    #tqdm loop to visualise progress as descriptor can take some time to execute
    per_sequence = []
    for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
            desc="PAAComp", mininterval=30, ncols=90):
        per_sequence.append(
            protpy.pseudo_amino_acid_composition(
                seq, lamda=lamda, weight=weight, properties=properties)
        )

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    pseudo_amino_acid_composition_df = (
        pd.concat(per_sequence) if per_sequence else pd.DataFrame()
    )

    self.pseudo_amino_acid_composition = pseudo_amino_acid_composition_df.reset_index(drop=True)

    return self.pseudo_amino_acid_composition
1912
+
1913
def get_amphiphilic_pseudo_amino_acid_composition(self) -> pd.DataFrame:
    """
    Calculate Amphiphilic Pseudo Amino Acid Composition (APAAComp) of
    protein sequences using the custom-built protpy package. APAAComp has
    the same form as the amino acid composition, but contains much more
    information that is related to the sequence order of a protein and the
    distribution of the hydrophobic and hydrophilic amino acids along its
    chain. The first 20 numbers in the descriptor are the components of the
    conventional amino acid composition; the next 2*lambda numbers are a
    set of correlation factors that reflect different hydrophobicity and
    hydrophilicity distribution patterns along a protein chain.

    Parameters
    ==========
    None

    Returns
    =======
    :amphiphilic_pseudo_amino_acid_composition_df: pd.DataFrame
        DataFrame of Amphiphilic pseudo amino acid composition descriptor
        values for the protein sequences of output shape N x 80, where N is
        the number of protein sequences and 80 is calculated as
        (20 + 2*lambda).
    """
    #if attribute already calculated & not empty then return it
    if not self.amphiphilic_pseudo_amino_acid_composition.empty:
        return self.amphiphilic_pseudo_amino_acid_composition

    #get descriptor-specific parameters from config file
    lamda = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["lambda"]
    weight = self.desc_parameters.amphiphilic_pseudo_amino_acid_composition["weight"]

    #calculate the descriptor per sequence, collecting the outputs in a
    #list so they are concatenated once; concatenating inside the loop
    #re-copies every previously accumulated row on each iteration.
    #tqdm loop to visualise progress as descriptor can take some time to execute
    per_sequence = []
    for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
            desc="APAAComp", mininterval=30, ncols=90):
        per_sequence.append(
            protpy.amphiphilic_pseudo_amino_acid_composition(
                seq, lamda=lamda, weight=weight)
        )

    #pd.concat raises on an empty list, so fall back to an empty dataframe
    #when there are no sequences
    amphiphilic_pseudo_amino_acid_composition_df = (
        pd.concat(per_sequence) if per_sequence else pd.DataFrame()
    )

    self.amphiphilic_pseudo_amino_acid_composition = (
        amphiphilic_pseudo_amino_acid_composition_df.reset_index(drop=True)
    )

    return self.amphiphilic_pseudo_amino_acid_composition
1958
+
1959
def get_all_descriptors(self, export: bool = False, descriptors_export_filename: str = "") -> pd.DataFrame:
    """
    Calculate every individual descriptor and join the resulting DataFrames
    column-wise into a single DataFrame. The number of features produced
    depends on the meta parameters of several descriptors: the number of
    properties and max lag for the Autocorrelation, SOCN and QSO
    descriptors, the number of properties and lamda for PAAComp and the
    lambda for APAAComp.

    Setting export=True writes the combined descriptors to a csv so they do
    not have to be recalculated when used in multiple encoding processes;
    they can be read back with the import_descriptors function. The csv is
    written to the "descriptors_csv" value of the config unless a name is
    given through descriptors_export_filename.

    Parameters
    ==========
    :export: bool (default=False)
        if true then all calculated descriptors are exported to a CSV.
    :descriptors_export_filename: str
        filepath/filename for the exported csv, used when export=True. A
        ".csv" extension is appended if the name has no extension.

    Returns
    =======
    :all_descriptor_df: pd.DataFrame
        concatenated dataframe of all individual descriptors. Using the
        default attributes and their associated values, the output will be
        of the shape N x 10572, where N is the number of protein sequences
        and 10572 is the number of descriptor features.
    """
    print('############################### Exporting all descriptors ################################\n')

    #start time counter
    started_at = time.time()

    #descriptors handled here; each name is both the attribute that stores
    #the descriptor dataframe and the suffix of its get_<name>() method
    handled = (
        "amino_acid_composition", "dipeptide_composition", "tripeptide_composition",
        "moreaubroto_autocorrelation", "moran_autocorrelation", "geary_autocorrelation",
        "ctd", "ctd_composition", "ctd_transition", "ctd_distribution",
        "conjoint_triad", "sequence_order_coupling_number", "quasi_sequence_order",
        "pseudo_amino_acid_composition", "amphiphilic_pseudo_amino_acid_composition",
    )

    #walk over the available descriptors, computing any handled descriptor
    #whose dataframe attribute is still empty
    for descr in tqdm(self.all_descriptors_list(), unit=" descriptor", position=0,
            desc="Descriptors", mininterval=30, ncols=90):
        for name in handled:
            if descr == name and getattr(self, name).empty:
                setattr(self, name, getattr(self, "get_" + name)())

    #stop time counter, calculate elapsed time
    elapsed = time.time() - started_at

    print(f'\nElapsed time for calculating all descriptors: {elapsed/60:.2f} minutes.')
    print('\n##########################################################################################')

    #concatenate the individual descriptor dataframes column-wise (the full
    #ctd dataframe is represented by its three component dataframes)
    all_descriptor_df = pd.concat(
        [
            self.amino_acid_composition,
            self.dipeptide_composition,
            self.tripeptide_composition,
            self.moreaubroto_autocorrelation,
            self.moran_autocorrelation,
            self.geary_autocorrelation,
            self.ctd_composition,
            self.ctd_transition,
            self.ctd_distribution,
            self.conjoint_triad,
            self.sequence_order_coupling_number,
            self.quasi_sequence_order,
            self.pseudo_amino_acid_composition,
            self.amphiphilic_pseudo_amino_acid_composition,
        ],
        axis=1,
    )
    self.all_descriptors = all_descriptor_df

    #nothing more to do unless an export was requested
    if not export:
        return all_descriptor_df

    if descriptors_export_filename == "":
        #no explicit filename: use the config value, or a default name if unset
        configured = self.desc_config.descriptors_csv
        if configured == "" or configured is None:
            self.desc_config.descriptors_csv = "descriptors_output.csv"
        self.all_descriptors.to_csv(self.desc_config.descriptors_csv, index=False)
    else:
        #append extension if not present on filename - export to csv
        _, extension = os.path.splitext(os.path.basename(descriptors_export_filename))
        if extension == "":
            descriptors_export_filename = descriptors_export_filename + ".csv"
        self.all_descriptors.to_csv(descriptors_export_filename, index=False)

    return all_descriptor_df
2082
+
2083
+ def get_descriptor_encoding(self, descriptor: str) -> Optional[pd.DataFrame]:
2084
+ """
2085
+ Get the protein descriptor values of a specified input descriptor. If the
2086
+ sought descriptor has already been calculated then its attribute is returned,
2087
+ else the descriptor is calculated using its get_descriptor function.
2088
+
2089
+ Parameters
2090
+ ==========
2091
+ :descriptor: str
2092
+ name of descriptor to return. Method can accept the approximate name
2093
+ of the descriptor, e.g. 'amino_comp'/'aa_composition' etc will return
2094
+ the 'amino_acid_composition' descriptor. This functionality is realised
2095
+ using the difflib library and its built-in get_close_matches function.
2096
+
2097
+ Returns
2098
+ =======
2099
+ :desc_encoding: pd.DataFrame or None
2100
+ dataframe of matching descriptor attribute. None returned if no matching
2101
+ descriptor found.
2102
+ """
2103
+ #input descriptor parameter should be a string
2104
+ if not(isinstance(descriptor, str)):
2105
+ raise TypeError('Input parameter {} is not of correct datatype string, got {}.'.
2106
+ format(descriptor, type(descriptor)))
2107
+
2108
+ #remove any whitespace from input parameter, replace spaces with underscores and lowercase
2109
+ descriptor = descriptor.strip().replace(' ', '_').lower()
2110
+
2111
+ #validate input descriptor is a valid available descriptor, get its closest match
2112
+ desc_matches = get_close_matches(descriptor, self.valid_descriptors, cutoff=0.6)
2113
+ if (desc_matches != []):
2114
+ desc = desc_matches[0] #set desc to closest descriptor match found
2115
+ else:
2116
+ raise ValueError(f"Could not find a match for the input descriptor {descriptor} in"
2117
+ f" list of available valid models:\n{self.valid_descriptors}.")
2118
+
2119
+ #if sought descriptor attribute dataframe is empty, call the descriptor's
2120
+ # get_descriptor() function, set desc_encoding to descriptor attribute
2121
+ if (desc == 'amino_acid_composition'):
2122
+ if (getattr(self, desc).empty):
2123
+ self.get_amino_acid_composition()
2124
+ desc_encoding = self.amino_acid_composition
2125
+
2126
+ elif (desc == 'dipeptide_composition'):
2127
+ if (getattr(self, desc).empty):
2128
+ self.get_dipeptide_composition()
2129
+ desc_encoding = self.dipeptide_composition
2130
+
2131
+ elif (desc == 'tripeptide_composition'):
2132
+ if (getattr(self, desc).empty):
2133
+ self.get_tripeptide_composition()
2134
+ desc_encoding = self.tripeptide_composition
2135
+
2136
+ elif (desc == 'gravy'):
2137
+ if (getattr(self, desc).empty):
2138
+ self.get_gravy()
2139
+ desc_encoding = self.gravy
2140
+
2141
+ elif (desc == 'aromaticity'):
2142
+ if (getattr(self, desc).empty):
2143
+ self.get_aromaticity()
2144
+ desc_encoding = self.aromaticity
2145
+
2146
+ elif (desc == 'instability_index'):
2147
+ if (getattr(self, desc).empty):
2148
+ self.get_instability_index()
2149
+ desc_encoding = self.instability_index
2150
+
2151
+ elif (desc == 'isoelectric_point'):
2152
+ if (getattr(self, desc).empty):
2153
+ self.get_isoelectric_point()
2154
+ desc_encoding = self.isoelectric_point
2155
+
2156
+ elif (desc == 'molecular_weight'):
2157
+ if (getattr(self, desc).empty):
2158
+ self.get_molecular_weight()
2159
+ desc_encoding = self.molecular_weight
2160
+
2161
+ elif (desc == 'charge_distribution'):
2162
+ if (getattr(self, desc).empty):
2163
+ self.get_charge_distribution()
2164
+ desc_encoding = self.charge_distribution
2165
+
2166
+ elif (desc == 'hydrophobic_polar_charged_composition'):
2167
+ if (getattr(self, desc).empty):
2168
+ self.get_hydrophobic_polar_charged_composition()
2169
+ desc_encoding = self.hydrophobic_polar_charged_composition
2170
+
2171
+ elif (desc == 'secondary_structure_propensity'):
2172
+ if (getattr(self, desc).empty):
2173
+ self.get_secondary_structure_propensity()
2174
+ desc_encoding = self.secondary_structure_propensity
2175
+
2176
+ elif (desc == 'kmer_composition'):
2177
+ if (getattr(self, desc).empty):
2178
+ self.get_kmer_composition()
2179
+ desc_encoding = self.kmer_composition
2180
+
2181
+ elif (desc == 'reduced_alphabet_composition'):
2182
+ if (getattr(self, desc).empty):
2183
+ self.get_reduced_alphabet_composition()
2184
+ desc_encoding = self.reduced_alphabet_composition
2185
+
2186
+ elif (desc == 'motif_composition'):
2187
+ if (getattr(self, desc).empty):
2188
+ self.get_motif_composition()
2189
+ desc_encoding = self.motif_composition
2190
+
2191
+ elif (desc == 'amino_acid_pair_composition'):
2192
+ if (getattr(self, desc).empty):
2193
+ self.get_amino_acid_pair_composition()
2194
+ desc_encoding = self.amino_acid_pair_composition
2195
+
2196
+ elif (desc == 'aliphatic_index'):
2197
+ if (getattr(self, desc).empty):
2198
+ self.get_aliphatic_index()
2199
+ desc_encoding = self.aliphatic_index
2200
+
2201
+ elif (desc == 'extinction_coefficient'):
2202
+ if (getattr(self, desc).empty):
2203
+ self.get_extinction_coefficient()
2204
+ desc_encoding = self.extinction_coefficient
2205
+
2206
+ elif (desc == 'boman_index'):
2207
+ if (getattr(self, desc).empty):
2208
+ self.get_boman_index()
2209
+ desc_encoding = self.boman_index
2210
+
2211
+ elif (desc == 'aggregation_propensity'):
2212
+ if (getattr(self, desc).empty):
2213
+ self.get_aggregation_propensity()
2214
+ desc_encoding = self.aggregation_propensity
2215
+
2216
+ elif (desc == 'hydrophobic_moment'):
2217
+ if (getattr(self, desc).empty):
2218
+ self.get_hydrophobic_moment()
2219
+ desc_encoding = self.hydrophobic_moment
2220
+
2221
+ elif (desc == 'shannon_entropy'):
2222
+ if (getattr(self, desc).empty):
2223
+ self.get_shannon_entropy()
2224
+ desc_encoding = self.shannon_entropy
2225
+
2226
+ elif (desc == 'moreaubroto_autocorrelation'):
2227
+ if (getattr(self, desc).empty):
2228
+ self.get_moreaubroto_autocorrelation()
2229
+ desc_encoding = self.moreaubroto_autocorrelation
2230
+
2231
+ elif (desc == 'moran_autocorrelation'):
2232
+ if (getattr(self, desc).empty):
2233
+ self.get_moran_autocorrelation()
2234
+ desc_encoding = self.moran_autocorrelation
2235
+
2236
+ elif (desc == 'geary_autocorrelation'):
2237
+ if (getattr(self, desc).empty):
2238
+ self.get_geary_autocorrelation()
2239
+ desc_encoding = self.geary_autocorrelation
2240
+
2241
+ elif (desc == 'ctd'):
2242
+ if (getattr(self, desc).empty):
2243
+ self.get_ctd()
2244
+ desc_encoding = self.ctd
2245
+
2246
+ elif (desc == 'ctd_composition'):
2247
+ if (getattr(self, desc).empty):
2248
+ self.get_ctd_composition()
2249
+ desc_encoding = self.ctd_composition
2250
+
2251
+ elif (desc == 'ctd_transition'):
2252
+ if (getattr(self, desc).empty):
2253
+ self.get_ctd_transition()
2254
+ desc_encoding = self.ctd_transition
2255
+
2256
+ elif (desc == 'ctd_distribution'):
2257
+ if (getattr(self, desc).empty):
2258
+ self.get_ctd_distribution()
2259
+ desc_encoding = self.ctd_distribution
2260
+
2261
+ elif (desc == 'conjoint_triad'):
2262
+ if (getattr(self, desc).empty):
2263
+ self.get_conjoint_triad()
2264
+ desc_encoding = self.conjoint_triad
2265
+
2266
+ elif (desc == 'sequence_order_coupling_number'):
2267
+ if (getattr(self, desc).empty):
2268
+ self.get_sequence_order_coupling_number()
2269
+ desc_encoding = self.sequence_order_coupling_number
2270
+
2271
+ elif (desc == 'quasi_sequence_order'):
2272
+ if (getattr(self, desc).empty):
2273
+ self.get_quasi_sequence_order()
2274
+ desc_encoding = self.quasi_sequence_order
2275
+
2276
+ elif (desc == 'pseudo_amino_acid_composition'):
2277
+ if (getattr(self, desc).empty):
2278
+ self.get_pseudo_amino_acid_composition()
2279
+ desc_encoding = self.pseudo_amino_acid_composition
2280
+
2281
+ elif (desc == 'amphiphilic_pseudo_amino_acid_composition'):
2282
+ if (getattr(self, desc).empty):
2283
+ self.get_amphiphilic_pseudo_amino_acid_composition()
2284
+ desc_encoding = self.amphiphilic_pseudo_amino_acid_composition
2285
+ else:
2286
+ desc_encoding = None #no matching descriptor found
2287
+
2288
+ return desc_encoding
2289
+
2290
def all_descriptors_list(self, desc_combo: int = 1) -> Union[List[str], List[Tuple[str, ...]]]:
    """
    List the descriptor attributes held by this instance, optionally as
    combinations of 2 or 3 descriptors.

    Descriptor names are taken from the instance ``__dict__``: every key that
    starts with an underscore is kept (with the underscore stripped), except
    keys starting with ``_all_desc``. As an example, 33 descriptor attributes
    give 528 pairs (desc_combo=2) and 5456 triples (desc_combo=3).

    Parameters
    ==========
    :desc_combo: int (default=1)
        size of the descriptor combinations to return. Only 2 and 3 produce
        combinations; any other value returns the plain list of names.

    Returns
    =======
    :all_descriptors: List[str] or List[Tuple[str, ...]]
        list of descriptor attribute names, or list of tuples of descriptor
        combinations, in the order the attributes appear in ``__dict__``.
    """
    names = []
    for attr in self.__dict__:
        #skip public attributes and the '_all_desc*' bookkeeping attributes
        if not attr.startswith('_') or attr.startswith('_all_desc'):
            continue
        names.append(attr[1:])

    #only pairs and triples are supported, anything else falls through to the flat list
    for size in (2, 3):
        if desc_combo == size:
            return list(itertools.combinations(names, size))

    return names
2323
+
2324
def _calculate_descriptor_batch(self,
                                descriptor_func: Callable,
                                desc_name: str = "",
                                **kwargs) -> pd.DataFrame:
    """
    Generic helper method to calculate a descriptor for every sequence in
    self.protein_seqs, preventing code repetition.

    Parameters
    ==========
    :descriptor_func: Callable
        Function to calculate descriptor (e.g., protpy.amino_acid_composition).
        It is called once per sequence as descriptor_func(seq, **kwargs) and
        must return an object accepted by pd.concat.
    :desc_name: str
        Name of descriptor for progress tracking. A tqdm progress bar is only
        shown when this is a non-empty string.
    :kwargs: dict
        Additional keyword arguments to pass to descriptor function

    Returns
    =======
    :pd.DataFrame
        Dataframe with calculated descriptor values for all sequences, one
        block of rows per sequence, re-indexed from 0. An empty dataframe is
        returned when there are no sequences.
    """
    iterator = tqdm(self.protein_seqs, desc=f"Computing {desc_name}") if desc_name else self.protein_seqs

    # accumulate results in a list to avoid O(n²) repeated concat
    desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]

    # pd.concat raises "No objects to concatenate" on an empty list, so return
    # the same empty dataframe that marks an uncalculated descriptor elsewhere
    if not desc_list:
        return pd.DataFrame()

    # ignore_index=True renumbers the rows 0..n-1 in a single step
    return pd.concat(desc_list, ignore_index=True)
2351
+
2352
+ ###################### Getters & Setters ######################
2353
+
2354
@property
def all_desc(self):
    """ Return the object stored in the private `_all_desc` attribute. """
    return self._all_desc

@all_desc.setter
def all_desc(self, val):
    """ Store `val` in the private `_all_desc` attribute; the value is assigned as-is, with no validation. """
    self._all_desc = val
2361
+
2362
@property
def amino_acid_composition(self):
    """ Return the value stored for the `amino_acid_composition` descriptor (private `_amino_acid_composition` attribute). """
    return self._amino_acid_composition

@amino_acid_composition.setter
def amino_acid_composition(self, val):
    """ Store `val` as the `amino_acid_composition` descriptor; the value is assigned as-is, with no validation. """
    self._amino_acid_composition = val
2369
+
2370
@property
def dipeptide_composition(self):
    """ Return the value stored for the `dipeptide_composition` descriptor (private `_dipeptide_composition` attribute). """
    return self._dipeptide_composition

@dipeptide_composition.setter
def dipeptide_composition(self, val):
    """ Store `val` as the `dipeptide_composition` descriptor; the value is assigned as-is, with no validation. """
    self._dipeptide_composition = val
2377
+
2378
@property
def tripeptide_composition(self):
    """ Return the value stored for the `tripeptide_composition` descriptor (private `_tripeptide_composition` attribute). """
    return self._tripeptide_composition

@tripeptide_composition.setter
def tripeptide_composition(self, val):
    """ Store `val` as the `tripeptide_composition` descriptor; the value is assigned as-is, with no validation. """
    self._tripeptide_composition = val
2385
+
2386
@property
def gravy(self):
    """ Return the value stored for the `gravy` descriptor (private `_gravy` attribute). """
    return self._gravy

@gravy.setter
def gravy(self, val):
    """ Store `val` as the `gravy` descriptor; the value is assigned as-is, with no validation. """
    self._gravy = val
2393
+
2394
@property
def aromaticity(self):
    """ Return the value stored for the `aromaticity` descriptor (private `_aromaticity` attribute). """
    return self._aromaticity

@aromaticity.setter
def aromaticity(self, val):
    """ Store `val` as the `aromaticity` descriptor; the value is assigned as-is, with no validation. """
    self._aromaticity = val
2401
+
2402
@property
def instability_index(self):
    """ Return the value stored for the `instability_index` descriptor (private `_instability_index` attribute). """
    return self._instability_index

@instability_index.setter
def instability_index(self, val):
    """ Store `val` as the `instability_index` descriptor; the value is assigned as-is, with no validation. """
    self._instability_index = val
2409
+
2410
@property
def isoelectric_point(self):
    """ Return the value stored for the `isoelectric_point` descriptor (private `_isoelectric_point` attribute). """
    return self._isoelectric_point

@isoelectric_point.setter
def isoelectric_point(self, val):
    """ Store `val` as the `isoelectric_point` descriptor; the value is assigned as-is, with no validation. """
    self._isoelectric_point = val
2417
+
2418
@property
def molecular_weight(self):
    """ Return the value stored for the `molecular_weight` descriptor (private `_molecular_weight` attribute). """
    return self._molecular_weight

@molecular_weight.setter
def molecular_weight(self, val):
    """ Store `val` as the `molecular_weight` descriptor; the value is assigned as-is, with no validation. """
    self._molecular_weight = val
2425
+
2426
@property
def charge_distribution(self):
    """ Return the value stored for the `charge_distribution` descriptor (private `_charge_distribution` attribute). """
    return self._charge_distribution

@charge_distribution.setter
def charge_distribution(self, val):
    """ Store `val` as the `charge_distribution` descriptor; the value is assigned as-is, with no validation. """
    self._charge_distribution = val
2433
+
2434
@property
def hydrophobic_polar_charged_composition(self):
    """ Return the value stored for the `hydrophobic_polar_charged_composition` descriptor (private `_hydrophobic_polar_charged_composition` attribute). """
    return self._hydrophobic_polar_charged_composition

@hydrophobic_polar_charged_composition.setter
def hydrophobic_polar_charged_composition(self, val):
    """ Store `val` as the `hydrophobic_polar_charged_composition` descriptor; the value is assigned as-is, with no validation. """
    self._hydrophobic_polar_charged_composition = val
2441
+
2442
@property
def secondary_structure_propensity(self):
    """ Return the value stored for the `secondary_structure_propensity` descriptor (private `_secondary_structure_propensity` attribute). """
    return self._secondary_structure_propensity

@secondary_structure_propensity.setter
def secondary_structure_propensity(self, val):
    """ Store `val` as the `secondary_structure_propensity` descriptor; the value is assigned as-is, with no validation. """
    self._secondary_structure_propensity = val
2449
+
2450
@property
def kmer_composition(self):
    """ Return the value stored for the `kmer_composition` descriptor (private `_kmer_composition` attribute). """
    return self._kmer_composition

@kmer_composition.setter
def kmer_composition(self, val):
    """ Store `val` as the `kmer_composition` descriptor; the value is assigned as-is, with no validation. """
    self._kmer_composition = val
2457
+
2458
@property
def reduced_alphabet_composition(self):
    """ Return the value stored for the `reduced_alphabet_composition` descriptor (private `_reduced_alphabet_composition` attribute). """
    return self._reduced_alphabet_composition

@reduced_alphabet_composition.setter
def reduced_alphabet_composition(self, val):
    """ Store `val` as the `reduced_alphabet_composition` descriptor; the value is assigned as-is, with no validation. """
    self._reduced_alphabet_composition = val
2465
+
2466
@property
def motif_composition(self):
    """ Return the value stored for the `motif_composition` descriptor (private `_motif_composition` attribute). """
    return self._motif_composition

@motif_composition.setter
def motif_composition(self, val):
    """ Store `val` as the `motif_composition` descriptor; the value is assigned as-is, with no validation. """
    self._motif_composition = val
2473
+
2474
@property
def amino_acid_pair_composition(self):
    """ Return the value stored for the `amino_acid_pair_composition` descriptor (private `_amino_acid_pair_composition` attribute). """
    return self._amino_acid_pair_composition

@amino_acid_pair_composition.setter
def amino_acid_pair_composition(self, val):
    """ Store `val` as the `amino_acid_pair_composition` descriptor; the value is assigned as-is, with no validation. """
    self._amino_acid_pair_composition = val
2481
+
2482
@property
def aliphatic_index(self):
    """ Return the value stored for the `aliphatic_index` descriptor (private `_aliphatic_index` attribute). """
    return self._aliphatic_index

@aliphatic_index.setter
def aliphatic_index(self, val):
    """ Store `val` as the `aliphatic_index` descriptor; the value is assigned as-is, with no validation. """
    self._aliphatic_index = val
2489
+
2490
@property
def extinction_coefficient(self):
    """ Return the value stored for the `extinction_coefficient` descriptor (private `_extinction_coefficient` attribute). """
    return self._extinction_coefficient

@extinction_coefficient.setter
def extinction_coefficient(self, val):
    """ Store `val` as the `extinction_coefficient` descriptor; the value is assigned as-is, with no validation. """
    self._extinction_coefficient = val
2497
+
2498
@property
def boman_index(self):
    """ Return the value stored for the `boman_index` descriptor (private `_boman_index` attribute). """
    return self._boman_index

@boman_index.setter
def boman_index(self, val):
    """ Store `val` as the `boman_index` descriptor; the value is assigned as-is, with no validation. """
    self._boman_index = val
2505
+
2506
@property
def aggregation_propensity(self):
    """ Return the value stored for the `aggregation_propensity` descriptor (private `_aggregation_propensity` attribute). """
    return self._aggregation_propensity

@aggregation_propensity.setter
def aggregation_propensity(self, val):
    """ Store `val` as the `aggregation_propensity` descriptor; the value is assigned as-is, with no validation. """
    self._aggregation_propensity = val
2513
+
2514
@property
def hydrophobic_moment(self):
    """ Return the value stored for the `hydrophobic_moment` descriptor (private `_hydrophobic_moment` attribute). """
    return self._hydrophobic_moment

@hydrophobic_moment.setter
def hydrophobic_moment(self, val):
    """ Store `val` as the `hydrophobic_moment` descriptor; the value is assigned as-is, with no validation. """
    self._hydrophobic_moment = val
2521
+
2522
@property
def shannon_entropy(self):
    """ Return the value stored for the `shannon_entropy` descriptor (private `_shannon_entropy` attribute). """
    return self._shannon_entropy

@shannon_entropy.setter
def shannon_entropy(self, val):
    """ Store `val` as the `shannon_entropy` descriptor; the value is assigned as-is, with no validation. """
    self._shannon_entropy = val
2529
+
2530
@property
def moreaubroto_autocorrelation(self):
    """ Return the value stored for the `moreaubroto_autocorrelation` descriptor (private `_moreaubroto_autocorrelation` attribute). """
    return self._moreaubroto_autocorrelation

@moreaubroto_autocorrelation.setter
def moreaubroto_autocorrelation(self, val):
    """ Store `val` as the `moreaubroto_autocorrelation` descriptor; the value is assigned as-is, with no validation. """
    self._moreaubroto_autocorrelation = val
2537
+
2538
@property
def moran_autocorrelation(self):
    """ Return the value stored for the `moran_autocorrelation` descriptor (private `_moran_autocorrelation` attribute). """
    return self._moran_autocorrelation

@moran_autocorrelation.setter
def moran_autocorrelation(self, val):
    """ Store `val` as the `moran_autocorrelation` descriptor; the value is assigned as-is, with no validation. """
    self._moran_autocorrelation = val
2545
+
2546
@property
def geary_autocorrelation(self):
    """ Return the value stored for the `geary_autocorrelation` descriptor (private `_geary_autocorrelation` attribute). """
    return self._geary_autocorrelation

@geary_autocorrelation.setter
def geary_autocorrelation(self, val):
    """ Store `val` as the `geary_autocorrelation` descriptor; the value is assigned as-is, with no validation. """
    self._geary_autocorrelation = val
2553
+
2554
@property
def ctd(self):
    """ Return the value stored for the `ctd` descriptor (private `_ctd` attribute). """
    return self._ctd

@ctd.setter
def ctd(self, val):
    """ Store `val` as the `ctd` descriptor; the value is assigned as-is, with no validation. """
    self._ctd = val
2561
+
2562
@property
def ctd_composition(self):
    """ Return the value stored for the `ctd_composition` descriptor (private `_ctd_composition` attribute). """
    return self._ctd_composition

@ctd_composition.setter
def ctd_composition(self, val):
    """ Store `val` as the `ctd_composition` descriptor; the value is assigned as-is, with no validation. """
    self._ctd_composition = val
2569
+
2570
@property
def ctd_transition(self):
    """ Return the value stored for the `ctd_transition` descriptor (private `_ctd_transition` attribute). """
    return self._ctd_transition

@ctd_transition.setter
def ctd_transition(self, val):
    """ Store `val` as the `ctd_transition` descriptor; the value is assigned as-is, with no validation. """
    self._ctd_transition = val
2577
+
2578
@property
def ctd_distribution(self):
    """ Return the value stored for the `ctd_distribution` descriptor (private `_ctd_distribution` attribute). """
    return self._ctd_distribution

@ctd_distribution.setter
def ctd_distribution(self, val):
    """ Store `val` as the `ctd_distribution` descriptor; the value is assigned as-is, with no validation. """
    self._ctd_distribution = val
2585
+
2586
@property
def conjoint_triad(self):
    """ Return the value stored for the `conjoint_triad` descriptor (private `_conjoint_triad` attribute). """
    return self._conjoint_triad

@conjoint_triad.setter
def conjoint_triad(self, val):
    """ Store `val` as the `conjoint_triad` descriptor; the value is assigned as-is, with no validation. """
    self._conjoint_triad = val
2593
+
2594
@property
def sequence_order_coupling_number(self):
    """ Return the value stored for the `sequence_order_coupling_number` descriptor (private `_sequence_order_coupling_number` attribute). """
    return self._sequence_order_coupling_number

@sequence_order_coupling_number.setter
def sequence_order_coupling_number(self, val):
    """ Store `val` as the `sequence_order_coupling_number` descriptor; the value is assigned as-is, with no validation. """
    self._sequence_order_coupling_number = val
2601
+
2602
@property
def quasi_sequence_order(self):
    """ Return the value stored for the `quasi_sequence_order` descriptor (private `_quasi_sequence_order` attribute). """
    return self._quasi_sequence_order

@quasi_sequence_order.setter
def quasi_sequence_order(self, val):
    """ Store `val` as the `quasi_sequence_order` descriptor; the value is assigned as-is, with no validation. """
    self._quasi_sequence_order = val
2609
+
2610
@property
def pseudo_amino_acid_composition(self):
    """ Return the value stored for the `pseudo_amino_acid_composition` descriptor (private `_pseudo_amino_acid_composition` attribute). """
    return self._pseudo_amino_acid_composition

@pseudo_amino_acid_composition.setter
def pseudo_amino_acid_composition(self, val):
    """ Store `val` as the `pseudo_amino_acid_composition` descriptor; the value is assigned as-is, with no validation. """
    self._pseudo_amino_acid_composition = val
2617
+
2618
@property
def amphiphilic_pseudo_amino_acid_composition(self):
    """ Return the value stored for the `amphiphilic_pseudo_amino_acid_composition` descriptor (private `_amphiphilic_pseudo_amino_acid_composition` attribute). """
    return self._amphiphilic_pseudo_amino_acid_composition

@amphiphilic_pseudo_amino_acid_composition.setter
def amphiphilic_pseudo_amino_acid_composition(self, val):
    """ Store `val` as the `amphiphilic_pseudo_amino_acid_composition` descriptor; the value is assigned as-is, with no validation. """
    self._amphiphilic_pseudo_amino_acid_composition = val
2625
+
2626
@property
def all_descriptors(self):
    """ Return the object stored in the private `_all_descriptors` attribute. """
    return self._all_descriptors

@all_descriptors.setter
def all_descriptors(self, val):
    """ Store `val` in the private `_all_descriptors` attribute; no validation is performed. """
    self._all_descriptors = val

@all_descriptors.deleter
def all_descriptors(self):
    """
    Delete `_all_descriptors` together with every individual descriptor
    attribute from the instance.

    Attributes are removed one at a time in the order listed below, so an
    AttributeError raised for a missing attribute leaves the later ones in
    place. `_all_desc` is not touched.
    """
    doomed = (
        '_all_descriptors',
        '_amino_acid_composition',
        '_dipeptide_composition',
        '_tripeptide_composition',
        '_gravy',
        '_aromaticity',
        '_instability_index',
        '_isoelectric_point',
        '_molecular_weight',
        '_charge_distribution',
        '_hydrophobic_polar_charged_composition',
        '_secondary_structure_propensity',
        '_kmer_composition',
        '_reduced_alphabet_composition',
        '_motif_composition',
        '_amino_acid_pair_composition',
        '_aliphatic_index',
        '_extinction_coefficient',
        '_boman_index',
        '_aggregation_propensity',
        '_hydrophobic_moment',
        '_shannon_entropy',
        '_moreaubroto_autocorrelation',
        '_moran_autocorrelation',
        '_geary_autocorrelation',
        '_ctd',
        '_ctd_transition',
        '_ctd_composition',
        '_ctd_distribution',
        '_conjoint_triad',
        '_sequence_order_coupling_number',
        '_quasi_sequence_order',
        '_pseudo_amino_acid_composition',
        '_amphiphilic_pseudo_amino_acid_composition',
    )
    for attr in doomed:
        delattr(self, attr)
2671
+
2672
def __str__(self) -> str:
    """
    Build a multi-line, human-readable summary of the instance: the value of
    `self.shape` on the first line, followed by one "<label>: <shape>" line
    per descriptor attribute.

    Every listed descriptor attribute must expose a `.shape` attribute. The
    `ctd_composition`, `ctd_transition` and `ctd_distribution` attributes are
    not included in the summary; only the combined `ctd` attribute is.
    """
    # NOTE(review): `shape` is not defined in the part of the class visible here
    # (only `__shape__` is) - confirm a `shape` attribute/property exists,
    # otherwise this line raises AttributeError.
    return f'''{self.shape}
Amino Acid Composition: {self.amino_acid_composition.shape}
Dipeptide Composition: {self.dipeptide_composition.shape}
Tripeptide Composition: {self.tripeptide_composition.shape}
GRAVY: {self.gravy.shape}
Aromaticity: {self.aromaticity.shape}
Instability Index: {self.instability_index.shape}
Isoelectric Point: {self.isoelectric_point.shape}
Molecular Weight: {self.molecular_weight.shape}
Charge Distribution: {self.charge_distribution.shape}
Hydrophobic/Polar/Charged Composition: {self.hydrophobic_polar_charged_composition.shape}
Secondary Structure Propensity: {self.secondary_structure_propensity.shape}
k-mer Composition: {self.kmer_composition.shape}
Reduced Alphabet Composition: {self.reduced_alphabet_composition.shape}
Motif Composition: {self.motif_composition.shape}
Amino Acid Pair Composition: {self.amino_acid_pair_composition.shape}
Aliphatic Index: {self.aliphatic_index.shape}
Extinction Coefficient: {self.extinction_coefficient.shape}
Boman Index: {self.boman_index.shape}
Aggregation Propensity: {self.aggregation_propensity.shape}
Hydrophobic Moment: {self.hydrophobic_moment.shape}
Shannon Entropy: {self.shannon_entropy.shape}
MoreauBroto Autocorrelation: {self.moreaubroto_autocorrelation.shape}
Moran Autocorrelation: {self.moran_autocorrelation.shape}
Geary Autocorrelation: {self.geary_autocorrelation.shape}
CTD: {self.ctd.shape}
Conjoint Triad: {self.conjoint_triad.shape}
Sequence Order Coupling Number: {self.sequence_order_coupling_number.shape}
Quasi Sequence Order: {self.quasi_sequence_order.shape}
Pseudo Amino Acid Composition: {self.pseudo_amino_acid_composition.shape}
Amphiphilic Pseudo Amino Acid Composition: {self.amphiphilic_pseudo_amino_acid_composition.shape}'''
2704
+
2705
def get_all_descriptors(self,
                        descriptors: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Calculate several descriptors in one call and join them column-wise.

    Parameters
    ==========
    :descriptors: list of str, optional
        Names of the descriptors to calculate. They are passed through
        self.validate_descriptors and its return value is what gets iterated.
        If None, every name in self.valid_descriptors is calculated.

    Returns
    =======
    :pd.DataFrame
        Result of concatenating, along axis=1, the return value of each
        `get_<descriptor>()` method. The same object is also stored in
        self.all_descriptors.
    """
    selected = self.valid_descriptors if descriptors is None else self.validate_descriptors(descriptors)

    #each descriptor is produced by the instance method named get_<descriptor>
    computed = {
        name: getattr(self, f'get_{name}')()
        for name in tqdm(selected, desc="Computing all descriptors")
    }

    self.all_descriptors = pd.concat(computed.values(), axis=1)
    return self.all_descriptors
2732
+
2733
def get_descriptor_info(self, descriptor_name: str) -> Dict[str, Any]:
    """
    Collect the metadata held for one descriptor.

    Parameters
    ==========
    :descriptor_name: str
        Name of the descriptor. It is checked with self.validate_descriptors
        and then used unchanged as the lookup key.

    Returns
    =======
    :Dict[str, Any]
        Dictionary with the keys 'name', 'group' ('Unknown' when the name is
        not in self.descriptor_groups), 'feature_count' (0 when the name is not
        in self.descriptor_feature_count) and 'parameters' (empty dict unless
        self.desc_parameters has an attribute of that name which is a dict or
        an object with a __dict__).
    """
    self.validate_descriptors(descriptor_name)

    info = dict(
        name=descriptor_name,
        group=self.descriptor_groups.get(descriptor_name, 'Unknown'),
        feature_count=self.descriptor_feature_count.get(descriptor_name, 0),
        parameters={},
    )

    #nothing more to add when no parameters are configured for this descriptor
    if not hasattr(self.desc_parameters, descriptor_name):
        return info

    params = getattr(self.desc_parameters, descriptor_name)
    if isinstance(params, dict):
        info['parameters'] = dict(params)   #shallow copy
    elif hasattr(params, '__dict__'):
        info['parameters'] = vars(params)   #the object's own attribute dict, not a copy

    return info
2765
+
2766
def reset_descriptors(self) -> None:
    """
    Replace every individual descriptor attribute, and `all_descriptors`,
    with a fresh empty DataFrame. No other attribute is modified
    (`all_desc` included).

    Parameters
    ==========
    None

    Returns
    =======
    None
    """
    attribute_names = (
        'amino_acid_composition',
        'dipeptide_composition',
        'tripeptide_composition',
        'gravy',
        'aromaticity',
        'instability_index',
        'isoelectric_point',
        'molecular_weight',
        'charge_distribution',
        'hydrophobic_polar_charged_composition',
        'secondary_structure_propensity',
        'kmer_composition',
        'reduced_alphabet_composition',
        'motif_composition',
        'amino_acid_pair_composition',
        'aliphatic_index',
        'extinction_coefficient',
        'boman_index',
        'aggregation_propensity',
        'hydrophobic_moment',
        'shannon_entropy',
        'moreaubroto_autocorrelation',
        'moran_autocorrelation',
        'geary_autocorrelation',
        'ctd',
        'ctd_composition',
        'ctd_transition',
        'ctd_distribution',
        'conjoint_triad',
        'sequence_order_coupling_number',
        'quasi_sequence_order',
        'pseudo_amino_acid_composition',
        'amphiphilic_pseudo_amino_acid_composition',
        'all_descriptors',
    )
    #a separate DataFrame per attribute so that no two descriptors share an object
    for attribute in attribute_names:
        setattr(self, attribute, pd.DataFrame())
2813
+
2814
def clear_cache(self) -> None:
    """
    Clear the cache behind `descriptor_feature_count`, if it has one.

    The cached callable is looked up on the class rather than through the
    instance: reading a property through the instance only yields its value
    (used elsewhere in this class as a mapping via `.get`), which never has a
    `cache_clear` attribute. When `descriptor_feature_count` is a property,
    the cache of its getter (`fget`) is cleared; when it is a directly cached
    callable, its own `cache_clear` is called. If nothing cache-like is found
    the method does nothing.

    Parameters
    ==========
    None

    Returns
    =======
    None
    """
    class_attr = getattr(type(self), 'descriptor_feature_count', None)

    if isinstance(class_attr, property):
        #never read the property through the instance here, that would compute its value
        candidates = [class_attr.fget]
    else:
        candidates = [class_attr, getattr(self, 'descriptor_feature_count', None)]

    for candidate in candidates:
        if hasattr(candidate, 'cache_clear'):
            candidate.cache_clear()
            return
2829
+
2830
def get_descriptor_columns(self, descriptor: str) -> List[str]:
    """
    Return the column names of an already calculated descriptor.

    Parameters
    ==========
    :descriptor: str
        Name of the descriptor (e.g., 'amino_acid_composition'). It is used
        unchanged as the attribute name to read from the instance.

    Returns
    =======
    :List[str]
        List of column names in the descriptor DataFrame

    Raises
    ======
    :ValueError
        If the instance has no such attribute, the attribute is None, or the
        descriptor DataFrame is empty (i.e. not calculated yet).

    Any exception raised by self.validate_descriptors for an invalid name is
    propagated unchanged.
    """
    self.validate_descriptors(descriptor)

    frame = getattr(self, descriptor, None)

    #a populated dataframe is the only state that has columns to report
    if frame is not None and not frame.empty:
        return frame.columns.tolist()

    raise ValueError(f"Descriptor '{descriptor}' has not been calculated yet. "
                     f"Call get_{descriptor}() first.")
2862
+
2863
def __repr__(self) -> str:
    """ Return '<Descriptor: ...>' wrapping the multi-line summary produced by formatting the instance (see __str__). """
    return f'<Descriptor: {self}>'
2865
+
2866
def __len__(self) -> int:
    """ Return len() of the all_descriptors attribute. """
    return len(self.all_descriptors)
2868
+
2869
def __shape__(self) -> Tuple[int, int]:
    """ Return the `.shape` of the all_descriptors attribute. """
    # NOTE: `__shape__` is not a special method that Python invokes implicitly;
    # it only runs when called explicitly as `obj.__shape__()`.
    return self.all_descriptors.shape
2871
+
2872
def __sizeof__(self) -> int:
    """ Get size of all_descriptors object that stores all descriptor values. """
    # delegates entirely to all_descriptors; other attributes of the instance
    # are not included in the reported size
    return self.all_descriptors.__sizeof__()
2875
+
2876
class DescriptorError(Exception):
    """Common base class for the descriptor-related exceptions defined in this module."""
2879
+
2880
+
2881
class InvalidSequenceError(DescriptorError):
    """Error for a protein sequence that contains invalid amino acids."""
2884
+
2885
+
2886
class DescriptorConfigError(DescriptorError):
    """Error for a config file that is invalid or malformed."""
2889
+
2890
+
2891
class InvalidDescriptorError(DescriptorError):
    """Error for a request that names a descriptor which does not exist."""