PySAR 2.5.0__tar.gz → 2.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysar-2.5.0 → pysar-2.5.1}/PKG-INFO +6 -5
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/PKG-INFO +6 -5
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/requires.txt +0 -1
- {pysar-2.5.0 → pysar-2.5.1}/README.md +5 -3
- {pysar-2.5.0 → pysar-2.5.1}/docs/conf.py +1 -2
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/__init__.py +1 -1
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/descriptors.py +0 -1
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/evaluate.py +2 -2
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/globals_.py +0 -3
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/model.py +6 -12
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/pySAR.py +10 -11
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/utils.py +0 -37
- {pysar-2.5.0 → pysar-2.5.1}/pyproject.toml +1 -2
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_model.py +2 -2
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_pySAR.py +2 -2
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_utils.py +0 -38
- {pysar-2.5.0 → pysar-2.5.1}/LICENSE +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/SOURCES.txt +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/dependency_links.txt +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/not-zip-safe +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/PySAR.egg-info/top_level.txt +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/encoding.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/plots.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/py.typed +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/pySAR/pyDSP.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/setup.cfg +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_descriptors.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_encoding.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_evaluate.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_plots.py +0 -0
- {pysar-2.5.0 → pysar-2.5.1}/tests/test_pyDSP.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.5.1
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -33,7 +33,6 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: numpy>=1.21
|
|
34
34
|
Requires-Dist: pandas>=1.3
|
|
35
35
|
Requires-Dist: scipy>=1.7
|
|
36
|
-
Requires-Dist: delayed>=0.11
|
|
37
36
|
Requires-Dist: scikit-learn>=1.0
|
|
38
37
|
Requires-Dist: matplotlib>=3.4
|
|
39
38
|
Requires-Dist: seaborn>=0.11
|
|
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
|
|
|
50
49
|
Dynamic: license-file
|
|
51
50
|
|
|
52
51
|
<p align="center">
|
|
53
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
52
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
54
53
|
</p>
|
|
55
54
|
|
|
56
55
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -126,7 +125,6 @@ Requirements
|
|
|
126
125
|
* [pandas][pandas] >= 1.3
|
|
127
126
|
* [scikit-learn][sklearn] >= 1.0
|
|
128
127
|
* [scipy][scipy] >= 1.7
|
|
129
|
-
* [delayed][delayed] >= 0.11
|
|
130
128
|
* [tqdm][tqdm] >= 4.60
|
|
131
129
|
* [matplotlib][matplotlib] >= 3.4
|
|
132
130
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -711,6 +709,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
711
709
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
712
710
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
713
711
|
|
|
712
|
+
|
|
713
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
714
|
+
|
|
715
|
+
|
|
714
716
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
715
717
|
|
|
716
718
|
[Back to top](#TOP)
|
|
@@ -727,7 +729,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
727
729
|
[tqdm]: https://tqdm.github.io/
|
|
728
730
|
[seaborn]: https://seaborn.pydata.org/
|
|
729
731
|
[matplotlib]: https://matplotlib.org/
|
|
730
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
731
732
|
[PyPi]: https://pypi.org/project/pysar/
|
|
732
733
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
733
734
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.5.1
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -33,7 +33,6 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: numpy>=1.21
|
|
34
34
|
Requires-Dist: pandas>=1.3
|
|
35
35
|
Requires-Dist: scipy>=1.7
|
|
36
|
-
Requires-Dist: delayed>=0.11
|
|
37
36
|
Requires-Dist: scikit-learn>=1.0
|
|
38
37
|
Requires-Dist: matplotlib>=3.4
|
|
39
38
|
Requires-Dist: seaborn>=0.11
|
|
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
|
|
|
50
49
|
Dynamic: license-file
|
|
51
50
|
|
|
52
51
|
<p align="center">
|
|
53
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
52
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
54
53
|
</p>
|
|
55
54
|
|
|
56
55
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -126,7 +125,6 @@ Requirements
|
|
|
126
125
|
* [pandas][pandas] >= 1.3
|
|
127
126
|
* [scikit-learn][sklearn] >= 1.0
|
|
128
127
|
* [scipy][scipy] >= 1.7
|
|
129
|
-
* [delayed][delayed] >= 0.11
|
|
130
128
|
* [tqdm][tqdm] >= 4.60
|
|
131
129
|
* [matplotlib][matplotlib] >= 3.4
|
|
132
130
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -711,6 +709,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
711
709
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
712
710
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
713
711
|
|
|
712
|
+
|
|
713
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
714
|
+
|
|
715
|
+
|
|
714
716
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
715
717
|
|
|
716
718
|
[Back to top](#TOP)
|
|
@@ -727,7 +729,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
727
729
|
[tqdm]: https://tqdm.github.io/
|
|
728
730
|
[seaborn]: https://seaborn.pydata.org/
|
|
729
731
|
[matplotlib]: https://matplotlib.org/
|
|
730
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
731
732
|
[PyPi]: https://pypi.org/project/pysar/
|
|
732
733
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
733
734
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
2
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -75,7 +75,6 @@ Requirements
|
|
|
75
75
|
* [pandas][pandas] >= 1.3
|
|
76
76
|
* [scikit-learn][sklearn] >= 1.0
|
|
77
77
|
* [scipy][scipy] >= 1.7
|
|
78
|
-
* [delayed][delayed] >= 0.11
|
|
79
78
|
* [tqdm][tqdm] >= 4.60
|
|
80
79
|
* [matplotlib][matplotlib] >= 3.4
|
|
81
80
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -660,6 +659,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
660
659
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
661
660
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
662
661
|
|
|
662
|
+
|
|
663
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
664
|
+
|
|
665
|
+
|
|
663
666
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
664
667
|
|
|
665
668
|
[Back to top](#TOP)
|
|
@@ -676,7 +679,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
676
679
|
[tqdm]: https://tqdm.github.io/
|
|
677
680
|
[seaborn]: https://seaborn.pydata.org/
|
|
678
681
|
[matplotlib]: https://matplotlib.org/
|
|
679
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
680
682
|
[PyPi]: https://pypi.org/project/pysar/
|
|
681
683
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
682
684
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -15,7 +15,7 @@ sys.path.insert(0, os.path.abspath('..'))
|
|
|
15
15
|
project = 'pySAR'
|
|
16
16
|
copyright = '2026, AJ McKenna'
|
|
17
17
|
author = 'AJ McKenna'
|
|
18
|
-
release = '2.5.0'
|
|
18
|
+
release = '2.5.1'
|
|
19
19
|
|
|
20
20
|
# -- General configuration ---------------------------------------------------
|
|
21
21
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
@@ -36,7 +36,6 @@ autodoc_mock_imports = [
|
|
|
36
36
|
'matplotlib',
|
|
37
37
|
'seaborn',
|
|
38
38
|
'tqdm',
|
|
39
|
-
'delayed',
|
|
40
39
|
'aaindex',
|
|
41
40
|
'protpy',
|
|
42
41
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
""" pySAR software metadata. """
|
|
2
2
|
__name__ = 'pySAR'
|
|
3
|
-
__version__ = "2.5.0"
|
|
3
|
+
__version__ = "2.5.1"
|
|
4
4
|
__description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.'
|
|
5
5
|
__author__ = 'AJ McKenna: https://github.com/amckenna41'
|
|
6
6
|
__authorEmail__ = 'amckenna41@qub.ac.uk'
|
|
@@ -184,8 +184,8 @@ class Evaluate():
|
|
|
184
184
|
:rpd: float
|
|
185
185
|
the RPD score for the model.
|
|
186
186
|
"""
|
|
187
|
-
|
|
188
|
-
return self.Y_true.std() / np.sqrt(mse) if mse > 0 else np.inf
|
|
187
|
+
# reuse already-computed self.mse to avoid a redundant sklearn call
|
|
188
|
+
return self.Y_true.std() / np.sqrt(self.mse) if self.mse > 0 else np.inf
|
|
189
189
|
|
|
190
190
|
def explained_var_(self, multioutput='uniform_average'):
|
|
191
191
|
"""
|
|
@@ -8,14 +8,11 @@ from datetime import datetime
|
|
|
8
8
|
NOW = datetime.now()
|
|
9
9
|
|
|
10
10
|
#output dir is the default directory used to store all outputs generated
|
|
11
|
-
global OUTPUT_DIR
|
|
12
11
|
OUTPUT_DIR = 'outputs'
|
|
13
12
|
|
|
14
13
|
#current datetime appended to output assets & directories to uniquely identify them
|
|
15
|
-
global CURRENT_DATETIME
|
|
16
14
|
CURRENT_DATETIME = NOW.strftime('%Y-%m-%d_%H-%M-%S')
|
|
17
15
|
|
|
18
16
|
#output folder is the default folder within the OUTPUT_DIR used to store all
|
|
19
17
|
#outputs generated from one run of the program.
|
|
20
|
-
global OUTPUT_FOLDER
|
|
21
18
|
OUTPUT_FOLDER = os.path.join(OUTPUT_DIR, f'model_output_{CURRENT_DATETIME}')
|
|
@@ -92,7 +92,6 @@ class Model():
|
|
|
92
92
|
'sgd': SGDRegressor,
|
|
93
93
|
'stochasticgradientdescent': SGDRegressor,
|
|
94
94
|
'gbr': GradientBoostingRegressor,
|
|
95
|
-
'gradientboost': GradientBoostingRegressor,
|
|
96
95
|
'gradientboostingregressor': GradientBoostingRegressor,
|
|
97
96
|
'svr': SVR,
|
|
98
97
|
'supportvectorregression': SVR,
|
|
@@ -123,15 +122,8 @@ class Model():
|
|
|
123
122
|
else:
|
|
124
123
|
self.parameters = parameters
|
|
125
124
|
|
|
126
|
-
#
|
|
127
|
-
self.valid_models =
|
|
128
|
-
'baggingregressor', 'decisiontreeregressor', 'gbr',
|
|
129
|
-
'gradientboostingregressor', 'linearregression', 'lasso', 'ridge',
|
|
130
|
-
'svr', 'supportvectorregression', 'sgd', 'stochasticgradientdescent',
|
|
131
|
-
'kneighborsregressor', 'knearestneighbors', 'knn', 'elasticnet',
|
|
132
|
-
'extratreesregressor', 'extratrees', 'histgradientboostingregressor',
|
|
133
|
-
'histgradientboosting', 'hgbr', 'gaussianprocessregressor',
|
|
134
|
-
'gaussianprocess', 'gpr']
|
|
125
|
+
#derive valid model names directly from MODEL_CONSTRUCTORS to avoid duplication and sync issues
|
|
126
|
+
self.valid_models = list(self.MODEL_CONSTRUCTORS.keys())
|
|
135
127
|
|
|
136
128
|
#raise error if algorithm parameter isnt string type
|
|
137
129
|
if not(isinstance(self.algorithm, str)):
|
|
@@ -311,8 +303,8 @@ class Model():
|
|
|
311
303
|
try:
|
|
312
304
|
with open(save_path, 'wb') as file:
|
|
313
305
|
pickle.dump(self.model, file)
|
|
314
|
-
except
|
|
315
|
-
|
|
306
|
+
except pickle.PickleError as e:
|
|
307
|
+
raise RuntimeError(f"Error pickling model with path: {save_path}.") from e
|
|
316
308
|
|
|
317
309
|
def hyperparameter_tuning(self, param_grid=None, metric='r2', cv=5, n_jobs=None, verbose=2):
|
|
318
310
|
"""
|
|
@@ -365,6 +357,8 @@ class Model():
|
|
|
365
357
|
|
|
366
358
|
#cv must be of type int and be between 5 and 10, if not then default of 5 is used
|
|
367
359
|
if not isinstance(cv, int) or cv < 5 or cv > 10:
|
|
360
|
+
import warnings
|
|
361
|
+
warnings.warn(f'Invalid cv value {cv!r}; must be an int between 5 and 10. Defaulting to 5.', UserWarning, stacklevel=2)
|
|
368
362
|
cv = 5
|
|
369
363
|
|
|
370
364
|
#copy to avoid mutating caller's dict; filter out parameter names invalid for this model
|
|
@@ -221,7 +221,7 @@ class PySAR():
|
|
|
221
221
|
|
|
222
222
|
#verify no invalid amino acids found in sequences, if so then raise error
|
|
223
223
|
invalid_seqs = valid_sequence(self.sequences)
|
|
224
|
-
if
|
|
224
|
+
if invalid_seqs is not None:
|
|
225
225
|
raise ValueError(f'Invalid amino acids found in protein sequence dataset: {invalid_seqs}.')
|
|
226
226
|
|
|
227
227
|
#get closest match for activity column name in dataset
|
|
@@ -270,7 +270,7 @@ class PySAR():
|
|
|
270
270
|
array of the encoded protein sequences in dataset via user input index/indices.
|
|
271
271
|
"""
|
|
272
272
|
#validate AAI indices are present in the input parameter, if not raise error
|
|
273
|
-
if
|
|
273
|
+
if aai_indices is None or aai_indices == "":
|
|
274
274
|
raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')
|
|
275
275
|
|
|
276
276
|
#check input indices is of correct type (str/list), if not raise type error
|
|
@@ -352,7 +352,7 @@ class PySAR():
|
|
|
352
352
|
pandas Dataframe storing metrics and results of encoding.
|
|
353
353
|
"""
|
|
354
354
|
#validate AAI indices are present in the input parameter
|
|
355
|
-
if
|
|
355
|
+
if aai_indices is None or aai_indices == "" or aai_indices == []:
|
|
356
356
|
raise ValueError(f'AAI indices input parameter cannot be None or empty: {aai_indices}.')
|
|
357
357
|
|
|
358
358
|
#check input indices is of correct type (str/list), if not raise type error
|
|
@@ -385,7 +385,7 @@ class PySAR():
|
|
|
385
385
|
#else use the AAI indices encoding's themselves as the feature/training data (X)
|
|
386
386
|
if (self.use_dsp):
|
|
387
387
|
#if input spectrum is none or empty, raise error.
|
|
388
|
-
if
|
|
388
|
+
if self.spectrum is None or self.spectrum == "":
|
|
389
389
|
raise ValueError(f'Spectrum cannot be None or empty: {self.spectrum}.')
|
|
390
390
|
pyDSP = PyDSP(self.config_file, protein_seqs=encoded_seqs)
|
|
391
391
|
X = pd.DataFrame(pyDSP.spectrum_encoding) #set training data to FFT spectrum encoding
|
|
@@ -471,7 +471,7 @@ class PySAR():
|
|
|
471
471
|
inputted descriptor(s).
|
|
472
472
|
"""
|
|
473
473
|
#raise error if no descriptors specified in input
|
|
474
|
-
if
|
|
474
|
+
if descriptors is None or descriptors == "" or descriptors == []:
|
|
475
475
|
raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')
|
|
476
476
|
|
|
477
477
|
#check input descriptor is of correct type str or list, if not raise type error
|
|
@@ -551,7 +551,7 @@ class PySAR():
|
|
|
551
551
|
pandas dataframe storing metrics and results of encoding.
|
|
552
552
|
"""
|
|
553
553
|
#raise error if no descriptor specified in input
|
|
554
|
-
if
|
|
554
|
+
if descriptors is None or descriptors == "" or descriptors == []:
|
|
555
555
|
raise ValueError(f'Descriptors input parameter cannot be None or empty: {descriptors}.')
|
|
556
556
|
|
|
557
557
|
#check input descriptor is of correct type (str or list), if not raise type error
|
|
@@ -633,7 +633,7 @@ class PySAR():
|
|
|
633
633
|
desc_df['Group'] = desc_df['Group'].astype(pd.StringDtype())
|
|
634
634
|
|
|
635
635
|
#ensure aai indices attribute doesn't show up in output results
|
|
636
|
-
if
|
|
636
|
+
if self.aai_indices is not None:
|
|
637
637
|
self.aai_indices = None
|
|
638
638
|
|
|
639
639
|
#print out results from encoding
|
|
@@ -684,8 +684,8 @@ class PySAR():
|
|
|
684
684
|
pandas dataframe storing metrics and results of encoding.
|
|
685
685
|
"""
|
|
686
686
|
#validate AAI indices and Descriptors are present in the input parameters, return error if either is None
|
|
687
|
-
if (descriptors
|
|
688
|
-
|
|
687
|
+
if (descriptors is None or descriptors in ("", [])) or (aai_indices is None or aai_indices in ("", [])):
|
|
688
|
+
raise ValueError('AAI Indices and Descriptor input parameters must not be empty or None.')
|
|
689
689
|
|
|
690
690
|
#check input descriptor & indices are of correct type (str/list), if not raise type error
|
|
691
691
|
if (not isinstance(aai_indices, str) and (not isinstance(aai_indices, list)) or \
|
|
@@ -810,8 +810,7 @@ class PySAR():
|
|
|
810
810
|
evaluation.rmse, evaluation.mse, evaluation.mae, evaluation.rpd, evaluation.explained_var]
|
|
811
811
|
|
|
812
812
|
#convert Index, Category, Descriptor and Group from default Object type -> String datatypes
|
|
813
|
-
|
|
814
|
-
aai_desc_df['Index'] = aai_desc_df['Index'].astype("string")
|
|
813
|
+
aai_desc_df['Index'] = aai_desc_df['Index'].astype(pd.StringDtype())
|
|
815
814
|
aai_desc_df['Category'] = aai_desc_df['Category'].astype(pd.StringDtype())
|
|
816
815
|
aai_desc_df['Descriptor'] = aai_desc_df['Descriptor'].astype(pd.StringDtype())
|
|
817
816
|
aai_desc_df['Group'] = aai_desc_df['Group'].astype(pd.StringDtype())
|
|
@@ -149,43 +149,6 @@ def remove_gaps(sequences):
|
|
|
149
149
|
cleaned = ''.join(str(c) for c in sequences if str(c) != '-')
|
|
150
150
|
return [cleaned]
|
|
151
151
|
|
|
152
|
-
def flatten(array):
|
|
153
|
-
"""
|
|
154
|
-
Lambda function for flattening list of lists or array of lists into one
|
|
155
|
-
1-dimensional array/list. Input must contain an array of arrays of the same
|
|
156
|
-
length. Input will be flattened into a 1-dimensional array of size (M * N, 1)
|
|
157
|
-
where M = len(array) and N = len(array[0]). The flattened output can then be
|
|
158
|
-
reshaped into the required shape and format.
|
|
159
|
-
|
|
160
|
-
Parameters
|
|
161
|
-
==========
|
|
162
|
-
:array: np.ndarray/list
|
|
163
|
-
array of arrays or list of lists to be flattened.
|
|
164
|
-
|
|
165
|
-
Returns
|
|
166
|
-
=======
|
|
167
|
-
:flatten(array/list): np.ndarray/list
|
|
168
|
-
flattened 1-dimensional list or array.
|
|
169
|
-
"""
|
|
170
|
-
#if input is a string then return input as cannot be flattened
|
|
171
|
-
if (isinstance(array, str)):
|
|
172
|
-
return array
|
|
173
|
-
|
|
174
|
-
#create flatten lambda function
|
|
175
|
-
_flatten = lambda array: [item for sublist in array for item in sublist]
|
|
176
|
-
|
|
177
|
-
#flatten array/list
|
|
178
|
-
try:
|
|
179
|
-
flattened_array = _flatten(array)
|
|
180
|
-
except (TypeError, ValueError):
|
|
181
|
-
raise TypeError(f'Error flattening array of type: {type(array)} and size {len(array)}.')
|
|
182
|
-
|
|
183
|
-
#if input is a numpy array then reshape to 1D numpy array else return list
|
|
184
|
-
if (isinstance(array,np.ndarray)):
|
|
185
|
-
return (np.array(flattened_array).reshape([-1, 1]))
|
|
186
|
-
else:
|
|
187
|
-
return flattened_array
|
|
188
|
-
|
|
189
152
|
def zero_padding(sequences):
|
|
190
153
|
"""
|
|
191
154
|
Pad sequences in input array with 0's such that every sequence is of the same length
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "PySAR"
|
|
7
|
-
version = "2.5.0"
|
|
7
|
+
version = "2.5.1"
|
|
8
8
|
description = "Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -54,7 +54,6 @@ dependencies = [
|
|
|
54
54
|
"numpy>=1.21",
|
|
55
55
|
"pandas>=1.3",
|
|
56
56
|
"scipy>=1.7",
|
|
57
|
-
"delayed>=0.11",
|
|
58
57
|
"scikit-learn>=1.0",
|
|
59
58
|
"matplotlib>=3.4",
|
|
60
59
|
"seaborn>=0.11",
|
|
@@ -96,7 +96,7 @@ class ModelTests(unittest.TestCase):
|
|
|
96
96
|
aliases = [
|
|
97
97
|
('plsreg', 'plsregression', 'PLSRegression'),
|
|
98
98
|
('randomfor', 'randomforestregressor', 'RandomForestRegressor'),
|
|
99
|
-
('
|
|
99
|
+
('adaboost', 'adaboostregressor', 'AdaBoostRegressor'),
|
|
100
100
|
('bagging', 'baggingregressor', 'BaggingRegressor'),
|
|
101
101
|
('decisiontree', 'decisiontreeregressor', 'DecisionTreeRegressor'),
|
|
102
102
|
('linear', 'linearregression', 'LinearRegression'),
|
|
@@ -299,7 +299,7 @@ class ModelTests(unittest.TestCase):
|
|
|
299
299
|
def test_hyperparameter_tuning(self):
|
|
300
300
|
""" Testing hyperparamter tuning functionality. """
|
|
301
301
|
#1.)
|
|
302
|
-
model = Model(self.dummy_X, self.dummy_Y, algorithm="
|
|
302
|
+
model = Model(self.dummy_X, self.dummy_Y, algorithm="adaboostregressor")
|
|
303
303
|
X_train, X_test, Y_train, Y_test = model.train_test_split(test_split=0.2)
|
|
304
304
|
model.fit()
|
|
305
305
|
param_grid = {'n_estimators': [50,100,150], 'learning_rate': [0.5,0.75,1], 'loss': ['linear','exponential']}
|
|
@@ -69,8 +69,8 @@ class PySARTests(unittest.TestCase):
|
|
|
69
69
|
# @unittest.skip("Skipping metadata tests.")
|
|
70
70
|
def test_pySAR_metadata(self):
|
|
71
71
|
""" Testing correct pySAR version and metadata. """
|
|
72
|
-
self.assertEqual(pysar_.__version__, "2.5.0",
|
|
73
|
-
f"pySAR version is not correct, expected 2.5.0, got {pysar_.__version__}.")
|
|
72
|
+
self.assertEqual(pysar_.__version__, "2.5.1",
|
|
73
|
+
f"pySAR version is not correct, expected 2.5.1, got {pysar_.__version__}.")
|
|
74
74
|
self.assertEqual(pysar_.__name__, "pySAR",
|
|
75
75
|
f"pySAR software name is not correct, expected pySAR, got {pysar_.__name__}.")
|
|
76
76
|
self.assertEqual(pysar_.__author__, "AJ McKenna: https://github.com/amckenna41",
|
|
@@ -24,8 +24,6 @@ class UtilsTest(unittest.TestCase):
|
|
|
24
24
|
testing correct utils.valid_sequence functionality.
|
|
25
25
|
test_remove_gaps:
|
|
26
26
|
testing correct utils.remove_gaps functionality.
|
|
27
|
-
test_flatten:
|
|
28
|
-
testing correct utils.flatten functionality.
|
|
29
27
|
test_zero_padding:
|
|
30
28
|
testing correct utils.zero_padding functionality.
|
|
31
29
|
test_save_results:
|
|
@@ -129,42 +127,6 @@ class UtilsTest(unittest.TestCase):
|
|
|
129
127
|
self.assertIsInstance(seq4_test, str, f"Expected output to be of type str, got {type(seq4_test)}.")
|
|
130
128
|
self.assertNotIn('-', seq4_test, "Expected there to be no gaps (-) in the sequence.")
|
|
131
129
|
|
|
132
|
-
def test_flatten(self):
|
|
133
|
-
""" Test flatten utility function that flattens an array or list. """
|
|
134
|
-
seq1 = np.array([[1, 2, 3], [4, 5, 6]], np.int32)
|
|
135
|
-
seq2 = np.array([[1, 2, 3], [4, 5, 6],[7, 8, 9]], np.int32)
|
|
136
|
-
seq3 = np.random.randint(10,90,(4,5,2))
|
|
137
|
-
seq4 = ["A", "B", "C", "D", "E", "F"]
|
|
138
|
-
seq5 = "TUVWXYZ"
|
|
139
|
-
#1.)
|
|
140
|
-
flattened_array = utils.flatten(seq1)
|
|
141
|
-
self.assertEqual(flattened_array.shape, (6,1), f"Expected output shape to be (6,1), got {flattened_array.shape}.")
|
|
142
|
-
self.assertIsInstance(flattened_array, np.ndarray, f"Expected output to be of type np.ndarray, got {type(flattened_array)}.")
|
|
143
|
-
self.assertEqual(flattened_array.ndim, 2, f"Expected 2 output dimensions, got {flattened_array.ndim}.")
|
|
144
|
-
self.assertTrue((np.array([[1],[2],[3],[4],[5],[6]]) == flattened_array).all(),
|
|
145
|
-
f"Output array doesn't match expected:\n{flattened_array}.")
|
|
146
|
-
#2.)
|
|
147
|
-
flattened_array_2 = utils.flatten(seq2)
|
|
148
|
-
self.assertEqual(flattened_array_2.shape, (9,1), f"Expected output shape to be (9,1), got {flattened_array_2.shape}.")
|
|
149
|
-
self.assertIsInstance(flattened_array_2, np.ndarray, f"Expected output to be of type np.ndarray, got {type(flattened_array_2)}.")
|
|
150
|
-
self.assertEqual(flattened_array_2.ndim, 2, f"Expected 2 output dimensions, got {flattened_array_2.ndim}.")
|
|
151
|
-
self.assertTrue((np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9]]) == flattened_array_2).all(),
|
|
152
|
-
f"Output array doesn't match expected:\n{flattened_array_2}.")
|
|
153
|
-
#3.)
|
|
154
|
-
flattened_array_3 = utils.flatten(seq3)
|
|
155
|
-
self.assertEqual(flattened_array_3.shape, (40,1), f"Expected output shape to be (40,1), got {flattened_array_3.shape}.")
|
|
156
|
-
self.assertIsInstance(flattened_array_3, np.ndarray, f"Expected output to be of type np.ndarray, got {type(flattened_array_3)}.")
|
|
157
|
-
self.assertEqual(flattened_array_3.ndim, 2, f"Expected 2 output dimensions, got {flattened_array_3.ndim}.")
|
|
158
|
-
#4.)
|
|
159
|
-
flattened_array_4 = utils.flatten(seq4)
|
|
160
|
-
self.assertEqual(len(flattened_array_4), 6, f"Expected length of output to be 6, got {len(flattened_array_4)}.")
|
|
161
|
-
self.assertIsInstance(flattened_array_4, list, f"Expected output to be of type list, got {type(flattened_array_4)}.")
|
|
162
|
-
self.assertEqual(flattened_array_4, seq4, f"Output doesn't match expected sequence {seq4}.")
|
|
163
|
-
#5.)
|
|
164
|
-
flattened_array_5 = utils.flatten(seq5)
|
|
165
|
-
self.assertEqual(flattened_array_5, seq5, f"Output doesn't match expected sequence {seq5}.")
|
|
166
|
-
self.assertIsInstance(flattened_array_5, str, f"Expected output to be of type string, got {type(flattened_array_5)}.")
|
|
167
|
-
|
|
168
130
|
def test_zero_padding(self):
|
|
169
131
|
""" Test zero padding utility function that pads an array or list with 0's. """
|
|
170
132
|
seq1 = np.array([[1, 2, 3, 4, 5], [6, 7, 8]], dtype=object)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|