PySAR 2.5.0__tar.gz → 2.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysar-2.5.0 → pysar-2.5.2}/PKG-INFO +13 -6
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/PKG-INFO +13 -6
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/SOURCES.txt +1 -0
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/requires.txt +0 -1
- {pysar-2.5.0 → pysar-2.5.2}/README.md +13 -5
- {pysar-2.5.0 → pysar-2.5.2}/docs/conf.py +1 -2
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/__init__.py +7 -1
- pysar-2.5.2/pySAR/config.py +103 -0
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/descriptors.py +59 -59
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/encoding.py +240 -37
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/evaluate.py +6 -4
- pysar-2.5.2/pySAR/globals_.py +38 -0
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/model.py +163 -30
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/plots.py +7 -4
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/pyDSP.py +63 -108
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/pySAR.py +533 -231
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/utils.py +14 -47
- {pysar-2.5.0 → pysar-2.5.2}/pyproject.toml +1 -2
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_descriptors.py +52 -0
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_encoding.py +164 -11
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_evaluate.py +3 -3
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_model.py +132 -14
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_plots.py +4 -4
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_pyDSP.py +66 -1
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_pySAR.py +208 -22
- {pysar-2.5.0 → pysar-2.5.2}/tests/test_utils.py +37 -50
- pysar-2.5.0/pySAR/globals_.py +0 -21
- {pysar-2.5.0 → pysar-2.5.2}/LICENSE +0 -0
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/dependency_links.txt +0 -0
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/not-zip-safe +0 -0
- {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/top_level.txt +0 -0
- {pysar-2.5.0 → pysar-2.5.2}/pySAR/py.typed +0 -0
- {pysar-2.5.0 → pysar-2.5.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -33,7 +33,6 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: numpy>=1.21
|
|
34
34
|
Requires-Dist: pandas>=1.3
|
|
35
35
|
Requires-Dist: scipy>=1.7
|
|
36
|
-
Requires-Dist: delayed>=0.11
|
|
37
36
|
Requires-Dist: scikit-learn>=1.0
|
|
38
37
|
Requires-Dist: matplotlib>=3.4
|
|
39
38
|
Requires-Dist: seaborn>=0.11
|
|
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
|
|
|
50
49
|
Dynamic: license-file
|
|
51
50
|
|
|
52
51
|
<p align="center">
|
|
53
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
52
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
54
53
|
</p>
|
|
55
54
|
|
|
56
55
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -71,8 +70,13 @@ Dynamic: license-file
|
|
|
71
70
|
|
|
72
71
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
73
72
|
|
|
73
|
+
|
|
74
|
+
<h2 align="center">
|
|
75
|
+
The NEW front-end app for pySAR is available
|
|
76
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
77
|
+
</h2>
|
|
78
|
+
|
|
74
79
|
* 📖 The published research article is available [here][article].
|
|
75
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
76
80
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
77
81
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
78
82
|
|
|
@@ -126,7 +130,6 @@ Requirements
|
|
|
126
130
|
* [pandas][pandas] >= 1.3
|
|
127
131
|
* [scikit-learn][sklearn] >= 1.0
|
|
128
132
|
* [scipy][scipy] >= 1.7
|
|
129
|
-
* [delayed][delayed] >= 0.11
|
|
130
133
|
* [tqdm][tqdm] >= 4.60
|
|
131
134
|
* [matplotlib][matplotlib] >= 3.4
|
|
132
135
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -711,6 +714,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
711
714
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
712
715
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
713
716
|
|
|
717
|
+
|
|
718
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
719
|
+
|
|
720
|
+
|
|
714
721
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
715
722
|
|
|
716
723
|
[Back to top](#TOP)
|
|
@@ -727,7 +734,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
727
734
|
[tqdm]: https://tqdm.github.io/
|
|
728
735
|
[seaborn]: https://seaborn.pydata.org/
|
|
729
736
|
[matplotlib]: https://matplotlib.org/
|
|
730
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
731
737
|
[PyPi]: https://pypi.org/project/pysar/
|
|
732
738
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
733
739
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -738,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
738
744
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
739
745
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
740
746
|
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
747
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -33,7 +33,6 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: numpy>=1.21
|
|
34
34
|
Requires-Dist: pandas>=1.3
|
|
35
35
|
Requires-Dist: scipy>=1.7
|
|
36
|
-
Requires-Dist: delayed>=0.11
|
|
37
36
|
Requires-Dist: scikit-learn>=1.0
|
|
38
37
|
Requires-Dist: matplotlib>=3.4
|
|
39
38
|
Requires-Dist: seaborn>=0.11
|
|
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
|
|
|
50
49
|
Dynamic: license-file
|
|
51
50
|
|
|
52
51
|
<p align="center">
|
|
53
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
52
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
54
53
|
</p>
|
|
55
54
|
|
|
56
55
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -71,8 +70,13 @@ Dynamic: license-file
|
|
|
71
70
|
|
|
72
71
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
73
72
|
|
|
73
|
+
|
|
74
|
+
<h2 align="center">
|
|
75
|
+
The NEW front-end app for pySAR is available
|
|
76
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
77
|
+
</h2>
|
|
78
|
+
|
|
74
79
|
* 📖 The published research article is available [here][article].
|
|
75
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
76
80
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
77
81
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
78
82
|
|
|
@@ -126,7 +130,6 @@ Requirements
|
|
|
126
130
|
* [pandas][pandas] >= 1.3
|
|
127
131
|
* [scikit-learn][sklearn] >= 1.0
|
|
128
132
|
* [scipy][scipy] >= 1.7
|
|
129
|
-
* [delayed][delayed] >= 0.11
|
|
130
133
|
* [tqdm][tqdm] >= 4.60
|
|
131
134
|
* [matplotlib][matplotlib] >= 3.4
|
|
132
135
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -711,6 +714,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
711
714
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
712
715
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
713
716
|
|
|
717
|
+
|
|
718
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
719
|
+
|
|
720
|
+
|
|
714
721
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
715
722
|
|
|
716
723
|
[Back to top](#TOP)
|
|
@@ -727,7 +734,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
727
734
|
[tqdm]: https://tqdm.github.io/
|
|
728
735
|
[seaborn]: https://seaborn.pydata.org/
|
|
729
736
|
[matplotlib]: https://matplotlib.org/
|
|
730
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
731
737
|
[PyPi]: https://pypi.org/project/pysar/
|
|
732
738
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
733
739
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -738,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
738
744
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
739
745
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
740
746
|
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
747
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="
|
|
2
|
+
<img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
# pySAR - Python Sequence Activity Relationship #
|
|
@@ -20,8 +20,13 @@
|
|
|
20
20
|
|
|
21
21
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
22
22
|
|
|
23
|
+
|
|
24
|
+
<h2 align="center">
|
|
25
|
+
The NEW front-end app for pySAR is available
|
|
26
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
27
|
+
</h2>
|
|
28
|
+
|
|
23
29
|
* 📖 The published research article is available [here][article].
|
|
24
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
25
30
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
26
31
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
27
32
|
|
|
@@ -75,7 +80,6 @@ Requirements
|
|
|
75
80
|
* [pandas][pandas] >= 1.3
|
|
76
81
|
* [scikit-learn][sklearn] >= 1.0
|
|
77
82
|
* [scipy][scipy] >= 1.7
|
|
78
|
-
* [delayed][delayed] >= 0.11
|
|
79
83
|
* [tqdm][tqdm] >= 4.60
|
|
80
84
|
* [matplotlib][matplotlib] >= 3.4
|
|
81
85
|
* [seaborn][seaborn] >= 0.11
|
|
@@ -660,6 +664,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
|
|
|
660
664
|
DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
661
665
|
\[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
|
|
662
666
|
|
|
667
|
+
|
|
668
|
+
[<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
|
|
669
|
+
|
|
670
|
+
|
|
663
671
|
<a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
|
|
664
672
|
|
|
665
673
|
[Back to top](#TOP)
|
|
@@ -676,7 +684,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
676
684
|
[tqdm]: https://tqdm.github.io/
|
|
677
685
|
[seaborn]: https://seaborn.pydata.org/
|
|
678
686
|
[matplotlib]: https://matplotlib.org/
|
|
679
|
-
[delayed]: https://pypi.org/project/delayed/
|
|
680
687
|
[PyPi]: https://pypi.org/project/pysar/
|
|
681
688
|
[article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
|
|
682
689
|
[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
|
|
@@ -686,4 +693,5 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
686
693
|
[license]: https://github.com/amckenna41/pySAR/blob/master/LICENSE
|
|
687
694
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
688
695
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
689
|
-
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
696
|
+
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
697
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -15,7 +15,7 @@ sys.path.insert(0, os.path.abspath('..'))
|
|
|
15
15
|
project = 'pySAR'
|
|
16
16
|
copyright = '2026, AJ McKenna'
|
|
17
17
|
author = 'AJ McKenna'
|
|
18
|
-
release = '2.5.0'
|
|
18
|
+
release = '2.5.2'
|
|
19
19
|
|
|
20
20
|
# -- General configuration ---------------------------------------------------
|
|
21
21
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
@@ -36,7 +36,6 @@ autodoc_mock_imports = [
|
|
|
36
36
|
'matplotlib',
|
|
37
37
|
'seaborn',
|
|
38
38
|
'tqdm',
|
|
39
|
-
'delayed',
|
|
40
39
|
'aaindex',
|
|
41
40
|
'protpy',
|
|
42
41
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
""" pySAR software metadata. """
|
|
2
2
|
__name__ = 'pySAR'
|
|
3
|
-
__version__ = "2.5.0"
|
|
3
|
+
__version__ = "2.5.2"
|
|
4
4
|
__description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.'
|
|
5
5
|
__author__ = 'AJ McKenna: https://github.com/amckenna41'
|
|
6
6
|
__authorEmail__ = 'amckenna41@qub.ac.uk'
|
|
@@ -13,6 +13,9 @@ __keywords__ = ["bioinformatics", "protein engineering", "python", "pypi", "mach
|
|
|
13
13
|
"directed evolution", "drug discovery", "sequence activity relationships", "SAR", "aaindex", "protpy", "protein descriptors"]
|
|
14
14
|
__test_suite__ = "tests"
|
|
15
15
|
|
|
16
|
+
from .encoding import SortKey, EncodingResult
|
|
17
|
+
from .config import PySARConfig
|
|
18
|
+
|
|
16
19
|
__all__ = [
|
|
17
20
|
'__version__',
|
|
18
21
|
'__description__',
|
|
@@ -25,4 +28,7 @@ __all__ = [
|
|
|
25
28
|
'__status__',
|
|
26
29
|
'__keywords__',
|
|
27
30
|
'__test_suite__',
|
|
31
|
+
'SortKey',
|
|
32
|
+
'EncodingResult',
|
|
33
|
+
'PySARConfig',
|
|
28
34
|
]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# PySARConfig #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class PySARConfig:
|
|
11
|
+
"""
|
|
12
|
+
Typed configuration container for PySAR and Encoding.
|
|
13
|
+
|
|
14
|
+
All parameters mirror the keys in the JSON configuration files so a
|
|
15
|
+
``PySARConfig`` instance can be used wherever a config filepath is accepted.
|
|
16
|
+
Fields left as *None* fall back to the defaults encoded in the JSON file.
|
|
17
|
+
|
|
18
|
+
Parameters
|
|
19
|
+
==========
|
|
20
|
+
:config_file: str
|
|
21
|
+
Path to the JSON configuration file. When provided all other fields
|
|
22
|
+
are used as overrides rather than replacements.
|
|
23
|
+
:dataset: str
|
|
24
|
+
Path to the CSV dataset of protein sequences and activity values.
|
|
25
|
+
:sequence_col: str
|
|
26
|
+
Name of the column in *dataset* that contains the protein sequences.
|
|
27
|
+
:activity_col: str
|
|
28
|
+
Name of the column in *dataset* that contains the activity/fitness values.
|
|
29
|
+
:algorithm: str
|
|
30
|
+
Sklearn regression algorithm name (e.g. ``'plsregression'``, ``'randomforest'``).
|
|
31
|
+
:parameters: dict
|
|
32
|
+
Keyword arguments forwarded to the sklearn model constructor.
|
|
33
|
+
:test_split: float
|
|
34
|
+
Fraction of data held back for testing (0 < test_split < 1).
|
|
35
|
+
:use_dsp: bool
|
|
36
|
+
Apply a DSP (FFT) pipeline to the AAI-encoded sequences before modelling.
|
|
37
|
+
:spectrum: str
|
|
38
|
+
Informational spectrum to use when *use_dsp* is True.
|
|
39
|
+
One of ``'power'``, ``'real'``, ``'imaginary'``, ``'absolute'``.
|
|
40
|
+
:window_type: str
|
|
41
|
+
Window function to apply before the FFT (e.g. ``'hamming'``, ``'blackman'``).
|
|
42
|
+
:filter_type: str
|
|
43
|
+
Filter to apply after the FFT (e.g. ``'savgol'``, ``'medfilt'``).
|
|
44
|
+
:descriptors_csv: str
|
|
45
|
+
Path to a pre-calculated descriptors CSV file. When provided the
|
|
46
|
+
``Descriptors`` class will import values directly rather than
|
|
47
|
+
recomputing them.
|
|
48
|
+
|
|
49
|
+
Usage
|
|
50
|
+
=====
|
|
51
|
+
>>> cfg = PySARConfig(
|
|
52
|
+
... config_file="thermostability.json",
|
|
53
|
+
... algorithm="randomforest",
|
|
54
|
+
... test_split=0.1,
|
|
55
|
+
... )
|
|
56
|
+
>>> from pySAR import PySAR
|
|
57
|
+
>>> sar = PySAR(cfg.config_file, algorithm=cfg.algorithm, test_split=cfg.test_split)
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
config_file: str = ""
|
|
61
|
+
dataset: Optional[str] = None
|
|
62
|
+
sequence_col: Optional[str] = None
|
|
63
|
+
activity_col: Optional[str] = None
|
|
64
|
+
algorithm: Optional[str] = None
|
|
65
|
+
parameters: Optional[Dict[str, Any]] = None
|
|
66
|
+
test_split: Optional[float] = None
|
|
67
|
+
use_dsp: Optional[bool] = None
|
|
68
|
+
spectrum: Optional[str] = None
|
|
69
|
+
window_type: Optional[str] = None
|
|
70
|
+
filter_type: Optional[str] = None
|
|
71
|
+
descriptors_csv: Optional[str] = None
|
|
72
|
+
|
|
73
|
+
def to_kwargs(self) -> Dict[str, Any]:
|
|
74
|
+
"""
|
|
75
|
+
Return a dict of non-None, non-config_file fields suitable for passing
|
|
76
|
+
as ``**kwargs`` to :class:`~pySAR.pySAR.PySAR` or
|
|
77
|
+
:class:`~pySAR.encoding.Encoding`.
|
|
78
|
+
|
|
79
|
+
Returns
|
|
80
|
+
=======
|
|
81
|
+
:kwargs: dict
|
|
82
|
+
Only fields that have been explicitly set (i.e. are not None) are
|
|
83
|
+
included. The ``config_file`` field is excluded since it is passed
|
|
84
|
+
as a positional argument.
|
|
85
|
+
"""
|
|
86
|
+
result: Dict[str, Any] = {}
|
|
87
|
+
for field_name in (
|
|
88
|
+
"dataset",
|
|
89
|
+
"sequence_col",
|
|
90
|
+
"activity_col",
|
|
91
|
+
"algorithm",
|
|
92
|
+
"parameters",
|
|
93
|
+
"test_split",
|
|
94
|
+
"use_dsp",
|
|
95
|
+
"spectrum",
|
|
96
|
+
"window_type",
|
|
97
|
+
"filter_type",
|
|
98
|
+
"descriptors_csv",
|
|
99
|
+
):
|
|
100
|
+
value = getattr(self, field_name)
|
|
101
|
+
if value is not None:
|
|
102
|
+
result[field_name] = value
|
|
103
|
+
return result
|
|
@@ -8,11 +8,11 @@ import pandas as pd
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
from difflib import get_close_matches
|
|
10
10
|
import json
|
|
11
|
-
from json import JSONDecodeError
|
|
12
11
|
import itertools
|
|
13
12
|
import time
|
|
14
13
|
from tqdm import tqdm
|
|
15
14
|
from functools import lru_cache
|
|
15
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
16
|
|
|
17
17
|
from .utils import *
|
|
18
18
|
import protpy as protpy
|
|
@@ -375,13 +375,15 @@ class Descriptors():
|
|
|
375
375
|
[14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
|
|
376
376
|
vol. 33, no. 2, pp. 91–101, 2003.
|
|
377
377
|
"""
|
|
378
|
-
def __init__(self,
|
|
379
|
-
config_file: str = "",
|
|
380
|
-
protein_seqs: Optional[Union[pd.Series, str]] = None,
|
|
378
|
+
def __init__(self,
|
|
379
|
+
config_file: str = "",
|
|
380
|
+
protein_seqs: Optional[Union[pd.Series, str]] = None,
|
|
381
|
+
n_jobs: int = 1,
|
|
381
382
|
**kwargs) -> None:
|
|
382
383
|
|
|
383
384
|
self.config_file = config_file
|
|
384
385
|
self.protein_seqs = protein_seqs
|
|
386
|
+
self.n_jobs = max(1, int(n_jobs))
|
|
385
387
|
self.kwargs = locals()['kwargs'] #get any keyword argument variables of class
|
|
386
388
|
self.config_parameters = {}
|
|
387
389
|
|
|
@@ -1996,55 +1998,40 @@ class Descriptors():
|
|
|
1996
1998
|
#start time counter
|
|
1997
1999
|
start = time.time()
|
|
1998
2000
|
|
|
1999
|
-
#
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
self.moran_autocorrelation = self.get_moran_autocorrelation()
|
|
2018
|
-
|
|
2019
|
-
if (descr == "geary_autocorrelation" and getattr(self, "geary_autocorrelation").empty):
|
|
2020
|
-
self.geary_autocorrelation = self.get_geary_autocorrelation()
|
|
2021
|
-
|
|
2022
|
-
if (descr == "ctd" and getattr(self, "ctd").empty):
|
|
2023
|
-
self.ctd = self.get_ctd()
|
|
2024
|
-
|
|
2025
|
-
if (descr == "ctd_composition" and getattr(self, "ctd_composition").empty):
|
|
2026
|
-
self.ctd_composition = self.get_ctd_composition()
|
|
2027
|
-
|
|
2028
|
-
if (descr == "ctd_transition" and getattr(self, "ctd_transition").empty):
|
|
2029
|
-
self.ctd_transition = self.get_ctd_transition()
|
|
2030
|
-
|
|
2031
|
-
if (descr == "ctd_distribution" and getattr(self, "ctd_distribution").empty):
|
|
2032
|
-
self.ctd_distribution = self.get_ctd_distribution()
|
|
2033
|
-
|
|
2034
|
-
if (descr == "conjoint_triad" and getattr(self, "conjoint_triad").empty):
|
|
2035
|
-
self.conjoint_triad = self.get_conjoint_triad()
|
|
2036
|
-
|
|
2037
|
-
if (descr == "sequence_order_coupling_number" and getattr(self, "sequence_order_coupling_number").empty):
|
|
2038
|
-
self.sequence_order_coupling_number = self.get_sequence_order_coupling_number()
|
|
2039
|
-
|
|
2040
|
-
if (descr == "quasi_sequence_order" and getattr(self, "quasi_sequence_order").empty):
|
|
2041
|
-
self.quasi_sequence_order = self.get_quasi_sequence_order()
|
|
2042
|
-
|
|
2043
|
-
if (descr == "pseudo_amino_acid_composition" and getattr(self, "pseudo_amino_acid_composition").empty):
|
|
2044
|
-
self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition()
|
|
2001
|
+
#map each descriptor name to its getter for sequential and parallel dispatch
|
|
2002
|
+
_getter_map = [
|
|
2003
|
+
('amino_acid_composition', self.get_amino_acid_composition),
|
|
2004
|
+
('dipeptide_composition', self.get_dipeptide_composition),
|
|
2005
|
+
('tripeptide_composition', self.get_tripeptide_composition),
|
|
2006
|
+
('moreaubroto_autocorrelation', self.get_moreaubroto_autocorrelation),
|
|
2007
|
+
('moran_autocorrelation', self.get_moran_autocorrelation),
|
|
2008
|
+
('geary_autocorrelation', self.get_geary_autocorrelation),
|
|
2009
|
+
('ctd', self.get_ctd),
|
|
2010
|
+
('ctd_composition', self.get_ctd_composition),
|
|
2011
|
+
('ctd_transition', self.get_ctd_transition),
|
|
2012
|
+
('ctd_distribution', self.get_ctd_distribution),
|
|
2013
|
+
('conjoint_triad', self.get_conjoint_triad),
|
|
2014
|
+
('sequence_order_coupling_number', self.get_sequence_order_coupling_number),
|
|
2015
|
+
('quasi_sequence_order', self.get_quasi_sequence_order),
|
|
2016
|
+
('pseudo_amino_acid_composition', self.get_pseudo_amino_acid_composition),
|
|
2017
|
+
('amphiphilic_pseudo_amino_acid_composition', self.get_amphiphilic_pseudo_amino_acid_composition),
|
|
2018
|
+
]
|
|
2045
2019
|
|
|
2046
|
-
|
|
2047
|
-
|
|
2020
|
+
if self.n_jobs > 1:
|
|
2021
|
+
#compute descriptors concurrently; skip any already populated from a prior import
|
|
2022
|
+
pending = [(name, getter) for name, getter in _getter_map if getattr(self, name).empty]
|
|
2023
|
+
with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
|
|
2024
|
+
futures = {executor.submit(getter): name for name, getter in pending}
|
|
2025
|
+
for future in tqdm(as_completed(futures), total=len(futures), unit=" descriptor",
|
|
2026
|
+
desc="Descriptors", ncols=90):
|
|
2027
|
+
name = futures[future]
|
|
2028
|
+
setattr(self, name, future.result())
|
|
2029
|
+
else:
|
|
2030
|
+
#iterate over all descriptors sequentially, calculating each using their respective function
|
|
2031
|
+
for name, getter in tqdm(_getter_map, unit=" descriptor", position=0,
|
|
2032
|
+
desc="Descriptors", mininterval=30, ncols=90):
|
|
2033
|
+
if getattr(self, name).empty:
|
|
2034
|
+
setattr(self, name, getter())
|
|
2048
2035
|
|
|
2049
2036
|
#stop time counter, calculate elapsed time
|
|
2050
2037
|
end = time.time()
|
|
@@ -2321,13 +2308,14 @@ class Descriptors():
|
|
|
2321
2308
|
|
|
2322
2309
|
return all_descriptors
|
|
2323
2310
|
|
|
2324
|
-
def _calculate_descriptor_batch(self,
|
|
2325
|
-
descriptor_func: Callable,
|
|
2311
|
+
def _calculate_descriptor_batch(self,
|
|
2312
|
+
descriptor_func: Callable,
|
|
2326
2313
|
desc_name: str = "",
|
|
2327
2314
|
**kwargs) -> pd.DataFrame:
|
|
2328
2315
|
"""
|
|
2329
2316
|
Generic helper method to calculate descriptors for all sequences, preventing code repetition.
|
|
2330
|
-
|
|
2317
|
+
Uses self.n_jobs threads to parallelise across sequences when n_jobs > 1.
|
|
2318
|
+
|
|
2331
2319
|
Parameters
|
|
2332
2320
|
==========
|
|
2333
2321
|
:descriptor_func: Callable
|
|
@@ -2336,16 +2324,28 @@ class Descriptors():
|
|
|
2336
2324
|
Name of descriptor for progress tracking
|
|
2337
2325
|
:kwargs: dict
|
|
2338
2326
|
Additional keyword arguments to pass to descriptor function
|
|
2339
|
-
|
|
2327
|
+
|
|
2340
2328
|
Returns
|
|
2341
2329
|
=======
|
|
2342
2330
|
:pd.DataFrame
|
|
2343
2331
|
Dataframe with calculated descriptor values for all sequences
|
|
2344
2332
|
"""
|
|
2345
|
-
|
|
2333
|
+
seqs = list(self.protein_seqs)
|
|
2346
2334
|
|
|
2347
|
-
|
|
2348
|
-
|
|
2335
|
+
if self.n_jobs <= 1:
|
|
2336
|
+
iterator = tqdm(seqs, desc=f"Computing {desc_name}", ncols=90) if desc_name else seqs
|
|
2337
|
+
# accumulate results in a list to avoid O(n²) repeated concat
|
|
2338
|
+
desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
|
|
2339
|
+
else:
|
|
2340
|
+
desc_list = [None] * len(seqs)
|
|
2341
|
+
with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
|
|
2342
|
+
futures = {executor.submit(descriptor_func, seq, **kwargs): i
|
|
2343
|
+
for i, seq in enumerate(seqs)}
|
|
2344
|
+
progress = tqdm(as_completed(futures), total=len(seqs),
|
|
2345
|
+
desc=f"Computing {desc_name}", ncols=90) if desc_name else as_completed(futures)
|
|
2346
|
+
for future in progress:
|
|
2347
|
+
i = futures[future]
|
|
2348
|
+
desc_list[i] = future.result()
|
|
2349
2349
|
|
|
2350
2350
|
return pd.concat(desc_list, ignore_index=False).reset_index(drop=True)
|
|
2351
2351
|
|