PySAR 2.5.0__tar.gz → 2.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33):
  1. {pysar-2.5.0 → pysar-2.5.2}/PKG-INFO +13 -6
  2. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/PKG-INFO +13 -6
  3. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/SOURCES.txt +1 -0
  4. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/requires.txt +0 -1
  5. {pysar-2.5.0 → pysar-2.5.2}/README.md +13 -5
  6. {pysar-2.5.0 → pysar-2.5.2}/docs/conf.py +1 -2
  7. {pysar-2.5.0 → pysar-2.5.2}/pySAR/__init__.py +7 -1
  8. pysar-2.5.2/pySAR/config.py +103 -0
  9. {pysar-2.5.0 → pysar-2.5.2}/pySAR/descriptors.py +59 -59
  10. {pysar-2.5.0 → pysar-2.5.2}/pySAR/encoding.py +240 -37
  11. {pysar-2.5.0 → pysar-2.5.2}/pySAR/evaluate.py +6 -4
  12. pysar-2.5.2/pySAR/globals_.py +38 -0
  13. {pysar-2.5.0 → pysar-2.5.2}/pySAR/model.py +163 -30
  14. {pysar-2.5.0 → pysar-2.5.2}/pySAR/plots.py +7 -4
  15. {pysar-2.5.0 → pysar-2.5.2}/pySAR/pyDSP.py +63 -108
  16. {pysar-2.5.0 → pysar-2.5.2}/pySAR/pySAR.py +533 -231
  17. {pysar-2.5.0 → pysar-2.5.2}/pySAR/utils.py +14 -47
  18. {pysar-2.5.0 → pysar-2.5.2}/pyproject.toml +1 -2
  19. {pysar-2.5.0 → pysar-2.5.2}/tests/test_descriptors.py +52 -0
  20. {pysar-2.5.0 → pysar-2.5.2}/tests/test_encoding.py +164 -11
  21. {pysar-2.5.0 → pysar-2.5.2}/tests/test_evaluate.py +3 -3
  22. {pysar-2.5.0 → pysar-2.5.2}/tests/test_model.py +132 -14
  23. {pysar-2.5.0 → pysar-2.5.2}/tests/test_plots.py +4 -4
  24. {pysar-2.5.0 → pysar-2.5.2}/tests/test_pyDSP.py +66 -1
  25. {pysar-2.5.0 → pysar-2.5.2}/tests/test_pySAR.py +208 -22
  26. {pysar-2.5.0 → pysar-2.5.2}/tests/test_utils.py +37 -50
  27. pysar-2.5.0/pySAR/globals_.py +0 -21
  28. {pysar-2.5.0 → pysar-2.5.2}/LICENSE +0 -0
  29. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/dependency_links.txt +0 -0
  30. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/not-zip-safe +0 -0
  31. {pysar-2.5.0 → pysar-2.5.2}/PySAR.egg-info/top_level.txt +0 -0
  32. {pysar-2.5.0 → pysar-2.5.2}/pySAR/py.typed +0 -0
  33. {pysar-2.5.0 → pysar-2.5.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PySAR
3
- Version: 2.5.0
3
+ Version: 2.5.2
4
4
  Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
5
5
  Author-email: AJ McKenna <amckenna41@qub.ac.uk>
6
6
  Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
@@ -33,7 +33,6 @@ License-File: LICENSE
33
33
  Requires-Dist: numpy>=1.21
34
34
  Requires-Dist: pandas>=1.3
35
35
  Requires-Dist: scipy>=1.7
36
- Requires-Dist: delayed>=0.11
37
36
  Requires-Dist: scikit-learn>=1.0
38
37
  Requires-Dist: matplotlib>=3.4
39
38
  Requires-Dist: seaborn>=0.11
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
50
49
  Dynamic: license-file
51
50
 
52
51
  <p align="center">
53
- <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="300" width="400"/>
52
+ <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
54
53
  </p>
55
54
 
56
55
  # pySAR - Python Sequence Activity Relationship #
@@ -71,8 +70,13 @@ Dynamic: license-file
71
70
 
72
71
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
73
72
 
73
+
74
+ <h2 align="center">
75
+ The NEW front-end app for pySAR is available
76
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
77
+ </h2>
78
+
74
79
  * 📖 The published research article is available [here][article].
75
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
76
80
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
77
81
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
78
82
 
@@ -126,7 +130,6 @@ Requirements
126
130
  * [pandas][pandas] >= 1.3
127
131
  * [scikit-learn][sklearn] >= 1.0
128
132
  * [scipy][scipy] >= 1.7
129
- * [delayed][delayed] >= 0.11
130
133
  * [tqdm][tqdm] >= 4.60
131
134
  * [matplotlib][matplotlib] >= 3.4
132
135
  * [seaborn][seaborn] >= 0.11
@@ -711,6 +714,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
711
714
  DOI: 10.1021/acs.jcim.0c00073 <br><br>
712
715
  \[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
713
716
 
717
+
718
+ [<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
719
+
720
+
714
721
  <a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
715
722
 
716
723
  [Back to top](#TOP)
@@ -727,7 +734,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
727
734
  [tqdm]: https://tqdm.github.io/
728
735
  [seaborn]: https://seaborn.pydata.org/
729
736
  [matplotlib]: https://matplotlib.org/
730
- [delayed]: https://pypi.org/project/delayed/
731
737
  [PyPi]: https://pypi.org/project/pysar/
732
738
  [article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
733
739
  [pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
@@ -738,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
738
744
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
739
745
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
740
746
  [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
747
+ [frontend]: https://pysar-app.vercel.app/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PySAR
3
- Version: 2.5.0
3
+ Version: 2.5.2
4
4
  Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
5
5
  Author-email: AJ McKenna <amckenna41@qub.ac.uk>
6
6
  Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
@@ -33,7 +33,6 @@ License-File: LICENSE
33
33
  Requires-Dist: numpy>=1.21
34
34
  Requires-Dist: pandas>=1.3
35
35
  Requires-Dist: scipy>=1.7
36
- Requires-Dist: delayed>=0.11
37
36
  Requires-Dist: scikit-learn>=1.0
38
37
  Requires-Dist: matplotlib>=3.4
39
38
  Requires-Dist: seaborn>=0.11
@@ -50,7 +49,7 @@ Requires-Dist: sphinx; extra == "docs"
50
49
  Dynamic: license-file
51
50
 
52
51
  <p align="center">
53
- <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="300" width="400"/>
52
+ <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
54
53
  </p>
55
54
 
56
55
  # pySAR - Python Sequence Activity Relationship #
@@ -71,8 +70,13 @@ Dynamic: license-file
71
70
 
72
71
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
73
72
 
73
+
74
+ <h2 align="center">
75
+ The NEW front-end app for pySAR is available
76
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
77
+ </h2>
78
+
74
79
  * 📖 The published research article is available [here][article].
75
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
76
80
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
77
81
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
78
82
 
@@ -126,7 +130,6 @@ Requirements
126
130
  * [pandas][pandas] >= 1.3
127
131
  * [scikit-learn][sklearn] >= 1.0
128
132
  * [scipy][scipy] >= 1.7
129
- * [delayed][delayed] >= 0.11
130
133
  * [tqdm][tqdm] >= 4.60
131
134
  * [matplotlib][matplotlib] >= 3.4
132
135
  * [seaborn][seaborn] >= 0.11
@@ -711,6 +714,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
711
714
  DOI: 10.1021/acs.jcim.0c00073 <br><br>
712
715
  \[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
713
716
 
717
+
718
+ [<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
719
+
720
+
714
721
  <a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
715
722
 
716
723
  [Back to top](#TOP)
@@ -727,7 +734,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
727
734
  [tqdm]: https://tqdm.github.io/
728
735
  [seaborn]: https://seaborn.pydata.org/
729
736
  [matplotlib]: https://matplotlib.org/
730
- [delayed]: https://pypi.org/project/delayed/
731
737
  [PyPi]: https://pypi.org/project/pysar/
732
738
  [article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
733
739
  [pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
@@ -738,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
738
744
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
739
745
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
740
746
  [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
747
+ [frontend]: https://pysar-app.vercel.app/
@@ -9,6 +9,7 @@ PySAR.egg-info/requires.txt
9
9
  PySAR.egg-info/top_level.txt
10
10
  docs/conf.py
11
11
  pySAR/__init__.py
12
+ pySAR/config.py
12
13
  pySAR/descriptors.py
13
14
  pySAR/encoding.py
14
15
  pySAR/evaluate.py
@@ -1,7 +1,6 @@
1
1
  numpy>=1.21
2
2
  pandas>=1.3
3
3
  scipy>=1.7
4
- delayed>=0.11
5
4
  scikit-learn>=1.0
6
5
  matplotlib>=3.4
7
6
  seaborn>=0.11
@@ -1,5 +1,5 @@
1
1
  <p align="center">
2
- <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="300" width="400"/>
2
+ <img src="https://raw.githubusercontent.com/amckenna41/pySAR/master/images/pySAR.png" alt="pySARLogo" height="400" width="350"/>
3
3
  </p>
4
4
 
5
5
  # pySAR - Python Sequence Activity Relationship #
@@ -20,8 +20,13 @@
20
20
 
21
21
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
22
22
 
23
+
24
+ <h2 align="center">
25
+ The NEW front-end app for pySAR is available
26
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
27
+ </h2>
28
+
23
29
  * 📖 The published research article is available [here][article].
24
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
25
30
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
26
31
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
27
32
 
@@ -75,7 +80,6 @@ Requirements
75
80
  * [pandas][pandas] >= 1.3
76
81
  * [scikit-learn][sklearn] >= 1.0
77
82
  * [scipy][scipy] >= 1.7
78
- * [delayed][delayed] >= 0.11
79
83
  * [tqdm][tqdm] >= 4.60
80
84
  * [matplotlib][matplotlib] >= 3.4
81
85
  * [seaborn][seaborn] >= 0.11
@@ -660,6 +664,10 @@ Journal of Chemical Information and Modeling 2020 60 (6), 2773-2790
660
664
  DOI: 10.1021/acs.jcim.0c00073 <br><br>
661
665
  \[8\]: Medina-Ortiz, D., Contreras, S., Amado-Hinojosa, J., Torres-Almonacid, J., Asenjo, J. A., Navarrete, M., & Olivera-Nappa, Á. (2020). Combination of digital signal processing and assembled predictive models facilitates the rational design of proteins. ArXiv [Cs.CE]. <br>
662
666
 
667
+
668
+ [<img src="https://img.shields.io/github/stars/amckenna41/pySAR?color=green&label=star%20it%20on%20GitHub" width="132" height="20" alt="Star it on GitHub">](https://github.com/amckenna41/pySAR)
669
+
670
+
663
671
  <a href="https://www.buymeacoffee.com/amckenna41" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
664
672
 
665
673
  [Back to top](#TOP)
@@ -676,7 +684,6 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
676
684
  [tqdm]: https://tqdm.github.io/
677
685
  [seaborn]: https://seaborn.pydata.org/
678
686
  [matplotlib]: https://matplotlib.org/
679
- [delayed]: https://pypi.org/project/delayed/
680
687
  [PyPi]: https://pypi.org/project/pysar/
681
688
  [article]: https://www.sciencedirect.com/science/article/abs/pii/S1532046422000326
682
689
  [pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf
@@ -686,4 +693,5 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
686
693
  [license]: https://github.com/amckenna41/pySAR/blob/master/LICENSE
687
694
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
688
695
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
689
- [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
696
+ [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
697
+ [frontend]: https://pysar-app.vercel.app/
@@ -15,7 +15,7 @@ sys.path.insert(0, os.path.abspath('..'))
15
15
  project = 'pySAR'
16
16
  copyright = '2026, AJ McKenna'
17
17
  author = 'AJ McKenna'
18
- release = '2.5.0'
18
+ release = '2.5.2'
19
19
 
20
20
  # -- General configuration ---------------------------------------------------
21
21
  # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -36,7 +36,6 @@ autodoc_mock_imports = [
36
36
  'matplotlib',
37
37
  'seaborn',
38
38
  'tqdm',
39
- 'delayed',
40
39
  'aaindex',
41
40
  'protpy',
42
41
  ]
@@ -1,6 +1,6 @@
1
1
  """ pySAR software metadata. """
2
2
  __name__ = 'pySAR'
3
- __version__ = "2.5.0"
3
+ __version__ = "2.5.2"
4
4
  __description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.'
5
5
  __author__ = 'AJ McKenna: https://github.com/amckenna41'
6
6
  __authorEmail__ = 'amckenna41@qub.ac.uk'
@@ -13,6 +13,9 @@ __keywords__ = ["bioinformatics", "protein engineering", "python", "pypi", "mach
13
13
  "directed evolution", "drug discovery", "sequence activity relationships", "SAR", "aaindex", "protpy", "protein descriptors"]
14
14
  __test_suite__ = "tests"
15
15
 
16
+ from .encoding import SortKey, EncodingResult
17
+ from .config import PySARConfig
18
+
16
19
  __all__ = [
17
20
  '__version__',
18
21
  '__description__',
@@ -25,4 +28,7 @@ __all__ = [
25
28
  '__status__',
26
29
  '__keywords__',
27
30
  '__test_suite__',
31
+ 'SortKey',
32
+ 'EncodingResult',
33
+ 'PySARConfig',
28
34
  ]
@@ -0,0 +1,103 @@
1
+ ################################################################################
2
+ ################# PySARConfig #################
3
+ ################################################################################
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+
9
+ @dataclass
10
+ class PySARConfig:
11
+ """
12
+ Typed configuration container for PySAR and Encoding.
13
+
14
+ All parameters mirror the keys in the JSON configuration files so a
15
+ ``PySARConfig`` instance can be used wherever a config filepath is accepted.
16
+ Fields left as *None* fall back to the defaults encoded in the JSON file.
17
+
18
+ Parameters
19
+ ==========
20
+ :config_file: str
21
+ Path to the JSON configuration file. When provided all other fields
22
+ are used as overrides rather than replacements.
23
+ :dataset: str
24
+ Path to the CSV dataset of protein sequences and activity values.
25
+ :sequence_col: str
26
+ Name of the column in *dataset* that contains the protein sequences.
27
+ :activity_col: str
28
+ Name of the column in *dataset* that contains the activity/fitness values.
29
+ :algorithm: str
30
+ Sklearn regression algorithm name (e.g. ``'plsregression'``, ``'randomforest'``).
31
+ :parameters: dict
32
+ Keyword arguments forwarded to the sklearn model constructor.
33
+ :test_split: float
34
+ Fraction of data held back for testing (0 < test_split < 1).
35
+ :use_dsp: bool
36
+ Apply a DSP (FFT) pipeline to the AAI-encoded sequences before modelling.
37
+ :spectrum: str
38
+ Informational spectrum to use when *use_dsp* is True.
39
+ One of ``'power'``, ``'real'``, ``'imaginary'``, ``'absolute'``.
40
+ :window_type: str
41
+ Window function to apply before the FFT (e.g. ``'hamming'``, ``'blackman'``).
42
+ :filter_type: str
43
+ Filter to apply after the FFT (e.g. ``'savgol'``, ``'medfilt'``).
44
+ :descriptors_csv: str
45
+ Path to a pre-calculated descriptors CSV file. When provided the
46
+ ``Descriptors`` class will import values directly rather than
47
+ recomputing them.
48
+
49
+ Usage
50
+ =====
51
+ >>> cfg = PySARConfig(
52
+ ... config_file="thermostability.json",
53
+ ... algorithm="randomforest",
54
+ ... test_split=0.1,
55
+ ... )
56
+ >>> from pySAR import PySAR
57
+ >>> sar = PySAR(cfg.config_file, algorithm=cfg.algorithm, test_split=cfg.test_split)
58
+ """
59
+
60
+ config_file: str = ""
61
+ dataset: Optional[str] = None
62
+ sequence_col: Optional[str] = None
63
+ activity_col: Optional[str] = None
64
+ algorithm: Optional[str] = None
65
+ parameters: Optional[Dict[str, Any]] = None
66
+ test_split: Optional[float] = None
67
+ use_dsp: Optional[bool] = None
68
+ spectrum: Optional[str] = None
69
+ window_type: Optional[str] = None
70
+ filter_type: Optional[str] = None
71
+ descriptors_csv: Optional[str] = None
72
+
73
+ def to_kwargs(self) -> Dict[str, Any]:
74
+ """
75
+ Return a dict of non-None, non-config_file fields suitable for passing
76
+ as ``**kwargs`` to :class:`~pySAR.pySAR.PySAR` or
77
+ :class:`~pySAR.encoding.Encoding`.
78
+
79
+ Returns
80
+ =======
81
+ :kwargs: dict
82
+ Only fields that have been explicitly set (i.e. are not None) are
83
+ included. The ``config_file`` field is excluded since it is passed
84
+ as a positional argument.
85
+ """
86
+ result: Dict[str, Any] = {}
87
+ for field_name in (
88
+ "dataset",
89
+ "sequence_col",
90
+ "activity_col",
91
+ "algorithm",
92
+ "parameters",
93
+ "test_split",
94
+ "use_dsp",
95
+ "spectrum",
96
+ "window_type",
97
+ "filter_type",
98
+ "descriptors_csv",
99
+ ):
100
+ value = getattr(self, field_name)
101
+ if value is not None:
102
+ result[field_name] = value
103
+ return result
@@ -8,11 +8,11 @@ import pandas as pd
8
8
  import numpy as np
9
9
  from difflib import get_close_matches
10
10
  import json
11
- from json import JSONDecodeError
12
11
  import itertools
13
12
  import time
14
13
  from tqdm import tqdm
15
14
  from functools import lru_cache
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
16
 
17
17
  from .utils import *
18
18
  import protpy as protpy
@@ -375,13 +375,15 @@ class Descriptors():
375
375
  [14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
376
376
  vol. 33, no. 2, pp. 91–101, 2003.
377
377
  """
378
- def __init__(self,
379
- config_file: str = "",
380
- protein_seqs: Optional[Union[pd.Series, str]] = None,
378
+ def __init__(self,
379
+ config_file: str = "",
380
+ protein_seqs: Optional[Union[pd.Series, str]] = None,
381
+ n_jobs: int = 1,
381
382
  **kwargs) -> None:
382
383
 
383
384
  self.config_file = config_file
384
385
  self.protein_seqs = protein_seqs
386
+ self.n_jobs = max(1, int(n_jobs))
385
387
  self.kwargs = locals()['kwargs'] #get any keyword argument variables of class
386
388
  self.config_parameters = {}
387
389
 
@@ -1996,55 +1998,40 @@ class Descriptors():
1996
1998
  #start time counter
1997
1999
  start = time.time()
1998
2000
 
1999
- #iterate over all descriptors, calculating each using their respective function and the protpy package
2000
- for descr in tqdm(self.all_descriptors_list(), unit=" descriptor", position=0,
2001
- desc="Descriptors", mininterval=30, ncols=90):
2002
-
2003
- #if descriptor attribute DF is empty then call its respective get_descriptor function
2004
- if (descr == "amino_acid_composition" and getattr(self, "amino_acid_composition").empty):
2005
- self.amino_acid_composition = self.get_amino_acid_composition()
2006
-
2007
- if (descr == "dipeptide_composition" and getattr(self, "dipeptide_composition").empty):
2008
- self.dipeptide_composition = self.get_dipeptide_composition()
2009
-
2010
- if (descr == "tripeptide_composition" and getattr(self, "tripeptide_composition").empty):
2011
- self.tripeptide_composition = self.get_tripeptide_composition()
2012
-
2013
- if (descr == "moreaubroto_autocorrelation" and getattr(self, "moreaubroto_autocorrelation").empty):
2014
- self.moreaubroto_autocorrelation = self.get_moreaubroto_autocorrelation()
2015
-
2016
- if (descr == "moran_autocorrelation" and getattr(self, "moran_autocorrelation").empty):
2017
- self.moran_autocorrelation = self.get_moran_autocorrelation()
2018
-
2019
- if (descr == "geary_autocorrelation" and getattr(self, "geary_autocorrelation").empty):
2020
- self.geary_autocorrelation = self.get_geary_autocorrelation()
2021
-
2022
- if (descr == "ctd" and getattr(self, "ctd").empty):
2023
- self.ctd = self.get_ctd()
2024
-
2025
- if (descr == "ctd_composition" and getattr(self, "ctd_composition").empty):
2026
- self.ctd_composition = self.get_ctd_composition()
2027
-
2028
- if (descr == "ctd_transition" and getattr(self, "ctd_transition").empty):
2029
- self.ctd_transition = self.get_ctd_transition()
2030
-
2031
- if (descr == "ctd_distribution" and getattr(self, "ctd_distribution").empty):
2032
- self.ctd_distribution = self.get_ctd_distribution()
2033
-
2034
- if (descr == "conjoint_triad" and getattr(self, "conjoint_triad").empty):
2035
- self.conjoint_triad = self.get_conjoint_triad()
2036
-
2037
- if (descr == "sequence_order_coupling_number" and getattr(self, "sequence_order_coupling_number").empty):
2038
- self.sequence_order_coupling_number = self.get_sequence_order_coupling_number()
2039
-
2040
- if (descr == "quasi_sequence_order" and getattr(self, "quasi_sequence_order").empty):
2041
- self.quasi_sequence_order = self.get_quasi_sequence_order()
2042
-
2043
- if (descr == "pseudo_amino_acid_composition" and getattr(self, "pseudo_amino_acid_composition").empty):
2044
- self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition()
2001
+ #map each descriptor name to its getter for sequential and parallel dispatch
2002
+ _getter_map = [
2003
+ ('amino_acid_composition', self.get_amino_acid_composition),
2004
+ ('dipeptide_composition', self.get_dipeptide_composition),
2005
+ ('tripeptide_composition', self.get_tripeptide_composition),
2006
+ ('moreaubroto_autocorrelation', self.get_moreaubroto_autocorrelation),
2007
+ ('moran_autocorrelation', self.get_moran_autocorrelation),
2008
+ ('geary_autocorrelation', self.get_geary_autocorrelation),
2009
+ ('ctd', self.get_ctd),
2010
+ ('ctd_composition', self.get_ctd_composition),
2011
+ ('ctd_transition', self.get_ctd_transition),
2012
+ ('ctd_distribution', self.get_ctd_distribution),
2013
+ ('conjoint_triad', self.get_conjoint_triad),
2014
+ ('sequence_order_coupling_number', self.get_sequence_order_coupling_number),
2015
+ ('quasi_sequence_order', self.get_quasi_sequence_order),
2016
+ ('pseudo_amino_acid_composition', self.get_pseudo_amino_acid_composition),
2017
+ ('amphiphilic_pseudo_amino_acid_composition', self.get_amphiphilic_pseudo_amino_acid_composition),
2018
+ ]
2045
2019
 
2046
- if (descr == "amphiphilic_pseudo_amino_acid_composition" and getattr(self, "amphiphilic_pseudo_amino_acid_composition").empty):
2047
- self.amphiphilic_pseudo_amino_acid_composition = self.get_amphiphilic_pseudo_amino_acid_composition()
2020
+ if self.n_jobs > 1:
2021
+ #compute descriptors concurrently; skip any already populated from a prior import
2022
+ pending = [(name, getter) for name, getter in _getter_map if getattr(self, name).empty]
2023
+ with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
2024
+ futures = {executor.submit(getter): name for name, getter in pending}
2025
+ for future in tqdm(as_completed(futures), total=len(futures), unit=" descriptor",
2026
+ desc="Descriptors", ncols=90):
2027
+ name = futures[future]
2028
+ setattr(self, name, future.result())
2029
+ else:
2030
+ #iterate over all descriptors sequentially, calculating each using their respective function
2031
+ for name, getter in tqdm(_getter_map, unit=" descriptor", position=0,
2032
+ desc="Descriptors", mininterval=30, ncols=90):
2033
+ if getattr(self, name).empty:
2034
+ setattr(self, name, getter())
2048
2035
 
2049
2036
  #stop time counter, calculate elapsed time
2050
2037
  end = time.time()
@@ -2321,13 +2308,14 @@ class Descriptors():
2321
2308
 
2322
2309
  return all_descriptors
2323
2310
 
2324
- def _calculate_descriptor_batch(self,
2325
- descriptor_func: Callable,
2311
+ def _calculate_descriptor_batch(self,
2312
+ descriptor_func: Callable,
2326
2313
  desc_name: str = "",
2327
2314
  **kwargs) -> pd.DataFrame:
2328
2315
  """
2329
2316
  Generic helper method to calculate descriptors for all sequences, preventing code repetition.
2330
-
2317
+ Uses self.n_jobs threads to parallelise across sequences when n_jobs > 1.
2318
+
2331
2319
  Parameters
2332
2320
  ==========
2333
2321
  :descriptor_func: Callable
@@ -2336,16 +2324,28 @@ class Descriptors():
2336
2324
  Name of descriptor for progress tracking
2337
2325
  :kwargs: dict
2338
2326
  Additional keyword arguments to pass to descriptor function
2339
-
2327
+
2340
2328
  Returns
2341
2329
  =======
2342
2330
  :pd.DataFrame
2343
2331
  Dataframe with calculated descriptor values for all sequences
2344
2332
  """
2345
- iterator = tqdm(self.protein_seqs, desc=f"Computing {desc_name}") if desc_name else self.protein_seqs
2333
+ seqs = list(self.protein_seqs)
2346
2334
 
2347
- # accumulate results in a list to avoid O(n²) repeated concat
2348
- desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
2335
+ if self.n_jobs <= 1:
2336
+ iterator = tqdm(seqs, desc=f"Computing {desc_name}", ncols=90) if desc_name else seqs
2337
+ # accumulate results in a list to avoid O(n²) repeated concat
2338
+ desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
2339
+ else:
2340
+ desc_list = [None] * len(seqs)
2341
+ with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
2342
+ futures = {executor.submit(descriptor_func, seq, **kwargs): i
2343
+ for i, seq in enumerate(seqs)}
2344
+ progress = tqdm(as_completed(futures), total=len(seqs),
2345
+ desc=f"Computing {desc_name}", ncols=90) if desc_name else as_completed(futures)
2346
+ for future in progress:
2347
+ i = futures[future]
2348
+ desc_list[i] = future.result()
2349
2349
 
2350
2350
  return pd.concat(desc_list, ignore_index=False).reset_index(drop=True)
2351
2351