PySAR 2.5.1__tar.gz → 2.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysar-2.5.1 → pysar-2.5.2}/PKG-INFO +8 -2
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/PKG-INFO +8 -2
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/SOURCES.txt +1 -0
- {pysar-2.5.1 → pysar-2.5.2}/README.md +8 -2
- {pysar-2.5.1 → pysar-2.5.2}/docs/conf.py +1 -1
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/__init__.py +7 -1
- pysar-2.5.2/pySAR/config.py +103 -0
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/descriptors.py +59 -58
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/encoding.py +240 -37
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/evaluate.py +6 -4
- pysar-2.5.2/pySAR/globals_.py +38 -0
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/model.py +157 -18
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/plots.py +7 -4
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/pyDSP.py +63 -108
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/pySAR.py +523 -220
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/utils.py +14 -10
- {pysar-2.5.1 → pysar-2.5.2}/pyproject.toml +1 -1
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_descriptors.py +52 -0
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_encoding.py +164 -11
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_evaluate.py +3 -3
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_model.py +130 -12
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_plots.py +4 -4
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_pyDSP.py +66 -1
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_pySAR.py +208 -22
- {pysar-2.5.1 → pysar-2.5.2}/tests/test_utils.py +38 -13
- pysar-2.5.1/pySAR/globals_.py +0 -18
- {pysar-2.5.1 → pysar-2.5.2}/LICENSE +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/dependency_links.txt +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/not-zip-safe +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/requires.txt +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/top_level.txt +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/pySAR/py.typed +0 -0
- {pysar-2.5.1 → pysar-2.5.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.1
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -70,8 +70,13 @@ Dynamic: license-file
|
|
|
70
70
|
|
|
71
71
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
72
72
|
|
|
73
|
+
|
|
74
|
+
<h2 align="center">
|
|
75
|
+
The NEW front-end app for pySAR is available
|
|
76
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
77
|
+
</h2>
|
|
78
|
+
|
|
73
79
|
* 📖 The published research article is available [here][article].
|
|
74
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
75
80
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
76
81
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
77
82
|
|
|
@@ -739,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
739
744
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
740
745
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
741
746
|
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
747
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PySAR
|
|
3
|
-
Version: 2.5.1
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
|
|
5
5
|
Author-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
6
6
|
Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
|
|
@@ -70,8 +70,13 @@ Dynamic: license-file
|
|
|
70
70
|
|
|
71
71
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
72
72
|
|
|
73
|
+
|
|
74
|
+
<h2 align="center">
|
|
75
|
+
The NEW front-end app for pySAR is available
|
|
76
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
77
|
+
</h2>
|
|
78
|
+
|
|
73
79
|
* 📖 The published research article is available [here][article].
|
|
74
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
75
80
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
76
81
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
77
82
|
|
|
@@ -739,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
739
744
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
740
745
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
741
746
|
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
747
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -20,8 +20,13 @@
|
|
|
20
20
|
|
|
21
21
|
`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
|
|
22
22
|
|
|
23
|
+
|
|
24
|
+
<h2 align="center">
|
|
25
|
+
The NEW front-end app for pySAR is available
|
|
26
|
+
<a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
|
|
27
|
+
</h2>
|
|
28
|
+
|
|
23
29
|
* 📖 The published research article is available [here][article].
|
|
24
|
-
* 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
|
|
25
30
|
* 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
|
|
26
31
|
* 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
|
|
27
32
|
|
|
@@ -688,4 +693,5 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
|
|
|
688
693
|
[license]: https://github.com/amckenna41/pySAR/blob/master/LICENSE
|
|
689
694
|
[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
|
|
690
695
|
[medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
|
|
691
|
-
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
696
|
+
[directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
|
|
697
|
+
[frontend]: https://pysar-app.vercel.app/
|
|
@@ -15,7 +15,7 @@ sys.path.insert(0, os.path.abspath('..'))
|
|
|
15
15
|
project = 'pySAR'
|
|
16
16
|
copyright = '2026, AJ McKenna'
|
|
17
17
|
author = 'AJ McKenna'
|
|
18
|
-
release = '2.5.1'
|
|
18
|
+
release = '2.5.2'
|
|
19
19
|
|
|
20
20
|
# -- General configuration ---------------------------------------------------
|
|
21
21
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
""" pySAR software metadata. """
|
|
2
2
|
__name__ = 'pySAR'
|
|
3
|
-
__version__ = "2.5.1"
|
|
3
|
+
__version__ = "2.5.2"
|
|
4
4
|
__description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.'
|
|
5
5
|
__author__ = 'AJ McKenna: https://github.com/amckenna41'
|
|
6
6
|
__authorEmail__ = 'amckenna41@qub.ac.uk'
|
|
@@ -13,6 +13,9 @@ __keywords__ = ["bioinformatics", "protein engineering", "python", "pypi", "mach
|
|
|
13
13
|
"directed evolution", "drug discovery", "sequence activity relationships", "SAR", "aaindex", "protpy", "protein descriptors"]
|
|
14
14
|
__test_suite__ = "tests"
|
|
15
15
|
|
|
16
|
+
from .encoding import SortKey, EncodingResult
|
|
17
|
+
from .config import PySARConfig
|
|
18
|
+
|
|
16
19
|
__all__ = [
|
|
17
20
|
'__version__',
|
|
18
21
|
'__description__',
|
|
@@ -25,4 +28,7 @@ __all__ = [
|
|
|
25
28
|
'__status__',
|
|
26
29
|
'__keywords__',
|
|
27
30
|
'__test_suite__',
|
|
31
|
+
'SortKey',
|
|
32
|
+
'EncodingResult',
|
|
33
|
+
'PySARConfig',
|
|
28
34
|
]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
################################################################################
|
|
2
|
+
################# PySARConfig #################
|
|
3
|
+
################################################################################
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class PySARConfig:
|
|
11
|
+
"""
|
|
12
|
+
Typed configuration container for PySAR and Encoding.
|
|
13
|
+
|
|
14
|
+
All parameters mirror the keys in the JSON configuration files so a
|
|
15
|
+
``PySARConfig`` instance can be used wherever a config filepath is accepted.
|
|
16
|
+
Fields left as *None* fall back to the defaults encoded in the JSON file.
|
|
17
|
+
|
|
18
|
+
Parameters
|
|
19
|
+
==========
|
|
20
|
+
:config_file: str
|
|
21
|
+
Path to the JSON configuration file. When provided all other fields
|
|
22
|
+
are used as overrides rather than replacements.
|
|
23
|
+
:dataset: str
|
|
24
|
+
Path to the CSV dataset of protein sequences and activity values.
|
|
25
|
+
:sequence_col: str
|
|
26
|
+
Name of the column in *dataset* that contains the protein sequences.
|
|
27
|
+
:activity_col: str
|
|
28
|
+
Name of the column in *dataset* that contains the activity/fitness values.
|
|
29
|
+
:algorithm: str
|
|
30
|
+
Sklearn regression algorithm name (e.g. ``'plsregression'``, ``'randomforest'``).
|
|
31
|
+
:parameters: dict
|
|
32
|
+
Keyword arguments forwarded to the sklearn model constructor.
|
|
33
|
+
:test_split: float
|
|
34
|
+
Fraction of data held back for testing (0 < test_split < 1).
|
|
35
|
+
:use_dsp: bool
|
|
36
|
+
Apply a DSP (FFT) pipeline to the AAI-encoded sequences before modelling.
|
|
37
|
+
:spectrum: str
|
|
38
|
+
Informational spectrum to use when *use_dsp* is True.
|
|
39
|
+
One of ``'power'``, ``'real'``, ``'imaginary'``, ``'absolute'``.
|
|
40
|
+
:window_type: str
|
|
41
|
+
Window function to apply before the FFT (e.g. ``'hamming'``, ``'blackman'``).
|
|
42
|
+
:filter_type: str
|
|
43
|
+
Filter to apply after the FFT (e.g. ``'savgol'``, ``'medfilt'``).
|
|
44
|
+
:descriptors_csv: str
|
|
45
|
+
Path to a pre-calculated descriptors CSV file. When provided the
|
|
46
|
+
``Descriptors`` class will import values directly rather than
|
|
47
|
+
recomputing them.
|
|
48
|
+
|
|
49
|
+
Usage
|
|
50
|
+
=====
|
|
51
|
+
>>> cfg = PySARConfig(
|
|
52
|
+
... config_file="thermostability.json",
|
|
53
|
+
... algorithm="randomforest",
|
|
54
|
+
... test_split=0.1,
|
|
55
|
+
... )
|
|
56
|
+
>>> from pySAR import PySAR
|
|
57
|
+
>>> sar = PySAR(cfg.config_file, algorithm=cfg.algorithm, test_split=cfg.test_split)
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
config_file: str = ""
|
|
61
|
+
dataset: Optional[str] = None
|
|
62
|
+
sequence_col: Optional[str] = None
|
|
63
|
+
activity_col: Optional[str] = None
|
|
64
|
+
algorithm: Optional[str] = None
|
|
65
|
+
parameters: Optional[Dict[str, Any]] = None
|
|
66
|
+
test_split: Optional[float] = None
|
|
67
|
+
use_dsp: Optional[bool] = None
|
|
68
|
+
spectrum: Optional[str] = None
|
|
69
|
+
window_type: Optional[str] = None
|
|
70
|
+
filter_type: Optional[str] = None
|
|
71
|
+
descriptors_csv: Optional[str] = None
|
|
72
|
+
|
|
73
|
+
def to_kwargs(self) -> Dict[str, Any]:
|
|
74
|
+
"""
|
|
75
|
+
Return a dict of non-None, non-config_file fields suitable for passing
|
|
76
|
+
as ``**kwargs`` to :class:`~pySAR.pySAR.PySAR` or
|
|
77
|
+
:class:`~pySAR.encoding.Encoding`.
|
|
78
|
+
|
|
79
|
+
Returns
|
|
80
|
+
=======
|
|
81
|
+
:kwargs: dict
|
|
82
|
+
Only fields that have been explicitly set (i.e. are not None) are
|
|
83
|
+
included. The ``config_file`` field is excluded since it is passed
|
|
84
|
+
as a positional argument.
|
|
85
|
+
"""
|
|
86
|
+
result: Dict[str, Any] = {}
|
|
87
|
+
for field_name in (
|
|
88
|
+
"dataset",
|
|
89
|
+
"sequence_col",
|
|
90
|
+
"activity_col",
|
|
91
|
+
"algorithm",
|
|
92
|
+
"parameters",
|
|
93
|
+
"test_split",
|
|
94
|
+
"use_dsp",
|
|
95
|
+
"spectrum",
|
|
96
|
+
"window_type",
|
|
97
|
+
"filter_type",
|
|
98
|
+
"descriptors_csv",
|
|
99
|
+
):
|
|
100
|
+
value = getattr(self, field_name)
|
|
101
|
+
if value is not None:
|
|
102
|
+
result[field_name] = value
|
|
103
|
+
return result
|
|
@@ -12,6 +12,7 @@ import itertools
|
|
|
12
12
|
import time
|
|
13
13
|
from tqdm import tqdm
|
|
14
14
|
from functools import lru_cache
|
|
15
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
15
16
|
|
|
16
17
|
from .utils import *
|
|
17
18
|
import protpy as protpy
|
|
@@ -374,13 +375,15 @@ class Descriptors():
|
|
|
374
375
|
[14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
|
|
375
376
|
vol. 33, no. 2, pp. 91–101, 2003.
|
|
376
377
|
"""
|
|
377
|
-
def __init__(self,
|
|
378
|
-
config_file: str = "",
|
|
379
|
-
protein_seqs: Optional[Union[pd.Series, str]] = None,
|
|
378
|
+
def __init__(self,
|
|
379
|
+
config_file: str = "",
|
|
380
|
+
protein_seqs: Optional[Union[pd.Series, str]] = None,
|
|
381
|
+
n_jobs: int = 1,
|
|
380
382
|
**kwargs) -> None:
|
|
381
383
|
|
|
382
384
|
self.config_file = config_file
|
|
383
385
|
self.protein_seqs = protein_seqs
|
|
386
|
+
self.n_jobs = max(1, int(n_jobs))
|
|
384
387
|
self.kwargs = locals()['kwargs'] #get any keyword argument variables of class
|
|
385
388
|
self.config_parameters = {}
|
|
386
389
|
|
|
@@ -1995,55 +1998,40 @@ class Descriptors():
|
|
|
1995
1998
|
#start time counter
|
|
1996
1999
|
start = time.time()
|
|
1997
2000
|
|
|
1998
|
-
#
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
self.moran_autocorrelation = self.get_moran_autocorrelation()
|
|
2017
|
-
|
|
2018
|
-
if (descr == "geary_autocorrelation" and getattr(self, "geary_autocorrelation").empty):
|
|
2019
|
-
self.geary_autocorrelation = self.get_geary_autocorrelation()
|
|
2020
|
-
|
|
2021
|
-
if (descr == "ctd" and getattr(self, "ctd").empty):
|
|
2022
|
-
self.ctd = self.get_ctd()
|
|
2023
|
-
|
|
2024
|
-
if (descr == "ctd_composition" and getattr(self, "ctd_composition").empty):
|
|
2025
|
-
self.ctd_composition = self.get_ctd_composition()
|
|
2026
|
-
|
|
2027
|
-
if (descr == "ctd_transition" and getattr(self, "ctd_transition").empty):
|
|
2028
|
-
self.ctd_transition = self.get_ctd_transition()
|
|
2029
|
-
|
|
2030
|
-
if (descr == "ctd_distribution" and getattr(self, "ctd_distribution").empty):
|
|
2031
|
-
self.ctd_distribution = self.get_ctd_distribution()
|
|
2032
|
-
|
|
2033
|
-
if (descr == "conjoint_triad" and getattr(self, "conjoint_triad").empty):
|
|
2034
|
-
self.conjoint_triad = self.get_conjoint_triad()
|
|
2035
|
-
|
|
2036
|
-
if (descr == "sequence_order_coupling_number" and getattr(self, "sequence_order_coupling_number").empty):
|
|
2037
|
-
self.sequence_order_coupling_number = self.get_sequence_order_coupling_number()
|
|
2038
|
-
|
|
2039
|
-
if (descr == "quasi_sequence_order" and getattr(self, "quasi_sequence_order").empty):
|
|
2040
|
-
self.quasi_sequence_order = self.get_quasi_sequence_order()
|
|
2041
|
-
|
|
2042
|
-
if (descr == "pseudo_amino_acid_composition" and getattr(self, "pseudo_amino_acid_composition").empty):
|
|
2043
|
-
self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition()
|
|
2001
|
+
#map each descriptor name to its getter for sequential and parallel dispatch
|
|
2002
|
+
_getter_map = [
|
|
2003
|
+
('amino_acid_composition', self.get_amino_acid_composition),
|
|
2004
|
+
('dipeptide_composition', self.get_dipeptide_composition),
|
|
2005
|
+
('tripeptide_composition', self.get_tripeptide_composition),
|
|
2006
|
+
('moreaubroto_autocorrelation', self.get_moreaubroto_autocorrelation),
|
|
2007
|
+
('moran_autocorrelation', self.get_moran_autocorrelation),
|
|
2008
|
+
('geary_autocorrelation', self.get_geary_autocorrelation),
|
|
2009
|
+
('ctd', self.get_ctd),
|
|
2010
|
+
('ctd_composition', self.get_ctd_composition),
|
|
2011
|
+
('ctd_transition', self.get_ctd_transition),
|
|
2012
|
+
('ctd_distribution', self.get_ctd_distribution),
|
|
2013
|
+
('conjoint_triad', self.get_conjoint_triad),
|
|
2014
|
+
('sequence_order_coupling_number', self.get_sequence_order_coupling_number),
|
|
2015
|
+
('quasi_sequence_order', self.get_quasi_sequence_order),
|
|
2016
|
+
('pseudo_amino_acid_composition', self.get_pseudo_amino_acid_composition),
|
|
2017
|
+
('amphiphilic_pseudo_amino_acid_composition', self.get_amphiphilic_pseudo_amino_acid_composition),
|
|
2018
|
+
]
|
|
2044
2019
|
|
|
2045
|
-
|
|
2046
|
-
|
|
2020
|
+
if self.n_jobs > 1:
|
|
2021
|
+
#compute descriptors concurrently; skip any already populated from a prior import
|
|
2022
|
+
pending = [(name, getter) for name, getter in _getter_map if getattr(self, name).empty]
|
|
2023
|
+
with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
|
|
2024
|
+
futures = {executor.submit(getter): name for name, getter in pending}
|
|
2025
|
+
for future in tqdm(as_completed(futures), total=len(futures), unit=" descriptor",
|
|
2026
|
+
desc="Descriptors", ncols=90):
|
|
2027
|
+
name = futures[future]
|
|
2028
|
+
setattr(self, name, future.result())
|
|
2029
|
+
else:
|
|
2030
|
+
#iterate over all descriptors sequentially, calculating each using their respective function
|
|
2031
|
+
for name, getter in tqdm(_getter_map, unit=" descriptor", position=0,
|
|
2032
|
+
desc="Descriptors", mininterval=30, ncols=90):
|
|
2033
|
+
if getattr(self, name).empty:
|
|
2034
|
+
setattr(self, name, getter())
|
|
2047
2035
|
|
|
2048
2036
|
#stop time counter, calculate elapsed time
|
|
2049
2037
|
end = time.time()
|
|
@@ -2320,13 +2308,14 @@ class Descriptors():
|
|
|
2320
2308
|
|
|
2321
2309
|
return all_descriptors
|
|
2322
2310
|
|
|
2323
|
-
def _calculate_descriptor_batch(self,
|
|
2324
|
-
descriptor_func: Callable,
|
|
2311
|
+
def _calculate_descriptor_batch(self,
|
|
2312
|
+
descriptor_func: Callable,
|
|
2325
2313
|
desc_name: str = "",
|
|
2326
2314
|
**kwargs) -> pd.DataFrame:
|
|
2327
2315
|
"""
|
|
2328
2316
|
Generic helper method to calculate descriptors for all sequences, preventing code repetition.
|
|
2329
|
-
|
|
2317
|
+
Uses self.n_jobs threads to parallelise across sequences when n_jobs > 1.
|
|
2318
|
+
|
|
2330
2319
|
Parameters
|
|
2331
2320
|
==========
|
|
2332
2321
|
:descriptor_func: Callable
|
|
@@ -2335,16 +2324,28 @@ class Descriptors():
|
|
|
2335
2324
|
Name of descriptor for progress tracking
|
|
2336
2325
|
:kwargs: dict
|
|
2337
2326
|
Additional keyword arguments to pass to descriptor function
|
|
2338
|
-
|
|
2327
|
+
|
|
2339
2328
|
Returns
|
|
2340
2329
|
=======
|
|
2341
2330
|
:pd.DataFrame
|
|
2342
2331
|
Dataframe with calculated descriptor values for all sequences
|
|
2343
2332
|
"""
|
|
2344
|
-
|
|
2333
|
+
seqs = list(self.protein_seqs)
|
|
2345
2334
|
|
|
2346
|
-
|
|
2347
|
-
|
|
2335
|
+
if self.n_jobs <= 1:
|
|
2336
|
+
iterator = tqdm(seqs, desc=f"Computing {desc_name}", ncols=90) if desc_name else seqs
|
|
2337
|
+
# accumulate results in a list to avoid O(n²) repeated concat
|
|
2338
|
+
desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
|
|
2339
|
+
else:
|
|
2340
|
+
desc_list = [None] * len(seqs)
|
|
2341
|
+
with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
|
|
2342
|
+
futures = {executor.submit(descriptor_func, seq, **kwargs): i
|
|
2343
|
+
for i, seq in enumerate(seqs)}
|
|
2344
|
+
progress = tqdm(as_completed(futures), total=len(seqs),
|
|
2345
|
+
desc=f"Computing {desc_name}", ncols=90) if desc_name else as_completed(futures)
|
|
2346
|
+
for future in progress:
|
|
2347
|
+
i = futures[future]
|
|
2348
|
+
desc_list[i] = future.result()
|
|
2348
2349
|
|
|
2349
2350
|
return pd.concat(desc_list, ignore_index=False).reset_index(drop=True)
|
|
2350
2351
|
|