PySAR 2.5.1__tar.gz → 2.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pysar-2.5.1 → pysar-2.5.2}/PKG-INFO +8 -2
  2. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/PKG-INFO +8 -2
  3. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/SOURCES.txt +1 -0
  4. {pysar-2.5.1 → pysar-2.5.2}/README.md +8 -2
  5. {pysar-2.5.1 → pysar-2.5.2}/docs/conf.py +1 -1
  6. {pysar-2.5.1 → pysar-2.5.2}/pySAR/__init__.py +7 -1
  7. pysar-2.5.2/pySAR/config.py +103 -0
  8. {pysar-2.5.1 → pysar-2.5.2}/pySAR/descriptors.py +59 -58
  9. {pysar-2.5.1 → pysar-2.5.2}/pySAR/encoding.py +240 -37
  10. {pysar-2.5.1 → pysar-2.5.2}/pySAR/evaluate.py +6 -4
  11. pysar-2.5.2/pySAR/globals_.py +38 -0
  12. {pysar-2.5.1 → pysar-2.5.2}/pySAR/model.py +157 -18
  13. {pysar-2.5.1 → pysar-2.5.2}/pySAR/plots.py +7 -4
  14. {pysar-2.5.1 → pysar-2.5.2}/pySAR/pyDSP.py +63 -108
  15. {pysar-2.5.1 → pysar-2.5.2}/pySAR/pySAR.py +523 -220
  16. {pysar-2.5.1 → pysar-2.5.2}/pySAR/utils.py +14 -10
  17. {pysar-2.5.1 → pysar-2.5.2}/pyproject.toml +1 -1
  18. {pysar-2.5.1 → pysar-2.5.2}/tests/test_descriptors.py +52 -0
  19. {pysar-2.5.1 → pysar-2.5.2}/tests/test_encoding.py +164 -11
  20. {pysar-2.5.1 → pysar-2.5.2}/tests/test_evaluate.py +3 -3
  21. {pysar-2.5.1 → pysar-2.5.2}/tests/test_model.py +130 -12
  22. {pysar-2.5.1 → pysar-2.5.2}/tests/test_plots.py +4 -4
  23. {pysar-2.5.1 → pysar-2.5.2}/tests/test_pyDSP.py +66 -1
  24. {pysar-2.5.1 → pysar-2.5.2}/tests/test_pySAR.py +208 -22
  25. {pysar-2.5.1 → pysar-2.5.2}/tests/test_utils.py +38 -13
  26. pysar-2.5.1/pySAR/globals_.py +0 -18
  27. {pysar-2.5.1 → pysar-2.5.2}/LICENSE +0 -0
  28. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/dependency_links.txt +0 -0
  29. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/not-zip-safe +0 -0
  30. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/requires.txt +0 -0
  31. {pysar-2.5.1 → pysar-2.5.2}/PySAR.egg-info/top_level.txt +0 -0
  32. {pysar-2.5.1 → pysar-2.5.2}/pySAR/py.typed +0 -0
  33. {pysar-2.5.1 → pysar-2.5.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PySAR
3
- Version: 2.5.1
3
+ Version: 2.5.2
4
4
  Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
5
5
  Author-email: AJ McKenna <amckenna41@qub.ac.uk>
6
6
  Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
@@ -70,8 +70,13 @@ Dynamic: license-file
70
70
 
71
71
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
72
72
 
73
+
74
+ <h2 align="center">
75
+ The NEW front-end app for pySAR is available
76
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
77
+ </h2>
78
+
73
79
  * 📖 The published research article is available [here][article].
74
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
75
80
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
76
81
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
77
82
 
@@ -739,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
739
744
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
740
745
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
741
746
  [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
747
+ [frontend]: https://pysar-app.vercel.app/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PySAR
3
- Version: 2.5.1
3
+ Version: 2.5.2
4
4
  Summary: Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
5
5
  Author-email: AJ McKenna <amckenna41@qub.ac.uk>
6
6
  Maintainer-email: AJ McKenna <amckenna41@qub.ac.uk>
@@ -70,8 +70,13 @@ Dynamic: license-file
70
70
 
71
71
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
72
72
 
73
+
74
+ <h2 align="center">
75
+ The NEW front-end app for pySAR is available
76
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
77
+ </h2>
78
+
73
79
  * 📖 The published research article is available [here][article].
74
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
75
80
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
76
81
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
77
82
 
@@ -739,3 +744,4 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
739
744
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
740
745
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
741
746
  [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
747
+ [frontend]: https://pysar-app.vercel.app/
@@ -9,6 +9,7 @@ PySAR.egg-info/requires.txt
9
9
  PySAR.egg-info/top_level.txt
10
10
  docs/conf.py
11
11
  pySAR/__init__.py
12
+ pySAR/config.py
12
13
  pySAR/descriptors.py
13
14
  pySAR/encoding.py
14
15
  pySAR/evaluate.py
@@ -20,8 +20,13 @@
20
20
 
21
21
  `pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences.
22
22
 
23
+
24
+ <h2 align="center">
25
+ The NEW front-end app for pySAR is available
26
+ <a href="https://pysar-app.vercel.app/" target="_blank">here</a>!
27
+ </h2>
28
+
23
29
  * 📖 The published research article is available [here][article].
24
- * 🌍 A front-end app for `pySAR` is available [here][frontend] (coming soon).
25
30
  * 💻 A quick Colab notebook demo of `pySAR` is available [here][demo].
26
31
  * 📰 A **Medium** article that dives deeper into SARs and the `pySAR` software itself is available [here][medium].
27
32
 
@@ -688,4 +693,5 @@ DOI: 10.1021/acs.jcim.0c00073 <br><br>
688
693
  [license]: https://github.com/amckenna41/pySAR/blob/master/LICENSE
689
694
  [config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md
690
695
  [medium]: https://ajmckenna69.medium.com/pysar-a3de9f71733f
691
- [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
696
+ [directed_evolution]: https://en.wikipedia.org/wiki/Directed_evolution_(protein_engineering)
697
+ [frontend]: https://pysar-app.vercel.app/
@@ -15,7 +15,7 @@ sys.path.insert(0, os.path.abspath('..'))
15
15
  project = 'pySAR'
16
16
  copyright = '2026, AJ McKenna'
17
17
  author = 'AJ McKenna'
18
- release = '2.5.1'
18
+ release = '2.5.2'
19
19
 
20
20
  # -- General configuration ---------------------------------------------------
21
21
  # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -1,6 +1,6 @@
1
1
  """ pySAR software metadata. """
2
2
  __name__ = 'pySAR'
3
- __version__ = "2.5.1"
3
+ __version__ = "2.5.2"
4
4
  __description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.'
5
5
  __author__ = 'AJ McKenna: https://github.com/amckenna41'
6
6
  __authorEmail__ = 'amckenna41@qub.ac.uk'
@@ -13,6 +13,9 @@ __keywords__ = ["bioinformatics", "protein engineering", "python", "pypi", "mach
13
13
  "directed evolution", "drug discovery", "sequence activity relationships", "SAR", "aaindex", "protpy", "protein descriptors"]
14
14
  __test_suite__ = "tests"
15
15
 
16
+ from .encoding import SortKey, EncodingResult
17
+ from .config import PySARConfig
18
+
16
19
  __all__ = [
17
20
  '__version__',
18
21
  '__description__',
@@ -25,4 +28,7 @@ __all__ = [
25
28
  '__status__',
26
29
  '__keywords__',
27
30
  '__test_suite__',
31
+ 'SortKey',
32
+ 'EncodingResult',
33
+ 'PySARConfig',
28
34
  ]
@@ -0,0 +1,103 @@
1
+ ################################################################################
2
+ ################# PySARConfig #################
3
+ ################################################################################
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+
9
+ @dataclass
10
+ class PySARConfig:
11
+ """
12
+ Typed configuration container for PySAR and Encoding.
13
+
14
+ All parameters mirror the keys in the JSON configuration files so a
15
+ ``PySARConfig`` instance can be used wherever a config filepath is accepted.
16
+ Fields left as *None* fall back to the defaults encoded in the JSON file.
17
+
18
+ Parameters
19
+ ==========
20
+ :config_file: str
21
+ Path to the JSON configuration file. When provided all other fields
22
+ are used as overrides rather than replacements.
23
+ :dataset: str
24
+ Path to the CSV dataset of protein sequences and activity values.
25
+ :sequence_col: str
26
+ Name of the column in *dataset* that contains the protein sequences.
27
+ :activity_col: str
28
+ Name of the column in *dataset* that contains the activity/fitness values.
29
+ :algorithm: str
30
+ Sklearn regression algorithm name (e.g. ``'plsregression'``, ``'randomforest'``).
31
+ :parameters: dict
32
+ Keyword arguments forwarded to the sklearn model constructor.
33
+ :test_split: float
34
+ Fraction of data held back for testing (0 < test_split < 1).
35
+ :use_dsp: bool
36
+ Apply a DSP (FFT) pipeline to the AAI-encoded sequences before modelling.
37
+ :spectrum: str
38
+ Informational spectrum to use when *use_dsp* is True.
39
+ One of ``'power'``, ``'real'``, ``'imaginary'``, ``'absolute'``.
40
+ :window_type: str
41
+ Window function to apply before the FFT (e.g. ``'hamming'``, ``'blackman'``).
42
+ :filter_type: str
43
+ Filter to apply after the FFT (e.g. ``'savgol'``, ``'medfilt'``).
44
+ :descriptors_csv: str
45
+ Path to a pre-calculated descriptors CSV file. When provided the
46
+ ``Descriptors`` class will import values directly rather than
47
+ recomputing them.
48
+
49
+ Usage
50
+ =====
51
+ >>> cfg = PySARConfig(
52
+ ... config_file="thermostability.json",
53
+ ... algorithm="randomforest",
54
+ ... test_split=0.1,
55
+ ... )
56
+ >>> from pySAR import PySAR
57
+ >>> sar = PySAR(cfg.config_file, algorithm=cfg.algorithm, test_split=cfg.test_split)
58
+ """
59
+
60
+ config_file: str = ""
61
+ dataset: Optional[str] = None
62
+ sequence_col: Optional[str] = None
63
+ activity_col: Optional[str] = None
64
+ algorithm: Optional[str] = None
65
+ parameters: Optional[Dict[str, Any]] = None
66
+ test_split: Optional[float] = None
67
+ use_dsp: Optional[bool] = None
68
+ spectrum: Optional[str] = None
69
+ window_type: Optional[str] = None
70
+ filter_type: Optional[str] = None
71
+ descriptors_csv: Optional[str] = None
72
+
73
+ def to_kwargs(self) -> Dict[str, Any]:
74
+ """
75
+ Return a dict of non-None, non-config_file fields suitable for passing
76
+ as ``**kwargs`` to :class:`~pySAR.pySAR.PySAR` or
77
+ :class:`~pySAR.encoding.Encoding`.
78
+
79
+ Returns
80
+ =======
81
+ :kwargs: dict
82
+ Only fields that have been explicitly set (i.e. are not None) are
83
+ included. The ``config_file`` field is excluded since it is passed
84
+ as a positional argument.
85
+ """
86
+ result: Dict[str, Any] = {}
87
+ for field_name in (
88
+ "dataset",
89
+ "sequence_col",
90
+ "activity_col",
91
+ "algorithm",
92
+ "parameters",
93
+ "test_split",
94
+ "use_dsp",
95
+ "spectrum",
96
+ "window_type",
97
+ "filter_type",
98
+ "descriptors_csv",
99
+ ):
100
+ value = getattr(self, field_name)
101
+ if value is not None:
102
+ result[field_name] = value
103
+ return result
@@ -12,6 +12,7 @@ import itertools
12
12
  import time
13
13
  from tqdm import tqdm
14
14
  from functools import lru_cache
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
16
 
16
17
  from .utils import *
17
18
  import protpy as protpy
@@ -374,13 +375,15 @@ class Descriptors():
374
375
  [14] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,” J. Math. Chem.,
375
376
  vol. 33, no. 2, pp. 91–101, 2003.
376
377
  """
377
- def __init__(self,
378
- config_file: str = "",
379
- protein_seqs: Optional[Union[pd.Series, str]] = None,
378
+ def __init__(self,
379
+ config_file: str = "",
380
+ protein_seqs: Optional[Union[pd.Series, str]] = None,
381
+ n_jobs: int = 1,
380
382
  **kwargs) -> None:
381
383
 
382
384
  self.config_file = config_file
383
385
  self.protein_seqs = protein_seqs
386
+ self.n_jobs = max(1, int(n_jobs))
384
387
  self.kwargs = locals()['kwargs'] #get any keyword argument variables of class
385
388
  self.config_parameters = {}
386
389
 
@@ -1995,55 +1998,40 @@ class Descriptors():
1995
1998
  #start time counter
1996
1999
  start = time.time()
1997
2000
 
1998
- #iterate over all descriptors, calculating each using their respective function and the protpy package
1999
- for descr in tqdm(self.all_descriptors_list(), unit=" descriptor", position=0,
2000
- desc="Descriptors", mininterval=30, ncols=90):
2001
-
2002
- #if descriptor attribute DF is empty then call its respective get_descriptor function
2003
- if (descr == "amino_acid_composition" and getattr(self, "amino_acid_composition").empty):
2004
- self.amino_acid_composition = self.get_amino_acid_composition()
2005
-
2006
- if (descr == "dipeptide_composition" and getattr(self, "dipeptide_composition").empty):
2007
- self.dipeptide_composition = self.get_dipeptide_composition()
2008
-
2009
- if (descr == "tripeptide_composition" and getattr(self, "tripeptide_composition").empty):
2010
- self.tripeptide_composition = self.get_tripeptide_composition()
2011
-
2012
- if (descr == "moreaubroto_autocorrelation" and getattr(self, "moreaubroto_autocorrelation").empty):
2013
- self.moreaubroto_autocorrelation = self.get_moreaubroto_autocorrelation()
2014
-
2015
- if (descr == "moran_autocorrelation" and getattr(self, "moran_autocorrelation").empty):
2016
- self.moran_autocorrelation = self.get_moran_autocorrelation()
2017
-
2018
- if (descr == "geary_autocorrelation" and getattr(self, "geary_autocorrelation").empty):
2019
- self.geary_autocorrelation = self.get_geary_autocorrelation()
2020
-
2021
- if (descr == "ctd" and getattr(self, "ctd").empty):
2022
- self.ctd = self.get_ctd()
2023
-
2024
- if (descr == "ctd_composition" and getattr(self, "ctd_composition").empty):
2025
- self.ctd_composition = self.get_ctd_composition()
2026
-
2027
- if (descr == "ctd_transition" and getattr(self, "ctd_transition").empty):
2028
- self.ctd_transition = self.get_ctd_transition()
2029
-
2030
- if (descr == "ctd_distribution" and getattr(self, "ctd_distribution").empty):
2031
- self.ctd_distribution = self.get_ctd_distribution()
2032
-
2033
- if (descr == "conjoint_triad" and getattr(self, "conjoint_triad").empty):
2034
- self.conjoint_triad = self.get_conjoint_triad()
2035
-
2036
- if (descr == "sequence_order_coupling_number" and getattr(self, "sequence_order_coupling_number").empty):
2037
- self.sequence_order_coupling_number = self.get_sequence_order_coupling_number()
2038
-
2039
- if (descr == "quasi_sequence_order" and getattr(self, "quasi_sequence_order").empty):
2040
- self.quasi_sequence_order = self.get_quasi_sequence_order()
2041
-
2042
- if (descr == "pseudo_amino_acid_composition" and getattr(self, "pseudo_amino_acid_composition").empty):
2043
- self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition()
2001
+ #map each descriptor name to its getter for sequential and parallel dispatch
2002
+ _getter_map = [
2003
+ ('amino_acid_composition', self.get_amino_acid_composition),
2004
+ ('dipeptide_composition', self.get_dipeptide_composition),
2005
+ ('tripeptide_composition', self.get_tripeptide_composition),
2006
+ ('moreaubroto_autocorrelation', self.get_moreaubroto_autocorrelation),
2007
+ ('moran_autocorrelation', self.get_moran_autocorrelation),
2008
+ ('geary_autocorrelation', self.get_geary_autocorrelation),
2009
+ ('ctd', self.get_ctd),
2010
+ ('ctd_composition', self.get_ctd_composition),
2011
+ ('ctd_transition', self.get_ctd_transition),
2012
+ ('ctd_distribution', self.get_ctd_distribution),
2013
+ ('conjoint_triad', self.get_conjoint_triad),
2014
+ ('sequence_order_coupling_number', self.get_sequence_order_coupling_number),
2015
+ ('quasi_sequence_order', self.get_quasi_sequence_order),
2016
+ ('pseudo_amino_acid_composition', self.get_pseudo_amino_acid_composition),
2017
+ ('amphiphilic_pseudo_amino_acid_composition', self.get_amphiphilic_pseudo_amino_acid_composition),
2018
+ ]
2044
2019
 
2045
- if (descr == "amphiphilic_pseudo_amino_acid_composition" and getattr(self, "amphiphilic_pseudo_amino_acid_composition").empty):
2046
- self.amphiphilic_pseudo_amino_acid_composition = self.get_amphiphilic_pseudo_amino_acid_composition()
2020
+ if self.n_jobs > 1:
2021
+ #compute descriptors concurrently; skip any already populated from a prior import
2022
+ pending = [(name, getter) for name, getter in _getter_map if getattr(self, name).empty]
2023
+ with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
2024
+ futures = {executor.submit(getter): name for name, getter in pending}
2025
+ for future in tqdm(as_completed(futures), total=len(futures), unit=" descriptor",
2026
+ desc="Descriptors", ncols=90):
2027
+ name = futures[future]
2028
+ setattr(self, name, future.result())
2029
+ else:
2030
+ #iterate over all descriptors sequentially, calculating each using their respective function
2031
+ for name, getter in tqdm(_getter_map, unit=" descriptor", position=0,
2032
+ desc="Descriptors", mininterval=30, ncols=90):
2033
+ if getattr(self, name).empty:
2034
+ setattr(self, name, getter())
2047
2035
 
2048
2036
  #stop time counter, calculate elapsed time
2049
2037
  end = time.time()
@@ -2320,13 +2308,14 @@ class Descriptors():
2320
2308
 
2321
2309
  return all_descriptors
2322
2310
 
2323
- def _calculate_descriptor_batch(self,
2324
- descriptor_func: Callable,
2311
+ def _calculate_descriptor_batch(self,
2312
+ descriptor_func: Callable,
2325
2313
  desc_name: str = "",
2326
2314
  **kwargs) -> pd.DataFrame:
2327
2315
  """
2328
2316
  Generic helper method to calculate descriptors for all sequences, preventing code repetition.
2329
-
2317
+ Uses self.n_jobs threads to parallelise across sequences when n_jobs > 1.
2318
+
2330
2319
  Parameters
2331
2320
  ==========
2332
2321
  :descriptor_func: Callable
@@ -2335,16 +2324,28 @@ class Descriptors():
2335
2324
  Name of descriptor for progress tracking
2336
2325
  :kwargs: dict
2337
2326
  Additional keyword arguments to pass to descriptor function
2338
-
2327
+
2339
2328
  Returns
2340
2329
  =======
2341
2330
  :pd.DataFrame
2342
2331
  Dataframe with calculated descriptor values for all sequences
2343
2332
  """
2344
- iterator = tqdm(self.protein_seqs, desc=f"Computing {desc_name}") if desc_name else self.protein_seqs
2333
+ seqs = list(self.protein_seqs)
2345
2334
 
2346
- # accumulate results in a list to avoid O(n²) repeated concat
2347
- desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
2335
+ if self.n_jobs <= 1:
2336
+ iterator = tqdm(seqs, desc=f"Computing {desc_name}", ncols=90) if desc_name else seqs
2337
+ # accumulate results in a list to avoid O(n²) repeated concat
2338
+ desc_list = [descriptor_func(seq, **kwargs) for seq in iterator]
2339
+ else:
2340
+ desc_list = [None] * len(seqs)
2341
+ with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:
2342
+ futures = {executor.submit(descriptor_func, seq, **kwargs): i
2343
+ for i, seq in enumerate(seqs)}
2344
+ progress = tqdm(as_completed(futures), total=len(seqs),
2345
+ desc=f"Computing {desc_name}", ncols=90) if desc_name else as_completed(futures)
2346
+ for future in progress:
2347
+ i = futures[future]
2348
+ desc_list[i] = future.result()
2348
2349
 
2349
2350
  return pd.concat(desc_list, ignore_index=False).reset_index(drop=True)
2350
2351