structuredca 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structuredca-1.0.0/LICENSE +24 -0
- structuredca-1.0.0/MANIFEST.in +20 -0
- structuredca-1.0.0/PKG-INFO +151 -0
- structuredca-1.0.0/README.md +131 -0
- structuredca-1.0.0/pyproject.toml +41 -0
- structuredca-1.0.0/setup.cfg +4 -0
- structuredca-1.0.0/setup.py +45 -0
- structuredca-1.0.0/structuredca/__init__.py +20 -0
- structuredca-1.0.0/structuredca/aligner/__init__.py +2 -0
- structuredca-1.0.0/structuredca/aligner/linear_extrapolation.py +87 -0
- structuredca-1.0.0/structuredca/aligner/structure_sequence_alignment.py +353 -0
- structuredca-1.0.0/structuredca/cli.py +233 -0
- structuredca-1.0.0/structuredca/dca_model/__init__.py +2 -0
- structuredca-1.0.0/structuredca/dca_model/data_structures/__init__.py +2 -0
- structuredca-1.0.0/structuredca/dca_model/data_structures/sparse_J.py +232 -0
- structuredca-1.0.0/structuredca/dca_model/data_structures/sparse_matrix.py +83 -0
- structuredca-1.0.0/structuredca/dca_model/dca_model.py +757 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/__init__.py +2 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/dca_solver.py +118 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/__init__.py +1 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/include/plmdca.h +126 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_ansi.h +133 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_sse_double.h +294 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_sse_float.h +298 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/lbfgs.h +747 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/lib/lbfgs.cpp +1374 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdca.cpp +768 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdca.py +213 -0
- structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdcaBackend.cpp +278 -0
- structuredca-1.0.0/structuredca/dca_model/gauge.py +183 -0
- structuredca-1.0.0/structuredca/sequence/__init__.py +6 -0
- structuredca-1.0.0/structuredca/sequence/amino_acid.py +256 -0
- structuredca-1.0.0/structuredca/sequence/fasta_reader.py +155 -0
- structuredca-1.0.0/structuredca/sequence/msa.py +159 -0
- structuredca-1.0.0/structuredca/sequence/mutation.py +98 -0
- structuredca-1.0.0/structuredca/sequence/pairwise_alignment.py +300 -0
- structuredca-1.0.0/structuredca/sequence/sequence.py +172 -0
- structuredca-1.0.0/structuredca/structure/__init__.py +2 -0
- structuredca-1.0.0/structuredca/structure/residue.py +47 -0
- structuredca-1.0.0/structuredca/structure/rsa/__init__.py +2 -0
- structuredca-1.0.0/structuredca/structure/rsa/rsa_biopython.py +99 -0
- structuredca-1.0.0/structuredca/structure/rsa/rsa_solver.py +117 -0
- structuredca-1.0.0/structuredca/structure/structure.py +559 -0
- structuredca-1.0.0/structuredca/structuredca.py +673 -0
- structuredca-1.0.0/structuredca/utils/__init__.py +2 -0
- structuredca-1.0.0/structuredca/utils/logger.py +63 -0
- structuredca-1.0.0/structuredca/utils/utils.py +47 -0
- structuredca-1.0.0/structuredca.egg-info/SOURCES.txt +45 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c), 2026, Matsvei Tsishyn
|
|
4
|
+
Copyright (c), 2026, Hugo Talibart
|
|
5
|
+
Copyright (c), 2026, Marianne Rooman
|
|
6
|
+
Copyright (c), 2026, Fabrizio Pucci
|
|
7
|
+
|
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
9
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
10
|
+
in the Software without restriction, including without limitation the rights
|
|
11
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
12
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
13
|
+
furnished to do so, subject to the following conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice shall be included in all
|
|
16
|
+
copies or substantial portions of the Software.
|
|
17
|
+
|
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
24
|
+
SOFTWARE.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Include files required for building C++ extension
|
|
2
|
+
include structuredca/dca_model/dca_solvers/plmdca/include/*
|
|
3
|
+
include structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/*
|
|
4
|
+
recursive-include structuredca/dca_model/dca_solvers/plmdca *.cpp
|
|
5
|
+
|
|
6
|
+
# Exclude files and directories that should not be in the package
|
|
7
|
+
exclude Logo.png
|
|
8
|
+
exclude conda-env.yml
|
|
9
|
+
exclude test_data/*
|
|
10
|
+
exclude tutorials/*
|
|
11
|
+
global-exclude *.py[cod]
|
|
12
|
+
global-exclude __pycache__/*
|
|
13
|
+
global-exclude 0_*
|
|
14
|
+
global-exclude *.so
|
|
15
|
+
global-exclude *.a
|
|
16
|
+
global-exclude *.o
|
|
17
|
+
|
|
18
|
+
# Exclude build artifacts
|
|
19
|
+
global-exclude structuredca/dca_model/dca_solvers/plmdca/build/*
|
|
20
|
+
global-exclude *.egg-info/*
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: structuredca
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Structure-Informed Direct Couplings Analysis.
|
|
5
|
+
Author-email: Matsvei Tsishyn <matsvei.tsishyn@protonmail.com>, Hugo Talibart <hugo.talibart@protonmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/3BioCompBio/StructureDCA
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: C++
|
|
10
|
+
Classifier: Programming Language :: C
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: biopython>=1.75
|
|
18
|
+
Requires-Dist: littlecsv
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# StructureDCA
|
|
23
|
+
|
|
24
|
+
[](https://pypi.org/project/structuredca/) [](https://opensource.org/licenses/MIT) [](
|
|
25
|
+
https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb)
|
|
26
|
+
<div style="text-align: center;">
|
|
27
|
+
<img src="Logo.png" alt="[StructureDCA Logo]" height="400"/>
|
|
28
|
+
</div>
|
|
29
|
+
|
|
30
|
+
The `structuredca` Python package implements **Structure-Informed Direct Coupling Analysis** (StructureDCA) to predict the **effects of missense mutations on proteins**.
|
|
31
|
+
|
|
32
|
+
Standard DCA methods use **Multiple Sequence Alignments (MSAs)** to build a **statistical evolutionary model** of homologous protein families. They rely on single-site fields `h` and pairwise couplings `J` that capture co-evolution between residue positions.
|
|
33
|
+
StructureDCA extends this framework by incorporating the **residue–residue contact map** derived from the protein **3D structure** to infer a **sparse DCA model**, in which couplings between spatially distant residue pairs are removed.
|
|
34
|
+
This approach leverages the observation that functionally relevant, co-evolving residues are most often structurally in contact.
|
|
35
|
+
|
|
36
|
+
The package includes a **pseudolikelihood-maximization DCA solver** capable of inferring sparse DCA models, where selected coupling coefficients `Jij` are constrained to zero.
|
|
37
|
+
StructureDCA combines a flexible, user-friendly Python interface with the high computational efficiency of its C++ backend.
|
|
38
|
+
This model was initially developed to improve classical DCA methods for predicting the effects of missense mutations in proteins. However, StructureDCA can be applied to any DCA-based analysis (except for contact predictions...) and provides the full functionality of both standard and sparse DCA models.
|
|
39
|
+
|
|
40
|
+
**Please cite**:
|
|
41
|
+
- [Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. Inferring Protein Mutational Landscape with Structure-Informed Direct Coupling Analysis. BioRxiv](https://www.biorxiv.org/).
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## Installation and Usage
|
|
45
|
+
|
|
46
|
+
### Colab Notebook
|
|
47
|
+
|
|
48
|
+
You can instantly try StructureDCA in this [Colab Notebook](https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb). This notebook acts as a **user-friendly web server** / **graphical interface**, offering helpers to **automatically fetch or generate the MSA and 3D structure** for your target protein.
|
|
49
|
+
You can then **visualize** your mutational landscape predictions as a DMS heatmap or mapped to the 3D structure.
|
|
50
|
+
|
|
51
|
+
### Installation
|
|
52
|
+
Installation with `pip`:
|
|
53
|
+
```bash
|
|
54
|
+
pip install structuredca
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### CLI usage
|
|
58
|
+
Use StructureDCA with a Command Line Interface (CLI).
|
|
59
|
+
For example, from the directory `./test_data/`, run:
|
|
60
|
+
```bash
|
|
61
|
+
structuredca ./6acv_A_29-94.fasta ./6acv_A_29-94.pdb A -o ./6acv_A_29-94_structuredca.csv
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
To show CLI usage and optional arguments, run:
|
|
65
|
+
```bash
|
|
66
|
+
structuredca --help
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Python usage
|
|
70
|
+
Make sure the first sequence in your MSA file is the target sequence to mutate (otherwise have a look at tutorial 1).
|
|
71
|
+
From directory `./test_data/` execute the following Python code:
|
|
72
|
+
```python
|
|
73
|
+
# Import
|
|
74
|
+
from structuredca import StructureDCA
|
|
75
|
+
|
|
76
|
+
# Log basic usage and arguments
|
|
77
|
+
StructureDCA.help()
|
|
78
|
+
|
|
79
|
+
# Initialize StructureDCA model
|
|
80
|
+
sdca = StructureDCA(
|
|
81
|
+
msa_path='./6acv_A_29-94.fasta',
|
|
82
|
+
pdb_path='./6acv_A_29-94.pdb', chains='A',
|
|
83
|
+
use_contacts_plddt_filter=False, # use only if 3D structure is an AlphaFold model (or similar) to remove low pLDDT regions from contacts
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Evaluate the evolutionary energy difference (ΔE) of mutations
|
|
87
|
+
# scores can be reweighted by Relative Solvent Accessibility-complement (RSAc) -> advised to predict stability changes (ΔΔG)
|
|
88
|
+
# * dE = 0 means neutral mutation
|
|
89
|
+
# * dE >> 0 means destabilizing / deleterious mutation
|
|
90
|
+
dE_mut1 = sdca.eval_mutation('K13H', reweight_by_rsa=True)
|
|
91
|
+
dE_mut2 = sdca.eval_mutation('K13H:K12G', reweight_by_rsa=True)
|
|
92
|
+
|
|
93
|
+
# Evaluate ΔE of all single mutations and save results to a file
|
|
94
|
+
dE_all = sdca.eval_mutations_table(
|
|
95
|
+
save_path='./6acv_A_29-94_structuredca.csv',
|
|
96
|
+
log_output_sample=True,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Evaluate absolute evolutionary energy (E) of a sequence
|
|
100
|
+
seq_to_evaluate = 'A' * sdca.msa_length # arbitrary example: AAAAA...
|
|
101
|
+
E_only_alanine = sdca.eval_sequence(seq_to_evaluate, reweight_by_rsa=True)
|
|
102
|
+
|
|
103
|
+
# Evaluate relative probabilities for the 20 Amino Acids at this position given a background sequence
|
|
104
|
+
fasta_position = 10 # as in FASTA index system (starts at 1)
|
|
105
|
+
array_position = fasta_position - 1 # As in a Python array (starts at 0)
|
|
106
|
+
amino_acid_probabilities = sdca.position_probabilities(array_position) # P(a) = e^{-dE(wt→a)} / ∑_b e^{-dE(wt→b)}
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Tutorials and Advanced Usage
|
|
110
|
+
In the `./tutorials/` directory, we provide a series of Jupyter notebooks that illustrate different ways to use **StructureDCA**:
|
|
111
|
+
|
|
112
|
+
1. **Basics and arguments** (`1_sdca-basics.ipynb`): basics, evaluating effects of mutations with StructureDCA, using optional arguments (like `distance_cutoff` or `lambda_h` / `lambda_J`), evaluate mutations with an alternative background sequence.
|
|
113
|
+
|
|
114
|
+
2. **Access properties** (`2_sdca-properties.ipynb`): access StructureDCA coefficients and properties (like fields `h`, couplings `J`, Frobenius norms, residue-residue distance matrix, contact map, ...).
|
|
115
|
+
|
|
116
|
+
3. **Standard DCA** (`3_sdca-standard-dca.ipynb`): solve standard (fully connected) DCA models and run without protein 3D structure.
|
|
117
|
+
|
|
118
|
+
4. **Protein–Protein Interactions** (`4_sdca-ppis.ipynb`): working with protein–protein interactions (PPIs).
|
|
119
|
+
Compute RSA from the biologically relevant conformation, include inter-chain contacts arising from homomers, and build a StructureDCA model from a concatenated MSA of a heteromer PPI of highly coevolving proteins.
|
|
120
|
+
|
|
121
|
+
5. **Custom contacts** (`5_sdca-custom-contacts.ipynb`): build a StructureDCA model with custom contact map (instead of the default distance criteria) and custom weights for StructureDCA[RSA] (instead of default RSA-based weights) to derive any possible sparse DCA model.
|
|
122
|
+
|
|
123
|
+
## Build and Installation Notes
|
|
124
|
+
|
|
125
|
+
### Requirements
|
|
126
|
+
- Python 3.9 or later
|
|
127
|
+
- Python packages `numpy`, `biopython` (version 1.75 or later) and `littlecsv`
|
|
128
|
+
- A C++ compiler that supports C++17 and OpenMP (such as GCC, LLVM or MSVC).
|
|
129
|
+
|
|
130
|
+
### Troubleshooting macOS build errors
|
|
131
|
+
StructureDCA uses OpenMP for parallel computation in C++.
|
|
132
|
+
Since macOS `Clang` compiler does not include OpenMP support by default, macOS users may need to install and configure the `LLVM` compiler to build the package.
|
|
133
|
+
|
|
134
|
+
If you are using macOS and encounter compilation errors related to `-fopenmp`, please follow these steps:
|
|
135
|
+
|
|
136
|
+
- Install `Homebrew`
|
|
137
|
+
- Install `LLVM` and `OpenMP` runtime and configure Python to use this compiler. Then try to re-install StructureDCA. Run something like (paths may slightly differ):
|
|
138
|
+
```bash
|
|
139
|
+
brew install llvm libomp
|
|
140
|
+
export CC=/usr/local/opt/llvm/bin/clang
|
|
141
|
+
export CXX=/usr/local/opt/llvm/bin/clang++
|
|
142
|
+
export CPPFLAGS="-I/usr/local/opt/libomp/include"
|
|
143
|
+
export LDFLAGS="-L/usr/local/opt/libomp/lib -lomp"
|
|
144
|
+
pip install structuredca
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
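Note: this fix has not been exhaustively tested, but building with Homebrew's `LLVM` and `OpenMP` is known to be possible. On Apple Silicon, Homebrew installs under `/opt/homebrew` rather than `/usr/local`, so adjust the paths above accordingly.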
|
|
148
|
+
|
|
149
|
+
## Credits
|
|
150
|
+
- For inferring the DCA coefficients, StructureDCA uses a gradient-based quasi-Newton solver: [L-BFGS](https://github.com/chokkan/liblbfgs "libLBFGS") by Naoaki Okazaki (which is included in this repo).
|
|
151
|
+
- The part of the code that makes the bridge between Python and C++ is inspired by the [plmDCA implementation 'pycofitness'](https://github.com/KIT-MBS/pycofitness/) by Mehari B. Zerihun and Fabrizio Pucci.
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
|
|
2
|
+
# StructureDCA
|
|
3
|
+
|
|
4
|
+
[](https://pypi.org/project/structuredca/) [](https://opensource.org/licenses/MIT) [](
|
|
5
|
+
https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb)
|
|
6
|
+
<div style="text-align: center;">
|
|
7
|
+
<img src="Logo.png" alt="[StructureDCA Logo]" height="400"/>
|
|
8
|
+
</div>
|
|
9
|
+
|
|
10
|
+
The `structuredca` Python package implements **Structure-Informed Direct Coupling Analysis** (StructureDCA) to predict the **effects of missense mutations on proteins**.
|
|
11
|
+
|
|
12
|
+
Standard DCA methods use **Multiple Sequence Alignments (MSAs)** to build a **statistical evolutionary model** of homologous protein families. They rely on single-site fields `h` and pairwise couplings `J` that capture co-evolution between residue positions.
|
|
13
|
+
StructureDCA extends this framework by incorporating the **residue–residue contact map** derived from the protein **3D structure** to infer a **sparse DCA model**, in which couplings between spatially distant residue pairs are removed.
|
|
14
|
+
This approach leverages the observation that functionally relevant, co-evolving residues are most often structurally in contact.
|
|
15
|
+
|
|
16
|
+
The package includes a **pseudolikelihood-maximization DCA solver** capable of inferring sparse DCA models, where selected coupling coefficients `Jij` are constrained to zero.
|
|
17
|
+
StructureDCA combines a flexible, user-friendly Python interface with the high computational efficiency of its C++ backend.
|
|
18
|
+
This model was initially developed to improve classical DCA methods for predicting the effects of missense mutations in proteins. However, StructureDCA can be applied to any DCA-based analysis (except for contact predictions...) and provides the full functionality of both standard and sparse DCA models.
|
|
19
|
+
|
|
20
|
+
**Please cite**:
|
|
21
|
+
- [Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. Inferring Protein Mutational Landscape with Structure-Informed Direct Coupling Analysis. BioRxiv](https://www.biorxiv.org/).
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## Installation and Usage
|
|
25
|
+
|
|
26
|
+
### Colab Notebook
|
|
27
|
+
|
|
28
|
+
You can instantly try StructureDCA in this [Colab Notebook](https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb). This notebook acts as a **user-friendly web server** / **graphical interface**, offering helpers to **automatically fetch or generate the MSA and 3D structure** for your target protein.
|
|
29
|
+
You can then **visualize** your mutational landscape predictions as a DMS heatmap or mapped to the 3D structure.
|
|
30
|
+
|
|
31
|
+
### Installation
|
|
32
|
+
Installation with `pip`:
|
|
33
|
+
```bash
|
|
34
|
+
pip install structuredca
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### CLI usage
|
|
38
|
+
Use StructureDCA with a Command Line Interface (CLI).
|
|
39
|
+
For example, from the directory `./test_data/`, run:
|
|
40
|
+
```bash
|
|
41
|
+
structuredca ./6acv_A_29-94.fasta ./6acv_A_29-94.pdb A -o ./6acv_A_29-94_structuredca.csv
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
To show CLI usage and optional arguments, run:
|
|
45
|
+
```bash
|
|
46
|
+
structuredca --help
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Python usage
|
|
50
|
+
Make sure the first sequence in your MSA file is the target sequence to mutate (otherwise have a look at tutorial 1).
|
|
51
|
+
From directory `./test_data/` execute the following Python code:
|
|
52
|
+
```python
|
|
53
|
+
# Import
|
|
54
|
+
from structuredca import StructureDCA
|
|
55
|
+
|
|
56
|
+
# Log basic usage and arguments
|
|
57
|
+
StructureDCA.help()
|
|
58
|
+
|
|
59
|
+
# Initialize StructureDCA model
|
|
60
|
+
sdca = StructureDCA(
|
|
61
|
+
msa_path='./6acv_A_29-94.fasta',
|
|
62
|
+
pdb_path='./6acv_A_29-94.pdb', chains='A',
|
|
63
|
+
use_contacts_plddt_filter=False, # use only if 3D structure is an AlphaFold model (or similar) to remove low pLDDT regions from contacts
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# Evaluate the evolutionary energy difference (ΔE) of mutations
|
|
67
|
+
# scores can be reweighted by Relative Solvent Accessibility-complement (RSAc) -> advised to predict stability changes (ΔΔG)
|
|
68
|
+
# * dE = 0 means neutral mutation
|
|
69
|
+
# * dE >> 0 means destabilizing / deleterious mutation
|
|
70
|
+
dE_mut1 = sdca.eval_mutation('K13H', reweight_by_rsa=True)
|
|
71
|
+
dE_mut2 = sdca.eval_mutation('K13H:K12G', reweight_by_rsa=True)
|
|
72
|
+
|
|
73
|
+
# Evaluate ΔE of all single mutations and save results to a file
|
|
74
|
+
dE_all = sdca.eval_mutations_table(
|
|
75
|
+
save_path='./6acv_A_29-94_structuredca.csv',
|
|
76
|
+
log_output_sample=True,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Evaluate absolute evolutionary energy (E) of a sequence
|
|
80
|
+
seq_to_evaluate = 'A' * sdca.msa_length # arbitrary example: AAAAA...
|
|
81
|
+
E_only_alanine = sdca.eval_sequence(seq_to_evaluate, reweight_by_rsa=True)
|
|
82
|
+
|
|
83
|
+
# Evaluate relative probabilities for the 20 Amino Acids at this position given a background sequence
|
|
84
|
+
fasta_position = 10 # as in FASTA index system (starts at 1)
|
|
85
|
+
array_position = fasta_position - 1 # As in a Python array (starts at 0)
|
|
86
|
+
amino_acid_probabilities = sdca.position_probabilities(array_position) # P(a) = e^{-dE(wt→a)} / ∑_b e^{-dE(wt→b)}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Tutorials and Advanced Usage
|
|
90
|
+
In the `./tutorials/` directory, we provide a series of Jupyter notebooks that illustrate different ways to use **StructureDCA**:
|
|
91
|
+
|
|
92
|
+
1. **Basics and arguments** (`1_sdca-basics.ipynb`): basics, evaluating effects of mutations with StructureDCA, using optional arguments (like `distance_cutoff` or `lambda_h` / `lambda_J`), evaluate mutations with an alternative background sequence.
|
|
93
|
+
|
|
94
|
+
2. **Access properties** (`2_sdca-properties.ipynb`): access StructureDCA coefficients and properties (like fields `h`, couplings `J`, Frobenius norms, residue-residue distance matrix, contact map, ...).
|
|
95
|
+
|
|
96
|
+
3. **Standard DCA** (`3_sdca-standard-dca.ipynb`): solve standard (fully connected) DCA models and run without protein 3D structure.
|
|
97
|
+
|
|
98
|
+
4. **Protein–Protein Interactions** (`4_sdca-ppis.ipynb`): working with protein–protein interactions (PPIs).
|
|
99
|
+
Compute RSA from the biologically relevant conformation, include inter-chain contacts arising from homomers, and build a StructureDCA model from a concatenated MSA of a heteromer PPI of highly coevolving proteins.
|
|
100
|
+
|
|
101
|
+
5. **Custom contacts** (`5_sdca-custom-contacts.ipynb`): build a StructureDCA model with custom contact map (instead of the default distance criteria) and custom weights for StructureDCA[RSA] (instead of default RSA-based weights) to derive any possible sparse DCA model.
|
|
102
|
+
|
|
103
|
+
## Build and Installation Notes
|
|
104
|
+
|
|
105
|
+
### Requirements
|
|
106
|
+
- Python 3.9 or later
|
|
107
|
+
- Python packages `numpy`, `biopython` (version 1.75 or later) and `littlecsv`
|
|
108
|
+
- A C++ compiler that supports C++17 and OpenMP (such as GCC, LLVM or MSVC).
|
|
109
|
+
|
|
110
|
+
### Troubleshooting macOS build errors
|
|
111
|
+
StructureDCA uses OpenMP for parallel computation in C++.
|
|
112
|
+
Since macOS `Clang` compiler does not include OpenMP support by default, macOS users may need to install and configure the `LLVM` compiler to build the package.
|
|
113
|
+
|
|
114
|
+
If you are using macOS and encounter compilation errors related to `-fopenmp`, please follow these steps:
|
|
115
|
+
|
|
116
|
+
- Install `Homebrew`
|
|
117
|
+
- Install `LLVM` and `OpenMP` runtime and configure Python to use this compiler. Then try to re-install StructureDCA. Run something like (paths may slightly differ):
|
|
118
|
+
```bash
|
|
119
|
+
brew install llvm libomp
|
|
120
|
+
export CC=/usr/local/opt/llvm/bin/clang
|
|
121
|
+
export CXX=/usr/local/opt/llvm/bin/clang++
|
|
122
|
+
export CPPFLAGS="-I/usr/local/opt/libomp/include"
|
|
123
|
+
export LDFLAGS="-L/usr/local/opt/libomp/lib -lomp"
|
|
124
|
+
pip install structuredca
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
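Note: this fix has not been exhaustively tested, but building with Homebrew's `LLVM` and `OpenMP` is known to be possible. On Apple Silicon, Homebrew installs under `/opt/homebrew` rather than `/usr/local`, so adjust the paths above accordingly.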
|
|
128
|
+
|
|
129
|
+
## Credits
|
|
130
|
+
- For inferring the DCA coefficients, StructureDCA uses a gradient-based quasi-Newton solver: [L-BFGS](https://github.com/chokkan/liblbfgs "libLBFGS") by Naoaki Okazaki (which is included in this repo).
|
|
131
|
+
- The part of the code that makes the bridge between Python and C++ is inspired by the [plmDCA implementation 'pycofitness'](https://github.com/KIT-MBS/pycofitness/) by Mehari B. Zerihun and Fabrizio Pucci.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
|
|
2
|
+
# Pip Build Setup --------------------------------------------------------------
|
|
3
|
+
[build-system]
|
|
4
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
5
|
+
build-backend = "setuptools.build_meta"
|
|
6
|
+
|
|
7
|
+
# Pip Package Metadata ---------------------------------------------------------
|
|
8
|
+
[project]
|
|
9
|
+
name = "structuredca"
|
|
10
|
+
version = "1.0.0"
|
|
11
|
+
description = "Structure-Informed Direct Couplings Analysis."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Matsvei Tsishyn", email = "matsvei.tsishyn@protonmail.com" },
|
|
15
|
+
{ name = "Hugo Talibart", email = "hugo.talibart@protonmail.com" }
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.9"
|
|
18
|
+
license = "MIT"
|
|
19
|
+
license-files = ["LICENSE"]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy",
|
|
22
|
+
"biopython>=1.75",
|
|
23
|
+
"littlecsv",
|
|
24
|
+
]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: C++",
|
|
28
|
+
"Programming Language :: C",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics"
|
|
31
|
+
]
|
|
32
|
+
urls = { Homepage = "https://github.com/3BioCompBio/StructureDCA" }
|
|
33
|
+
|
|
34
|
+
# Entry Points for CLI ---------------------------------------------------------
|
|
35
|
+
[project.scripts]
|
|
36
|
+
structuredca = "structuredca.cli:main"
|
|
37
|
+
|
|
38
|
+
# Setup pip Package ------------------------------------------------------------
|
|
39
|
+
# this finds all the python packages recursively
|
|
40
|
+
[tool.setuptools]
|
|
41
|
+
packages = { find = {} }
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
NOTE about the 'setup.py' deprecation.
|
|
4
|
+
Despite the fact that 'setup.py' is now deprecated and replaced by 'pyproject.toml', it is still required to include C++ modules in pip packages.
|
|
5
|
+
Here is a minimal 'setup.py' that declares the C++ extension module to complement the metadata in 'pyproject.toml'.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Imports ----------------------------------------------------------------------
import os

from setuptools import Extension, setup

# Extensions -------------------------------------------------------------------

# Select compiler and linker flags per toolchain.
# The Windows branch assumes MSVC, which does not understand GCC-style options:
# it skips '-std=c++17' and '-O3' with an "unknown option" warning, so the C++17
# sources would be built with the compiler's default language standard.
# MSVC needs its own spellings, and no optimization flag is passed to its linker.
if os.name == 'nt':
    # Windows (MSVC)
    compile_args = ['/std:c++17', '/O2', '/openmp']
    link_args = []
else:
    # Linux/macOS (GCC/Clang)
    compile_args = ['-std=c++17', '-O3', '-fopenmp']
    link_args = ['-O3', '-fopenmp']

# Define extension (C++ code that needs to be compiled)
plmdca_ext = Extension(
    name='structuredca.dca_model.dca_solvers.plmdca.lib_plmdcaBackend',
    sources=[  # .cpp files
        'structuredca/dca_model/dca_solvers/plmdca/plmdcaBackend.cpp',
        'structuredca/dca_model/dca_solvers/plmdca/plmdca.cpp',
        'structuredca/dca_model/dca_solvers/plmdca/lbfgs/lib/lbfgs.cpp',
    ],
    include_dirs=[  # .h directories
        'structuredca/dca_model/dca_solvers/plmdca/include/',
        'structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/',
    ],
    language='c++',
    extra_compile_args=compile_args,  # language standard, optimization and OpenMP flags
    extra_link_args=link_args,
)

# Setup ------------------------------------------------------------------------
# All package metadata lives in 'pyproject.toml'; only the extension is declared here.
setup(
    ext_modules=[plmdca_ext],
)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
StructureDCA
|
|
3
|
+
============
|
|
4
|
+
|
|
5
|
+
**Structure-Informed Direct Coupling Analysis** to predict
|
|
6
|
+
the **effects of missense mutations on proteins**.
|
|
7
|
+
It incorporates the residues contact map derived from protein
|
|
8
|
+
3D structures to infer an **evolutionary sparse DCA model**.
|
|
9
|
+
This approach leverages the observation that functionally relevant,
|
|
10
|
+
co-evolving residues are most often in structural contact.
|
|
11
|
+
|
|
12
|
+
Example of usage in Python:
|
|
13
|
+
|
|
14
|
+
>>> from structuredca import StructureDCA
|
|
15
|
+
>>> sdca = StructureDCA('./msa1.fasta', './pdb1.pdb', 'A')
|
|
16
|
+
>>> mutation_score = sdca.eval_mutation('K24M:H39G', reweight_by_rsa=False)
|
|
17
|
+
>>> all_muts_scores_table = sdca.eval_mutations_table()
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from structuredca.structuredca import StructureDCA
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
|
|
2
|
+
# Imports ----------------------------------------------------------------------
|
|
3
|
+
from typing import List
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
|
|
7
|
+
# Main -------------------------------------------------------------------------
|
|
8
|
+
class LinearExtrapolation:
    """
    Build distance matrices by linear extrapolation from two step sizes.

    args:
    distance_increment: float    distance added for each extra step between nodes: d[i, i+2] - d[i, i+1]
    distance_adj: float          distance between two neighbour nodes: d[i, i+1]
    name: str                    label shown in the string representation

    usage:
    lin_ext = LinearExtrapolation(distance_increment=3.2, distance_adj=0.135)

    squared_dist_matrix = lin_ext.extrapolate_diagonal_matrix(10) # shape = (10, 10)

    prolongation_dist_matrix = lin_ext.extrapolate_marginal_matrix([5.1, 5.3, 5.0], 10) # shape = (10, 3)
    """

    # Constructor --------------------------------------------------------------
    def __init__(
            self,
            distance_increment: float,
            distance_adj: float,
            name: str="linear_extrapolator"
        ):
        """Validate both step sizes (must be > 0) and store them as float32."""

        # Guardians
        assert distance_increment > 0.0, f"ERROR in LinearExtrapolation(): distance_increment={distance_increment} should be stricktly positive."
        assert distance_adj > 0.0, f"ERROR in LinearExtrapolation(): distance_adj={distance_adj} should be stricktly positive."

        # Init properties
        self.name = name
        self.distance_increment = np.float32(distance_increment)
        self.distance_adj = np.float32(distance_adj)

    # Base methods -------------------------------------------------------------
    def __str__(self) -> str:
        """Return a short description with both step sizes rounded to 2 decimals."""
        return f"LinearExtrapolation('{self.name}', d_inc={self.distance_increment:.2f}, d_nxt={self.distance_adj:.2f})"

    # Extrapolation ------------------------------------------------------------
    def extrapolate_diagonal_matrix(self, n: int) -> NDArray[np.float32]:
        """
        Return extrapolated diagonal distance matrix (shape=(n, n)).

        The result is symmetric with a zero diagonal; entries whose indices
        differ by k >= 1 hold distance_adj + (k - 1) * distance_increment.
        """
        assert n > 0, f"ERROR in {self}.extrapolate_diagonal_matrix(): n='{n}' should be stricktly positive."

        # steps[k - 1] is the distance between two nodes that are k positions apart
        steps = np.array(
            [self.distance_adj + k * self.distance_increment for k in range(n)],
            dtype=np.float32,
        )

        # gap[i, j] = |i - j|; every off-diagonal cell reads its value from 'steps'
        positions = np.arange(n)
        gap = np.abs(positions[:, None] - positions[None, :])
        off_diagonal = gap > 0

        matrix = np.zeros((n, n), dtype=np.float32)
        matrix[off_diagonal] = steps[gap[off_diagonal] - 1]
        return matrix

    def extrapolate_marginal_matrix(self, distance_array: List[float], n: int, reverse_lines: bool=False) -> NDArray[np.float32]:
        """
        Return extrapolated marginal distance matrix by extrapolating n steps from a distance_array (shape=(n, len(distance_array))).

        Row 0 adds distance_adj to entries that are exactly 0 and
        distance_increment to all other entries. Row k (k >= 1) adds
        (k + 1) * distance_increment to every entry. With reverse_lines=True
        the rows are returned in the opposite order.

        NOTE(review): for zero entries, rows after the first use
        (k + 1) * distance_increment, whereas extrapolate_diagonal_matrix uses
        distance_adj + k * distance_increment at the same separation.
        Confirm this difference is intended.
        """

        # Guardians
        assert n > 0, f"ERROR in {self}.extrapolate_marginal_matrix(): n='{n}' should be stricktly positive."

        # Init
        zero = np.float32(0.0)
        base = np.array(distance_array, dtype=np.float32)

        # First step (distance_adj for zero entries, distance_increment otherwise)
        first_step = np.array(
            [self.distance_increment if d != zero else self.distance_adj for d in base],
            dtype=np.float32,
        )
        rows = [base + first_step]

        # Following steps (multiples of distance_increment)
        rows.extend(
            base + np.float32(k * self.distance_increment)
            for k in range(2, n + 1)
        )

        # Reverse if required
        if reverse_lines:
            rows.reverse()

        # Format and return
        return np.array(rows, dtype=np.float32)
|