structuredca 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. structuredca-1.0.0/LICENSE +24 -0
  2. structuredca-1.0.0/MANIFEST.in +20 -0
  3. structuredca-1.0.0/PKG-INFO +151 -0
  4. structuredca-1.0.0/README.md +131 -0
  5. structuredca-1.0.0/pyproject.toml +41 -0
  6. structuredca-1.0.0/setup.cfg +4 -0
  7. structuredca-1.0.0/setup.py +45 -0
  8. structuredca-1.0.0/structuredca/__init__.py +20 -0
  9. structuredca-1.0.0/structuredca/aligner/__init__.py +2 -0
  10. structuredca-1.0.0/structuredca/aligner/linear_extrapolation.py +87 -0
  11. structuredca-1.0.0/structuredca/aligner/structure_sequence_alignment.py +353 -0
  12. structuredca-1.0.0/structuredca/cli.py +233 -0
  13. structuredca-1.0.0/structuredca/dca_model/__init__.py +2 -0
  14. structuredca-1.0.0/structuredca/dca_model/data_structures/__init__.py +2 -0
  15. structuredca-1.0.0/structuredca/dca_model/data_structures/sparse_J.py +232 -0
  16. structuredca-1.0.0/structuredca/dca_model/data_structures/sparse_matrix.py +83 -0
  17. structuredca-1.0.0/structuredca/dca_model/dca_model.py +757 -0
  18. structuredca-1.0.0/structuredca/dca_model/dca_solvers/__init__.py +2 -0
  19. structuredca-1.0.0/structuredca/dca_model/dca_solvers/dca_solver.py +118 -0
  20. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/__init__.py +1 -0
  21. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/include/plmdca.h +126 -0
  22. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_ansi.h +133 -0
  23. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_sse_double.h +294 -0
  24. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/arithmetic_sse_float.h +298 -0
  25. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/lbfgs.h +747 -0
  26. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/lbfgs/lib/lbfgs.cpp +1374 -0
  27. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdca.cpp +768 -0
  28. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdca.py +213 -0
  29. structuredca-1.0.0/structuredca/dca_model/dca_solvers/plmdca/plmdcaBackend.cpp +278 -0
  30. structuredca-1.0.0/structuredca/dca_model/gauge.py +183 -0
  31. structuredca-1.0.0/structuredca/sequence/__init__.py +6 -0
  32. structuredca-1.0.0/structuredca/sequence/amino_acid.py +256 -0
  33. structuredca-1.0.0/structuredca/sequence/fasta_reader.py +155 -0
  34. structuredca-1.0.0/structuredca/sequence/msa.py +159 -0
  35. structuredca-1.0.0/structuredca/sequence/mutation.py +98 -0
  36. structuredca-1.0.0/structuredca/sequence/pairwise_alignment.py +300 -0
  37. structuredca-1.0.0/structuredca/sequence/sequence.py +172 -0
  38. structuredca-1.0.0/structuredca/structure/__init__.py +2 -0
  39. structuredca-1.0.0/structuredca/structure/residue.py +47 -0
  40. structuredca-1.0.0/structuredca/structure/rsa/__init__.py +2 -0
  41. structuredca-1.0.0/structuredca/structure/rsa/rsa_biopython.py +99 -0
  42. structuredca-1.0.0/structuredca/structure/rsa/rsa_solver.py +117 -0
  43. structuredca-1.0.0/structuredca/structure/structure.py +559 -0
  44. structuredca-1.0.0/structuredca/structuredca.py +673 -0
  45. structuredca-1.0.0/structuredca/utils/__init__.py +2 -0
  46. structuredca-1.0.0/structuredca/utils/logger.py +63 -0
  47. structuredca-1.0.0/structuredca/utils/utils.py +47 -0
  48. structuredca-1.0.0/structuredca.egg-info/SOURCES.txt +45 -0
@@ -0,0 +1,24 @@
1
+ MIT License
2
+
3
+ Copyright (c), 2026, Matsvei Tsishyn
4
+ Copyright (c), 2026, Hugo Talibart
5
+ Copyright (c), 2026, Marianne Rooman
6
+ Copyright (c), 2026, Fabrizio Pucci
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ SOFTWARE.
@@ -0,0 +1,20 @@
1
+ # Include files required for building C++ extension
2
+ include structuredca/dca_model/dca_solvers/plmdca/include/*
3
+ include structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/*
4
+ recursive-include structuredca/dca_model/dca_solvers/plmdca *.cpp
5
+
6
+ # Exclude files and directories that should not be in the package
7
+ exclude Logo.png
8
+ exclude conda-env.yml
9
+ exclude test_data/*
10
+ exclude tutorials/*
11
+ global-exclude *.py[cod]
12
+ global-exclude __pycache__/*
13
+ global-exclude 0_*
14
+ global-exclude *.so
15
+ global-exclude *.a
16
+ global-exclude *.o
17
+
18
+ # Exclude build artifacts
19
+ global-exclude structuredca/dca_model/dca_solvers/plmdca/build/*
20
+ global-exclude *.egg-info/*
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: structuredca
3
+ Version: 1.0.0
4
+ Summary: Structure-Informed Direct Couplings Analysis.
5
+ Author-email: Matsvei Tsishyn <matsvei.tsishyn@protonmail.com>, Hugo Talibart <hugo.talibart@protonmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/3BioCompBio/StructureDCA
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: C++
10
+ Classifier: Programming Language :: C
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy
17
+ Requires-Dist: biopython>=1.75
18
+ Requires-Dist: littlecsv
19
+ Dynamic: license-file
20
+
21
+
22
+ # StructureDCA
23
+
24
+ [![PyPi Version](https://img.shields.io/pypi/v/structuredca.svg)](https://pypi.org/project/structuredca/) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](
25
+ https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb)
26
+ <div style="text-align: center;">
27
+ <img src="Logo.png" alt="[StructureDCA Logo]" height="400"/>
28
+ </div>
29
+
30
+ The `structuredca` Python package implements **Structure-Informed Direct Coupling Analysis** (StructureDCA) to predict the **effects of missense mutations on proteins**.
31
+
32
+ Standard DCA methods use **Multiple Sequence Alignments (MSAs)** to build a **statistical evolutionary model** of homologous protein families. They rely on single-site fields `h` and pairwise couplings `J` that capture co-evolution between residue positions.
33
+ StructureDCA extends this framework by incorporating the **residue–residue contact map** derived from the protein **3D structure** to infer a **sparse DCA model**, in which couplings between spatially distant residue pairs are removed.
34
+ This approach leverages the observation that functionally relevant, co-evolving residues are most often structurally in contact.
35
+
36
+ The package includes a **pseudolikelihood-maximization DCA solver** capable of inferring sparse DCA models, where selected coupling coefficients `Jij` are constrained to zero.
37
+ StructureDCA combines a flexible, user-friendly Python interface with the high computational efficiency of its C++ backend.
38
+ This model was initially developed to improve classical DCA methods for predicting the effects of missense mutations in proteins. However, StructureDCA can be applied to any DCA-based analysis (except for contact predictions...) and provides the full functionality of both standard and sparse DCA models.
39
+
40
+ **Please cite**:
41
+ - [Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. Inferring Protein Mutational Landscape with Structure-Informed Direct Coupling Analysis. BioRxiv](https://www.biorxiv.org/).
42
+
43
+
44
+ ## Installation and Usage
45
+
46
+ ### Colab Notebook
47
+
48
+ You can instantly try StructureDCA in this [Colab Notebook](https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb). This notebook acts as a **user-friendly web server** / **graphical interface**, offering helpers to **automatically fetch or generate the MSA and 3D structure** for your target protein.
49
+ You can then **visualize** your mutational landscape predictions as a DMS heatmap or mapped to the 3D structure.
50
+
51
+ ### Installation
52
+ Installation with `pip`:
53
+ ```bash
54
+ pip install structuredca
55
+ ```
56
+
57
+ ### CLI usage
58
+ Use StructureDCA with a Command Line Interface (CLI).
59
+ For example, from the directory `./test_data/`, run:
60
+ ```bash
61
+ structuredca ./6acv_A_29-94.fasta ./6acv_A_29-94.pdb A -o ./6acv_A_29-94_structuredca.csv
62
+ ```
63
+
64
+ To show CLI usage and optional arguments, run:
65
+ ```bash
66
+ structuredca --help
67
+ ```
68
+
69
+ ### Python usage
70
+ Make sure the first sequence in your MSA file is the target sequence to mutate (otherwise have a look at tutorial 1).
71
+ From directory `./test_data/` execute the following Python code:
72
+ ```python
73
+ # Import
74
+ from structuredca import StructureDCA
75
+
76
+ # Log basic usage and arguments
77
+ StructureDCA.help()
78
+
79
+ # Initialize StructureDCA model
80
+ sdca = StructureDCA(
81
+ msa_path='./6acv_A_29-94.fasta',
82
+ pdb_path='./6acv_A_29-94.pdb', chains='A',
83
+ use_contacts_plddt_filter=False, # use only if 3D structure is an AlphaFold model (or similar) to remove low pLDDT regions from contacts
84
+ )
85
+
86
+ # Evaluate the evolutionary energy difference (ΔE) of mutations
87
+ # scores can be reweighted by Relative Solvent Accessibility-complement (RSAc) -> advised to predict stability changes (ΔΔG)
88
+ # * dE = 0 means neutral mutation
89
+ # * dE >> 0 means destabilizing / deleterious mutation
90
+ dE_mut1 = sdca.eval_mutation('K13H', reweight_by_rsa=True)
91
+ dE_mut2 = sdca.eval_mutation('K13H:K12G', reweight_by_rsa=True)
92
+
93
+ # Evaluate ΔE of all single mutations and save results to a file
94
+ dE_all = sdca.eval_mutations_table(
95
+ save_path='./6acv_A_29-94_structuredca.csv',
96
+ log_output_sample=True,
97
+ )
98
+
99
+ # Evaluate absolute evolutionary energy (E) of a sequence
100
+ seq_to_evaluate = 'A' * sdca.msa_length # arbitrary example: AAAAA...
101
+ E_only_alanine = sdca.eval_sequence(seq_to_evaluate, reweight_by_rsa=True)
102
+
103
+ # Evaluate relative probabilities for the 20 Amino Acids at this position given a background sequence
104
+ fasta_position = 10 # as in FASTA index system (starts at 1)
105
+ array_position = fasta_position - 1 # As in a Python array (starts at 0)
106
+ amino_acid_probabilities = sdca.position_probabilities(array_position) # P(a) = e^{-dE(wt→a)} / ∑_b e^{-dE(wt→b)}
107
+ ```
108
+
109
+ ### Tutorials and Advanced Usage
110
+ In the `./tutorials/` directory, we provide a series of Jupyter notebooks that illustrate different ways of using **StructureDCA**:
111
+
112
+ 1. **Basics and arguments** (`1_sdca-basics.ipynb`): basics, evaluating effects of mutations with StructureDCA, using optional arguments (like `distance_cutoff` or `lambda_h` / `lambda_J`), evaluate mutations with an alternative background sequence.
113
+
114
+ 2. **Access properties** (`2_sdca-properties.ipynb`): access StructureDCA coefficients and properties (like fields `h`, couplings `J`, Frobenius norms, residue-residue distance matrix, contact map, ...).
115
+
116
+ 3. **Standard DCA** (`3_sdca-standard-dca.ipynb`): solve standard (fully connected) DCA models and run without protein 3D structure.
117
+
118
+ 4. **Protein–Protein Interactions** (`4_sdca-ppis.ipynb`): working with protein–protein interactions (PPIs).
119
+ Compute RSA from the biologically relevant conformation, include inter-chain contacts arising from homomers, and build a StructureDCA model from a concatenated MSA of a heteromer PPI of highly coevolving proteins.
120
+
121
+ 5. **Custom contacts** (`5_sdca-custom-contacts.ipynb`): build a StructureDCA model with custom contact map (instead of the default distance criteria) and custom weights for StructureDCA[RSA] (instead of default RSA-based weights) to derive any possible sparse DCA model.
122
+
123
+ ## Build and Installation Notes
124
+
125
+ ### Requirements
126
+ - Python 3.9 or later
127
+ - Python packages `numpy` and `biopython` (version 1.75 or later)
128
+ - A C++ compiler that supports C++17 and OpenMP (such as GCC, LLVM or MSVC).
129
+
130
+ ### Troubleshooting macOS build errors
131
+ StructureDCA uses OpenMP for parallel computation in C++.
132
+ Since macOS `Clang` compiler does not include OpenMP support by default, macOS users may need to install and configure the `LLVM` compiler to build the package.
133
+
134
+ If you are using macOS and encounter compilation errors related to `-fopenmp`, please follow these steps:
135
+
136
+ - Install `Homebrew`
137
+ - Install `LLVM` and `OpenMP` runtime and configure Python to use this compiler. Then try to re-install StructureDCA. Run something like (paths may slightly differ):
138
+ ```bash
139
+ brew install llvm libomp
140
+ export CC=/usr/local/opt/llvm/bin/clang
141
+ export CXX=/usr/local/opt/llvm/bin/clang++
142
+ export CPPFLAGS="-I/usr/local/opt/libomp/include"
143
+ export LDFLAGS="-L/usr/local/opt/libomp/lib -lomp"
144
+ pip install structuredca
145
+ ```
146
+
147
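+ Note: this workaround has not been verified on every macOS setup, but building with Homebrew's `LLVM` and `OpenMP` is known to be possible. On Apple Silicon machines, Homebrew installs under `/opt/homebrew` instead of `/usr/local`, so adapt the paths above accordingly.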
148
+
149
+ ## Credits
150
+ - For inferring the DCA coefficients, StructureDCA uses a quasi-Newton gradient-based solver: [L-BFGS](https://github.com/chokkan/liblbfgs "libLBFGS") by Naoaki Okazaki (which is included in this repo).
151
+ - The part of the code that makes the bridge between Python and C++ is inspired by the [plmDCA implementation 'pycofitness'](https://github.com/KIT-MBS/pycofitness/) by Mehari B. Zerihun, Fabrizio Pucci.
@@ -0,0 +1,131 @@
1
+
2
+ # StructureDCA
3
+
4
+ [![PyPi Version](https://img.shields.io/pypi/v/structuredca.svg)](https://pypi.org/project/structuredca/) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](
5
+ https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb)
6
+ <div style="text-align: center;">
7
+ <img src="Logo.png" alt="[StructureDCA Logo]" height="400"/>
8
+ </div>
9
+
10
+ The `structuredca` Python package implements **Structure-Informed Direct Coupling Analysis** (StructureDCA) to predict the **effects of missense mutations on proteins**.
11
+
12
+ Standard DCA methods use **Multiple Sequence Alignments (MSAs)** to build a **statistical evolutionary model** of homologous protein families. They rely on single-site fields `h` and pairwise couplings `J` that capture co-evolution between residue positions.
13
+ StructureDCA extends this framework by incorporating the **residue–residue contact map** derived from the protein **3D structure** to infer a **sparse DCA model**, in which couplings between spatially distant residue pairs are removed.
14
+ This approach leverages the observation that functionally relevant, co-evolving residues are most often structurally in contact.
15
+
16
+ The package includes a **pseudolikelihood-maximization DCA solver** capable of inferring sparse DCA models, where selected coupling coefficients `Jij` are constrained to zero.
17
+ StructureDCA combines a flexible, user-friendly Python interface with the high computational efficiency of its C++ backend.
18
+ This model was initially developed to improve classical DCA methods for predicting the effects of missense mutations in proteins. However, StructureDCA can be applied to any DCA-based analysis (except for contact predictions...) and provides the full functionality of both standard and sparse DCA models.
19
+
20
+ **Please cite**:
21
+ - [Matsvei Tsishyn, Hugo Talibart, Marianne Rooman, Fabrizio Pucci. Inferring Protein Mutational Landscape with Structure-Informed Direct Coupling Analysis. BioRxiv](https://www.biorxiv.org/).
22
+
23
+
24
+ ## Installation and Usage
25
+
26
+ ### Colab Notebook
27
+
28
+ You can instantly try StructureDCA in this [Colab Notebook](https://colab.research.google.com/github/3BioCompBio/StructureDCA/blob/main/colab_notebook_StructureDCA.ipynb). This notebook acts as a **user-friendly web server** / **graphical interface**, offering helpers to **automatically fetch or generate the MSA and 3D structure** for your target protein.
29
+ You can then **visualize** your mutational landscape predictions as a DMS heatmap or mapped to the 3D structure.
30
+
31
+ ### Installation
32
+ Installation with `pip`:
33
+ ```bash
34
+ pip install structuredca
35
+ ```
36
+
37
+ ### CLI usage
38
+ Use StructureDCA with a Command Line Interface (CLI).
39
+ For example, from the directory `./test_data/`, run:
40
+ ```bash
41
+ structuredca ./6acv_A_29-94.fasta ./6acv_A_29-94.pdb A -o ./6acv_A_29-94_structuredca.csv
42
+ ```
43
+
44
+ To show CLI usage and optional arguments, run:
45
+ ```bash
46
+ structuredca --help
47
+ ```
48
+
49
+ ### Python usage
50
+ Make sure the first sequence in your MSA file is the target sequence to mutate (otherwise have a look at tutorial 1).
51
+ From directory `./test_data/` execute the following Python code:
52
+ ```python
53
+ # Import
54
+ from structuredca import StructureDCA
55
+
56
+ # Log basic usage and arguments
57
+ StructureDCA.help()
58
+
59
+ # Initialize StructureDCA model
60
+ sdca = StructureDCA(
61
+ msa_path='./6acv_A_29-94.fasta',
62
+ pdb_path='./6acv_A_29-94.pdb', chains='A',
63
+ use_contacts_plddt_filter=False, # use only if 3D structure is an AlphaFold model (or similar) to remove low pLDDT regions from contacts
64
+ )
65
+
66
+ # Evaluate the evolutionary energy difference (ΔE) of mutations
67
+ # scores can be reweighted by Relative Solvent Accessibility-complement (RSAc) -> advised to predict stability changes (ΔΔG)
68
+ # * dE = 0 means neutral mutation
69
+ # * dE >> 0 means destabilizing / deleterious mutation
70
+ dE_mut1 = sdca.eval_mutation('K13H', reweight_by_rsa=True)
71
+ dE_mut2 = sdca.eval_mutation('K13H:K12G', reweight_by_rsa=True)
72
+
73
+ # Evaluate ΔE of all single mutations and save results to a file
74
+ dE_all = sdca.eval_mutations_table(
75
+ save_path='./6acv_A_29-94_structuredca.csv',
76
+ log_output_sample=True,
77
+ )
78
+
79
+ # Evaluate absolute evolutionary energy (E) of a sequence
80
+ seq_to_evaluate = 'A' * sdca.msa_length # arbitrary example: AAAAA...
81
+ E_only_alanine = sdca.eval_sequence(seq_to_evaluate, reweight_by_rsa=True)
82
+
83
+ # Evaluate relative probabilities for the 20 Amino Acids at this position given a background sequence
84
+ fasta_position = 10 # as in FASTA index system (starts at 1)
85
+ array_position = fasta_position - 1 # As in a Python array (starts at 0)
86
+ amino_acid_probabilities = sdca.position_probabilities(array_position) # P(a) = e^{-dE(wt→a)} / ∑_b e^{-dE(wt→b)}
87
+ ```
88
+
89
+ ### Tutorials and Advanced Usage
90
+ In the `./tutorials/` directory, we provide a series of Jupyter notebooks that illustrate different ways of using **StructureDCA**:
91
+
92
+ 1. **Basics and arguments** (`1_sdca-basics.ipynb`): basics, evaluating effects of mutations with StructureDCA, using optional arguments (like `distance_cutoff` or `lambda_h` / `lambda_J`), evaluate mutations with an alternative background sequence.
93
+
94
+ 2. **Access properties** (`2_sdca-properties.ipynb`): access StructureDCA coefficients and properties (like fields `h`, couplings `J`, Frobenius norms, residue-residue distance matrix, contact map, ...).
95
+
96
+ 3. **Standard DCA** (`3_sdca-standard-dca.ipynb`): solve standard (fully connected) DCA models and run without protein 3D structure.
97
+
98
+ 4. **Protein–Protein Interactions** (`4_sdca-ppis.ipynb`): working with protein–protein interactions (PPIs).
99
+ Compute RSA from the biologically relevant conformation, include inter-chain contacts arising from homomers, and build a StructureDCA model from a concatenated MSA of a heteromer PPI of highly coevolving proteins.
100
+
101
+ 5. **Custom contacts** (`5_sdca-custom-contacts.ipynb`): build a StructureDCA model with custom contact map (instead of the default distance criteria) and custom weights for StructureDCA[RSA] (instead of default RSA-based weights) to derive any possible sparse DCA model.
102
+
103
+ ## Build and Installation Notes
104
+
105
+ ### Requirements
106
+ - Python 3.9 or later
107
+ - Python packages `numpy` and `biopython` (version 1.75 or later)
108
+ - A C++ compiler that supports C++17 and OpenMP (such as GCC, LLVM or MSVC).
109
+
110
+ ### Troubleshooting macOS build errors
111
+ StructureDCA uses OpenMP for parallel computation in C++.
112
+ Since macOS `Clang` compiler does not include OpenMP support by default, macOS users may need to install and configure the `LLVM` compiler to build the package.
113
+
114
+ If you are using macOS and encounter compilation errors related to `-fopenmp`, please follow these steps:
115
+
116
+ - Install `Homebrew`
117
+ - Install `LLVM` and `OpenMP` runtime and configure Python to use this compiler. Then try to re-install StructureDCA. Run something like (paths may slightly differ):
118
+ ```bash
119
+ brew install llvm libomp
120
+ export CC=/usr/local/opt/llvm/bin/clang
121
+ export CXX=/usr/local/opt/llvm/bin/clang++
122
+ export CPPFLAGS="-I/usr/local/opt/libomp/include"
123
+ export LDFLAGS="-L/usr/local/opt/libomp/lib -lomp"
124
+ pip install structuredca
125
+ ```
126
+
127
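+ Note: this workaround has not been verified on every macOS setup, but building with Homebrew's `LLVM` and `OpenMP` is known to be possible. On Apple Silicon machines, Homebrew installs under `/opt/homebrew` instead of `/usr/local`, so adapt the paths above accordingly.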
128
+
129
+ ## Credits
130
+ - For inferring the DCA coefficients, StructureDCA uses a quasi-Newton gradient-based solver: [L-BFGS](https://github.com/chokkan/liblbfgs "libLBFGS") by Naoaki Okazaki (which is included in this repo).
131
+ - The part of the code that makes the bridge between Python and C++ is inspired by the [plmDCA implementation 'pycofitness'](https://github.com/KIT-MBS/pycofitness/) by Mehari B. Zerihun, Fabrizio Pucci.
@@ -0,0 +1,41 @@
1
+
2
# Pip Build Setup --------------------------------------------------------------
# NOTE: the SPDX-expression form of `license` and the `license-files` key used
# in the [project] table below (PEP 639) are only understood by setuptools
# 77.0 and later, so the build requirement must not allow older versions.
[build-system]
requires = ["setuptools>=77.0", "wheel"]
build-backend = "setuptools.build_meta"

# Pip Package Metadata ---------------------------------------------------------
[project]
name = "structuredca"
version = "1.0.0"
description = "Structure-Informed Direct Couplings Analysis."
readme = "README.md"
authors = [
    { name = "Matsvei Tsishyn", email = "matsvei.tsishyn@protonmail.com" },
    { name = "Hugo Talibart", email = "hugo.talibart@protonmail.com" }
]
requires-python = ">=3.9"
license = "MIT"
license-files = ["LICENSE"]
dependencies = [
    "numpy",
    "biopython>=1.75",
    "littlecsv",
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: C++",
    "Programming Language :: C",
    "Operating System :: OS Independent",
    "Topic :: Scientific/Engineering :: Bio-Informatics"
]
urls = { Homepage = "https://github.com/3BioCompBio/StructureDCA" }

# Entry Points for CLI ---------------------------------------------------------
# Installs the `structuredca` console command, dispatching to structuredca.cli:main
[project.scripts]
structuredca = "structuredca.cli:main"

# Setup pip Package ------------------------------------------------------------
# this finds all the python packages recursively
[tool.setuptools]
packages = { find = {} }
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,45 @@
1
+
2
"""
NOTE about 'setup.py' alongside 'pyproject.toml'.
Package metadata lives in 'pyproject.toml', but a 'setup.py' is still required to declare compiled C++ extension modules in pip packages.
This is a minimal 'setup.py' that only registers the C++ extension, to complement the 'pyproject.toml'.
"""

# Imports ----------------------------------------------------------------------
import os
from setuptools import setup, Extension

# Extensions -------------------------------------------------------------------

# Select compiler/linker flags per platform (C++17, optimization and OpenMP).
# MSVC does not understand GCC-style options such as '-std=c++17' or '-O3',
# so each compiler family gets its own complete set of flags.
if os.name == 'nt':
    # Windows (MSVC)
    compile_args = ['/std:c++17', '/O2', '/openmp']
    link_args = []
else:
    # Linux/macOS (GCC/Clang)
    compile_args = ['-std=c++17', '-O3', '-fopenmp']
    link_args = ['-O3', '-fopenmp']

# Define extension (C++ code that need to be compiled)
plmdca_ext = Extension(
    name='structuredca.dca_model.dca_solvers.plmdca.lib_plmdcaBackend',
    sources=[ # .cpp files
        'structuredca/dca_model/dca_solvers/plmdca/plmdcaBackend.cpp',
        'structuredca/dca_model/dca_solvers/plmdca/plmdca.cpp',
        'structuredca/dca_model/dca_solvers/plmdca/lbfgs/lib/lbfgs.cpp',
    ],
    include_dirs=[ # .h directories
        'structuredca/dca_model/dca_solvers/plmdca/include/',
        'structuredca/dca_model/dca_solvers/plmdca/lbfgs/include/',
    ],
    language='c++',
    extra_compile_args=compile_args, # optimization and other flags
    extra_link_args=link_args,
)

# Setup ------------------------------------------------------------------------
# All other metadata is read from 'pyproject.toml'; only the extension is declared here.
setup(
    ext_modules = [plmdca_ext],
)
@@ -0,0 +1,20 @@
1
+ """
2
+ StructureDCA
3
+ ============
4
+
5
+ **Structure-Informed Direct Coupling Analysis** to predict
6
+ the **effects of missense mutations on proteins**.
7
+ It incorporates the residues contact map derived from protein
8
+ 3D structures to infer an **evolutionary sparse DCA model**.
9
+ This approach leverages the observation that functionally relevant,
10
+ co-evolving residues are most often in structural contact.
11
+
12
+ Example of usage in Python:
13
+
14
+ >>> from structuredca import StructureDCA
15
+ >>> sdca = StructureDCA('./msa1.fasta', './pdb1.pdb', 'A')
16
+ >>> mutation_score = sdca.eval_mutation('K24M:H39G', reweight_by_rsa=False)
17
+ >>> all_muts_scores_table = sdca.eval_mutations_table()
18
+ """
19
+
20
+ from structuredca.structuredca import StructureDCA
@@ -0,0 +1,2 @@
1
+ from structuredca.aligner.linear_extrapolation import LinearExtrapolation
2
+ from structuredca.aligner.structure_sequence_alignment import StructureSequenceAlignment
@@ -0,0 +1,87 @@
1
+
2
+ # Imports ----------------------------------------------------------------------
3
+ from typing import List
4
+ import numpy as np
5
+ from numpy.typing import NDArray
6
+
7
+ # Main -------------------------------------------------------------------------
8
+ class LinearExtrapolation:
9
+ """
10
+ Class to extrapolate distance matrices distance_increment between two consecutive nodes.
11
+
12
+ args:
13
+ distance_increment: float increment distance between two nodes: d[i, i+2] - d[i, i+1]
14
+ distance_adj: float distance_increment for two neighbour nodes: d[i, i+1]
15
+
16
+ usage:
17
+ lin_ext = LinearExtrapolation(distance_increment=3.2, distance_adj=0.135)
18
+
19
+ squared_dist_matrix = lin_ext.extrapolate_diagonal_matrix(10) # shape = (10, 10)
20
+
21
+ prolongation_dist_matrix = lin_ext.extrapolate_marginal_matrix([5.1, 5.3, 5.0], 10) # shape = (10, 3)
22
+ """
23
+
24
+ # Constructor --------------------------------------------------------------
25
+ def __init__(
26
+ self,
27
+ distance_increment: float,
28
+ distance_adj: float,
29
+ name: str="linear_extrapolator"
30
+ ):
31
+
32
+ # Guardians
33
+ assert distance_increment > 0.0, f"ERROR in LinearExtrapolation(): distance_increment={distance_increment} should be stricktly positive."
34
+ assert distance_adj > 0.0, f"ERROR in LinearExtrapolation(): distance_adj={distance_adj} should be stricktly positive."
35
+
36
+ # Init properties
37
+ self.name = name
38
+ self.distance_increment = np.float32(distance_increment)
39
+ self.distance_adj = np.float32(distance_adj)
40
+
41
+ # Base methods -------------------------------------------------------------
42
+ def __str__(self) -> str:
43
+ return f"LinearExtrapolation('{self.name}', d_inc={self.distance_increment:.2f}, d_nxt={self.distance_adj:.2f})"
44
+
45
+ # Extrapoation -------------------------------------------------------------
46
+ def extrapolate_diagonal_matrix(self, n: int) -> NDArray[np.float32]:
47
+ """
48
+ Return extrapolated diagonal distance matrix (shape=(n, n)).
49
+ """
50
+ assert n > 0, f"ERROR in {self}.extrapolate_diagonal_matrix(): n='{n}' should be stricktly positive."
51
+ extrapolation_array = [self.distance_adj + i*self.distance_increment for i in range(n)]
52
+ matrix = np.zeros((n, n), dtype=np.float32)
53
+ for i1 in range(n):
54
+ for i2 in range(i1):
55
+ delta_i = i1 - i2
56
+ d = extrapolation_array[delta_i - 1]
57
+ matrix[i1, i2] = d
58
+ matrix[i2, i1] = d
59
+ return matrix
60
+
61
+ def extrapolate_marginal_matrix(self, distance_array: List[float], n: int, reverse_lines: bool=False) -> NDArray[np.float32]:
62
+ """
63
+ Return extrapolated marginal distance matrix by extrapolating n steps from a distance_array (shape=(n, len(distance_array))).
64
+ """
65
+
66
+ # Guardians
67
+ assert n > 0, f"ERROR in {self}.extrapolate_marginal_matrix(): n='{n}' should be stricktly positive."
68
+
69
+ # Init
70
+ ZERO = np.float32(0.0)
71
+ distance_array = np.array(distance_array, dtype=np.float32)
72
+
73
+ # First increment (might use distance_adj or distance_increment)
74
+ first_increment_arr = np.array([self.distance_adj if d == ZERO else self.distance_increment for d in distance_array], dtype=np.float32)
75
+ matrix = [distance_array + first_increment_arr]
76
+
77
+ # Following increments (use distance_increment)
78
+ for i in range(n-1):
79
+ increment = np.float32((i+2)*self.distance_increment)
80
+ matrix.append(distance_array + increment)
81
+
82
+ # Reverse if required
83
+ if reverse_lines:
84
+ matrix = matrix[::-1]
85
+
86
+ # Format and return
87
+ return np.array(matrix, dtype=np.float32)