rcsb-embedding-model 0.0.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb_embedding_model-0.0.44/.dockerignore +36 -0
- rcsb_embedding_model-0.0.44/.github/workflows/_workflow-docker.yaml +17 -0
- rcsb_embedding_model-0.0.44/.github/workflows/publish.yaml +91 -0
- rcsb_embedding_model-0.0.44/.gitignore +6 -0
- rcsb_embedding_model-0.0.44/Dockerfile +8 -0
- rcsb_embedding_model-0.0.44/LICENSE.md +4 -0
- rcsb_embedding_model-0.0.44/PKG-INFO +136 -0
- rcsb_embedding_model-0.0.44/README.md +111 -0
- rcsb_embedding_model-0.0.44/assets/embedding-model-architecture.png +0 -0
- rcsb_embedding_model-0.0.44/examples/esm_embeddings.py +23 -0
- rcsb_embedding_model-0.0.44/pyproject.toml +39 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/__init__.py +9 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/cli/args_utils.py +9 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/cli/inference.py +449 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/esm_prot_from_chain.py +118 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/esm_prot_from_structure.py +64 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py +67 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py +100 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/residue_embedding_from_structure.py +67 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py +44 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/untils/__init__.py +4 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/dataset/untils/utils.py +17 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/inference/assembly_inferece.py +60 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/inference/chain_inference.py +83 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/inference/esm_inference.py +76 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/inference/structure_inference.py +79 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/model/layers.py +28 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/model/residue_embedding_aggregator.py +53 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/modules/chain_module.py +19 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/modules/esm_module.py +24 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/modules/structure_module.py +32 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/rcsb_structure_embedding.py +127 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/types/api_types.py +60 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/utils/data.py +171 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/utils/esm/loaders.py +65 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/utils/model.py +28 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/utils/structure_parser.py +100 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/utils/structure_provider.py +27 -0
- rcsb_embedding_model-0.0.44/src/rcsb_embedding_model/writer/batch_writer.py +131 -0
- rcsb_embedding_model-0.0.44/tests/resources/embeddings/1acb.A.pt +0 -0
- rcsb_embedding_model-0.0.44/tests/resources/embeddings/1acb.B.pt +0 -0
- rcsb_embedding_model-0.0.44/tests/resources/embeddings/2uzi.A.pt +0 -0
- rcsb_embedding_model-0.0.44/tests/resources/embeddings/2uzi.B.pt +0 -0
- rcsb_embedding_model-0.0.44/tests/resources/embeddings/2uzi.C.pt +0 -0
- rcsb_embedding_model-0.0.44/tests/resources/pdb/1acb.cif +5068 -0
- rcsb_embedding_model-0.0.44/tests/resources/pdb/2uzi.cif +6685 -0
- rcsb_embedding_model-0.0.44/tests/resources/src_stream/assembly-complete-test.csv +7 -0
- rcsb_embedding_model-0.0.44/tests/resources/src_stream/instance-complete-test.csv +10 -0
- rcsb_embedding_model-0.0.44/tests/resources/src_stream/instance.csv +2 -0
- rcsb_embedding_model-0.0.44/tests/test_cli_inference.py +48 -0
- rcsb_embedding_model-0.0.44/tests/test_embedding_model.py +43 -0
- rcsb_embedding_model-0.0.44/tests/test_inference.py +172 -0
- rcsb_embedding_model-0.0.44/tests/test_remote_inference.py +103 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Only list files that could be under version control or could be created in CI/CD.
|
|
2
|
+
|
|
3
|
+
# Note: Patterns are absolute, like `.prettierignore` but unlike `.gitignore`.
|
|
4
|
+
|
|
5
|
+
# Gitignored files
|
|
6
|
+
**/.*
|
|
7
|
+
**/[~#$]*
|
|
8
|
+
**/*[~#$]
|
|
9
|
+
|
|
10
|
+
# Gitignored directories
|
|
11
|
+
node_modules/
|
|
12
|
+
|
|
13
|
+
# Directories
|
|
14
|
+
/dist/
|
|
15
|
+
/tests/
|
|
16
|
+
/assets/
|
|
17
|
+
/examples/
|
|
18
|
+
|
|
19
|
+
# Files in the root directory
|
|
20
|
+
/*.md
|
|
21
|
+
/*.txt
|
|
22
|
+
/CITATION.cff
|
|
23
|
+
/compose.yaml
|
|
24
|
+
/justfile
|
|
25
|
+
/mkdocs.yaml
|
|
26
|
+
|
|
27
|
+
# Keep README.md (needed for build)
|
|
28
|
+
!/README.md
|
|
29
|
+
|
|
30
|
+
# Keep .dockerignore and .gitignore
|
|
31
|
+
!/.dockerignore
|
|
32
|
+
!/.gitignore
|
|
33
|
+
|
|
34
|
+
# Keep legal files
|
|
35
|
+
!/LICENSE.*
|
|
36
|
+
!/NOTICE.txt
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Docker build and push workflow
|
|
2
|
+
|
|
3
|
+
name: Run CI/CD Docker Workflow
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
workflow_call:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
run-workflow:
|
|
10
|
+
if: github.event_name == 'release'
|
|
11
|
+
name: "Run automated docker workflow"
|
|
12
|
+
uses: rcsb/devops-cicd-github-actions/.github/workflows/workflow-docker.yaml@master
|
|
13
|
+
with:
|
|
14
|
+
dockerfile_location: "Dockerfile" # The location of the Dockerfile relative to the root of the repository. Defaults to "Dockerfile".
|
|
15
|
+
repo_project: "rcsb" # REQUIRED. The name of the project or organization in the remote Docker image repository.
|
|
16
|
+
docker_image_name: "rcsb-embedding-model" # REQUIRED. The name of the Docker image to create.
|
|
17
|
+
docker_build_context: "." # The path location of the docker build context, relative to the project root. Defaults to the project root.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
name: CI Pipeline
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [master]
|
|
8
|
+
release:
|
|
9
|
+
types: [published]
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
hatch-test:
|
|
13
|
+
name: Test on Python ${{ matrix.python-version }}
|
|
14
|
+
runs-on: ["self-hosted", "buildchain"]
|
|
15
|
+
timeout-minutes: 20
|
|
16
|
+
strategy:
|
|
17
|
+
matrix:
|
|
18
|
+
python-version: ["3.11"]
|
|
19
|
+
steps:
|
|
20
|
+
- name: Checkout code
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Install build dependencies
|
|
24
|
+
run: |
|
|
25
|
+
sudo apt-get update
|
|
26
|
+
sudo apt-get install -y build-essential pkg-config libzstd-dev
|
|
27
|
+
|
|
28
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
29
|
+
uses: actions/setup-python@v4
|
|
30
|
+
with:
|
|
31
|
+
python-version: ${{ matrix.python-version }}
|
|
32
|
+
|
|
33
|
+
- name: Install Hatch and HuggingFace
|
|
34
|
+
run: pip install hatch huggingface_hub[cli]
|
|
35
|
+
|
|
36
|
+
- name: Run tests
|
|
37
|
+
run: hatch test
|
|
38
|
+
|
|
39
|
+
hatch-build:
|
|
40
|
+
name: Build to PyPI
|
|
41
|
+
needs: hatch-test
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
if: github.event_name == 'release'
|
|
44
|
+
steps:
|
|
45
|
+
- name: Checkout code
|
|
46
|
+
uses: actions/checkout@v4
|
|
47
|
+
|
|
48
|
+
- name: Set up Python 3.10
|
|
49
|
+
uses: actions/setup-python@v4
|
|
50
|
+
with:
|
|
51
|
+
python-version: "3.10"
|
|
52
|
+
|
|
53
|
+
- name: Install Hatch
|
|
54
|
+
run: pip install hatch
|
|
55
|
+
|
|
56
|
+
- name: Build distribution
|
|
57
|
+
run: hatch build
|
|
58
|
+
|
|
59
|
+
- name: Store the distribution packages
|
|
60
|
+
uses: actions/upload-artifact@v4
|
|
61
|
+
with:
|
|
62
|
+
name: python-package-distributions
|
|
63
|
+
path: dist/
|
|
64
|
+
|
|
65
|
+
publish-to-pypi:
|
|
66
|
+
name: >-
|
|
67
|
+
Publish Python 🐍 distribution 📦 to PyPI
|
|
68
|
+
if: github.event_name == 'release'
|
|
69
|
+
needs:
|
|
70
|
+
- hatch-build
|
|
71
|
+
runs-on: ubuntu-latest
|
|
72
|
+
environment:
|
|
73
|
+
name: pypi
|
|
74
|
+
url: https://pypi.org/p/rcsb-embedding-model
|
|
75
|
+
permissions:
|
|
76
|
+
id-token: write
|
|
77
|
+
|
|
78
|
+
steps:
|
|
79
|
+
- name: Download all the dists
|
|
80
|
+
uses: actions/download-artifact@v4
|
|
81
|
+
with:
|
|
82
|
+
name: python-package-distributions
|
|
83
|
+
path: dist/
|
|
84
|
+
- name: Publish distribution 📦 to PyPI
|
|
85
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
86
|
+
|
|
87
|
+
push-image:
|
|
88
|
+
needs:
|
|
89
|
+
- hatch-test
|
|
90
|
+
name: Push image to harbor
|
|
91
|
+
uses: ./.github/workflows/_workflow-docker.yaml
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rcsb-embedding-model
|
|
3
|
+
Version: 0.0.44
|
|
4
|
+
Summary: Protein Embedding Model for Structure Search
|
|
5
|
+
Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
|
|
6
|
+
Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
|
|
7
|
+
Author-email: Joan Segura <joan.segura@rcsb.org>
|
|
8
|
+
License: # Cambrian Non-Commercial License Agreement
|
|
9
|
+
|
|
10
|
+
This project is licensed under the EvolutionaryScale Cambrian Non-Commercial License Agreement.
|
|
11
|
+
See: https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement
|
|
12
|
+
License-File: LICENSE.md
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Requires-Dist: biotite>=1.5.0
|
|
17
|
+
Requires-Dist: esm>=3.2.0
|
|
18
|
+
Requires-Dist: hf-xet>=1.1.10
|
|
19
|
+
Requires-Dist: httpx>=0.28.1
|
|
20
|
+
Requires-Dist: huggingface-hub>=0.30.2
|
|
21
|
+
Requires-Dist: importlib-metadata>=8.7.0
|
|
22
|
+
Requires-Dist: lightning>=2.5.0
|
|
23
|
+
Requires-Dist: typer>=0.15.0
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# RCSB Embedding Model
|
|
27
|
+
|
|
28
|
+
**Version** 0.0.44
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Overview
|
|
32
|
+
|
|
33
|
+
RCSB Embedding Model is a neural network architecture designed to encode macromolecular 3D structures into fixed-length vector embeddings for efficient large-scale structure similarity search.
|
|
34
|
+
|
|
35
|
+
Preprint: [Multi-scale structural similarity embedding search across entire proteomes](https://www.biorxiv.org/content/10.1101/2025.02.28.640875v1).
|
|
36
|
+
|
|
37
|
+
A web-based implementation using this model for structure similarity search is available at [rcsb-embedding-search](http://embedding-search.rcsb.org).
|
|
38
|
+
|
|
39
|
+
If you are interested in training the model with a new dataset, visit the [rcsb-embedding-search repository](https://github.com/bioinsilico/rcsb-embedding-search), which provides scripts and documentation for training.
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
## Features
|
|
43
|
+
|
|
44
|
+
- **Residue-level embeddings** computed using the ESM3 protein language model
|
|
45
|
+
- **Structure-level embeddings** aggregated via a transformer-based aggregator network
|
|
46
|
+
- **Command-line interface** implemented with Typer for high-throughput inference workflows
|
|
47
|
+
- **Python API** for interactive embedding computation and integration into analysis pipelines
|
|
48
|
+
- **High-performance inference** leveraging PyTorch Lightning, with multi-node and multi-GPU support
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
pip install rcsb-embedding-model
|
|
55
|
+
|
|
56
|
+
**Requirements:**
|
|
57
|
+
|
|
58
|
+
- Python ≥ 3.11
|
|
59
|
+
- ESM >= 3.2.0
|
|
60
|
+
- Lightning ≥ 2.5.0
|
|
61
|
+
- Typer ≥ 0.15.0
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
### CLI
|
|
68
|
+
|
|
69
|
+
# 1. Compute residue embeddings: Calculate residue level embeddings of protein structures using ESM3. Predictions are stored as torch tensor files.
|
|
70
|
+
inference residue-embedding --src-file data/structures.csv --output-path results/residue_embeddings --structure-format mmcif --batch-size 8 --devices auto
|
|
71
|
+
|
|
72
|
+
# 2. Compute structure embeddings: Calculate single-chain protein embeddings from structural files. Predictions are stored in a single pandas DataFrame file.
|
|
73
|
+
inference structure-embedding --src-file data/structures.csv --output-path results/residue_embeddings --out-df-name df-res-embeddings --batch-size 4 --devices 0 --devices 1
|
|
74
|
+
|
|
75
|
+
# 3. Compute chain embeddings: Calculate single-chain protein embeddings from residue level embeddings stored as torch tensor files. Predictions are stored as csv files.
|
|
76
|
+
inference chain-embedding --src-file data/structures.csv --output-path results/chain_embeddings --batch-size 4
|
|
77
|
+
|
|
78
|
+
# 4. Compute assembly embeddings: Calculate assembly embeddings from residue level embeddings stored as torch tensor files. Predictions are stored as csv files.
|
|
79
|
+
inference assembly-embedding --src-file data/structures.csv --res-embedding-location results/residue_embeddings --output-path results/assembly_embeddings
|
|
80
|
+
|
|
81
|
+
### Python API
|
|
82
|
+
|
|
83
|
+
from rcsb_embedding_model import RcsbStructureEmbedding
|
|
84
|
+
|
|
85
|
+
model = RcsbStructureEmbedding()
|
|
86
|
+
|
|
87
|
+
# Compute per-residue embeddings
|
|
88
|
+
res_emb = model.residue_embedding(
|
|
89
|
+
src_structure="examples/1abc.cif",
|
|
90
|
+
src_format="mmcif",
|
|
91
|
+
chain_id="A"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
# Aggregate to structure-level embedding
|
|
95
|
+
struct_emb = model.aggregator_embedding(res_emb)
|
|
96
|
+
|
|
97
|
+
See the examples and tests directories for more use cases.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Model Architecture
|
|
102
|
+
|
|
103
|
+
The embedding model is trained to predict structural similarity by approximating TM-scores using cosine distances between embeddings. It consists of two main components:
|
|
104
|
+
|
|
105
|
+
- **Protein Language Model (PLM)**: Computes residue-level embeddings from a given 3D structure.
|
|
106
|
+
- **Residue Embedding Aggregator**: A transformer-based neural network that aggregates these residue-level embeddings into a single vector.
|
|
107
|
+
|
|
108
|
+

|
|
109
|
+
|
|
110
|
+
### **Protein Language Model (PLM)**
|
|
111
|
+
Residue-wise embeddings of protein structures are computed using the [ESM3](https://www.evolutionaryscale.ai/) generative protein language model.
|
|
112
|
+
|
|
113
|
+
### **Residue Embedding Aggregator**
|
|
114
|
+
The aggregation component consists of six transformer encoder layers, each with a 3,072-neuron feedforward layer and ReLU activations. After processing through these layers, a summation pooling operation is applied, followed by 12 fully connected residual layers that refine the embeddings into a single 1,536-dimensional vector.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Development
|
|
119
|
+
|
|
120
|
+
git clone https://github.com/rcsb/rcsb-embedding-model.git
|
|
121
|
+
cd rcsb-embedding-model
|
|
122
|
+
pip install -e .
|
|
123
|
+
pytest
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Citation
|
|
128
|
+
|
|
129
|
+
Segura, J., Bittrich, S., et al. (2025). *Multi-scale structural similarity embedding search across entire proteomes*. bioRxiv. (Preprint: https://www.biorxiv.org/content/10.1101/2025.02.28.640875v1)
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
This project uses the EvolutionaryScale ESM-3 model and is distributed under the
|
|
136
|
+
[Cambrian Non-Commercial License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement).
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# RCSB Embedding Model
|
|
2
|
+
|
|
3
|
+
**Version** 0.0.44
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
RCSB Embedding Model is a neural network architecture designed to encode macromolecular 3D structures into fixed-length vector embeddings for efficient large-scale structure similarity search.
|
|
9
|
+
|
|
10
|
+
Preprint: [Multi-scale structural similarity embedding search across entire proteomes](https://www.biorxiv.org/content/10.1101/2025.02.28.640875v1).
|
|
11
|
+
|
|
12
|
+
A web-based implementation using this model for structure similarity search is available at [rcsb-embedding-search](http://embedding-search.rcsb.org).
|
|
13
|
+
|
|
14
|
+
If you are interested in training the model with a new dataset, visit the [rcsb-embedding-search repository](https://github.com/bioinsilico/rcsb-embedding-search), which provides scripts and documentation for training.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- **Residue-level embeddings** computed using the ESM3 protein language model
|
|
20
|
+
- **Structure-level embeddings** aggregated via a transformer-based aggregator network
|
|
21
|
+
- **Command-line interface** implemented with Typer for high-throughput inference workflows
|
|
22
|
+
- **Python API** for interactive embedding computation and integration into analysis pipelines
|
|
23
|
+
- **High-performance inference** leveraging PyTorch Lightning, with multi-node and multi-GPU support
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
pip install rcsb-embedding-model
|
|
30
|
+
|
|
31
|
+
**Requirements:**
|
|
32
|
+
|
|
33
|
+
- Python ≥ 3.11
|
|
34
|
+
- ESM >= 3.2.0
|
|
35
|
+
- Lightning ≥ 2.5.0
|
|
36
|
+
- Typer ≥ 0.15.0
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### CLI
|
|
43
|
+
|
|
44
|
+
# 1. Compute residue embeddings: Calculate residue level embeddings of protein structures using ESM3. Predictions are stored as torch tensor files.
|
|
45
|
+
inference residue-embedding --src-file data/structures.csv --output-path results/residue_embeddings --structure-format mmcif --batch-size 8 --devices auto
|
|
46
|
+
|
|
47
|
+
# 2. Compute structure embeddings: Calculate single-chain protein embeddings from structural files. Predictions are stored in a single pandas DataFrame file.
|
|
48
|
+
inference structure-embedding --src-file data/structures.csv --output-path results/residue_embeddings --out-df-name df-res-embeddings --batch-size 4 --devices 0 --devices 1
|
|
49
|
+
|
|
50
|
+
# 3. Compute chain embeddings: Calculate single-chain protein embeddings from residue level embeddings stored as torch tensor files. Predictions are stored as csv files.
|
|
51
|
+
inference chain-embedding --src-file data/structures.csv --output-path results/chain_embeddings --batch-size 4
|
|
52
|
+
|
|
53
|
+
# 4. Compute assembly embeddings: Calculate assembly embeddings from residue level embeddings stored as torch tensor files. Predictions are stored as csv files.
|
|
54
|
+
inference assembly-embedding --src-file data/structures.csv --res-embedding-location results/residue_embeddings --output-path results/assembly_embeddings
|
|
55
|
+
|
|
56
|
+
### Python API
|
|
57
|
+
|
|
58
|
+
from rcsb_embedding_model import RcsbStructureEmbedding
|
|
59
|
+
|
|
60
|
+
model = RcsbStructureEmbedding()
|
|
61
|
+
|
|
62
|
+
# Compute per-residue embeddings
|
|
63
|
+
res_emb = model.residue_embedding(
|
|
64
|
+
src_structure="examples/1abc.cif",
|
|
65
|
+
src_format="mmcif",
|
|
66
|
+
chain_id="A"
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Aggregate to structure-level embedding
|
|
70
|
+
struct_emb = model.aggregator_embedding(res_emb)
|
|
71
|
+
|
|
72
|
+
See the examples and tests directories for more use cases.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Model Architecture
|
|
77
|
+
|
|
78
|
+
The embedding model is trained to predict structural similarity by approximating TM-scores using cosine distances between embeddings. It consists of two main components:
|
|
79
|
+
|
|
80
|
+
- **Protein Language Model (PLM)**: Computes residue-level embeddings from a given 3D structure.
|
|
81
|
+
- **Residue Embedding Aggregator**: A transformer-based neural network that aggregates these residue-level embeddings into a single vector.
|
|
82
|
+
|
|
83
|
+

|
|
84
|
+
|
|
85
|
+
### **Protein Language Model (PLM)**
|
|
86
|
+
Residue-wise embeddings of protein structures are computed using the [ESM3](https://www.evolutionaryscale.ai/) generative protein language model.
|
|
87
|
+
|
|
88
|
+
### **Residue Embedding Aggregator**
|
|
89
|
+
The aggregation component consists of six transformer encoder layers, each with a 3,072-neuron feedforward layer and ReLU activations. After processing through these layers, a summation pooling operation is applied, followed by 12 fully connected residual layers that refine the embeddings into a single 1,536-dimensional vector.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Development
|
|
94
|
+
|
|
95
|
+
git clone https://github.com/rcsb/rcsb-embedding-model.git
|
|
96
|
+
cd rcsb-embedding-model
|
|
97
|
+
pip install -e .
|
|
98
|
+
pytest
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Citation
|
|
103
|
+
|
|
104
|
+
Segura, J., Bittrich, S., et al. (2025). *Multi-scale structural similarity embedding search across entire proteomes*. bioRxiv. (Preprint: https://www.biorxiv.org/content/10.1101/2025.02.28.640875v1)
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
This project uses the EvolutionaryScale ESM-3 model and is distributed under the
|
|
111
|
+
[Cambrian Non-Commercial License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement).
|
|
Binary file
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from rcsb_embedding_model import RcsbStructureEmbedding
|
|
4
|
+
|
|
5
|
+
if __name__ == "__main__":
|
|
6
|
+
|
|
7
|
+
parser = argparse.ArgumentParser()
|
|
8
|
+
parser.add_argument('--file', type=str, required=True)
|
|
9
|
+
parser.add_argument('--file_format', type=str)
|
|
10
|
+
parser.add_argument('--chain', type=str)
|
|
11
|
+
args = parser.parse_args()
|
|
12
|
+
|
|
13
|
+
model = RcsbStructureEmbedding()
|
|
14
|
+
res_embedding = model.residue_embedding(
|
|
15
|
+
src_structure=args.file,
|
|
16
|
+
src_format=args.file_format,
|
|
17
|
+
chain_id=args.chain
|
|
18
|
+
)
|
|
19
|
+
structure_embedding = model.aggregator_embedding(
|
|
20
|
+
res_embedding
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
print(res_embedding.shape, structure_embedding.shape)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rcsb-embedding-model"
|
|
3
|
+
version = "0.0.44"
|
|
4
|
+
authors = [
|
|
5
|
+
{ name="Joan Segura", email="joan.segura@rcsb.org" },
|
|
6
|
+
]
|
|
7
|
+
description = "Protein Embedding Model for Structure Search"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"Operating System :: OS Independent",
|
|
13
|
+
]
|
|
14
|
+
license = {file = "LICENSE.md"}
|
|
15
|
+
license-files = ["LICEN[CS]E*"]
|
|
16
|
+
dependencies=[
|
|
17
|
+
"importlib-metadata >= 8.7.0",
|
|
18
|
+
"esm >= 3.2.0",
|
|
19
|
+
"biotite >= 1.5.0",
|
|
20
|
+
"lightning >= 2.5.0",
|
|
21
|
+
"typer >= 0.15.0",
|
|
22
|
+
"hf-xet >= 1.1.10",
|
|
23
|
+
"huggingface-hub >= 0.30.2",
|
|
24
|
+
"httpx >= 0.28.1"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/rcsb/rcsb-embedding-model"
|
|
29
|
+
Issues = "https://github.com/rcsb/rcsb-embedding-model/issues"
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = [
|
|
33
|
+
"hatchling >= 1.14.1"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
build-backend = "hatchling.build"
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
inference = "rcsb_embedding_model.cli.inference:app"
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from importlib_metadata import version, PackageNotFoundError
|
|
2
|
+
from rcsb_embedding_model.rcsb_structure_embedding import RcsbStructureEmbedding
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
__version__ = version("rcsb-embedding-model")
|
|
6
|
+
except PackageNotFoundError:
|
|
7
|
+
__version__ = "0.0.0"
|
|
8
|
+
|
|
9
|
+
__all__ = ["RcsbStructureEmbedding", "__version__"]
|