rcsb-embedding-model 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rcsb-embedding-model might be problematic. Click here for more details.
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/PKG-INFO +3 -3
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/README.md +1 -1
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/examples/esm_embeddings.py +1 -1
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/pyproject.toml +2 -2
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/rcsb_structure_embedding.py +27 -4
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/tests/test_model.py +10 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/.gitignore +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/LICENSE.md +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/assets/embedding-model-architecture.png +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/__init__.py +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/model/layers.py +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/model/residue_embedding_aggregator.py +0 -0
- {rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/tests/resources/1acb.cif +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb-embedding-model
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Protein Embedding Model for Structure Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
|
|
6
6
|
Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
|
|
@@ -11,7 +11,7 @@ Classifier: Operating System :: OS Independent
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Requires-Python: >=3.10
|
|
13
13
|
Requires-Dist: esm>=3.2.0
|
|
14
|
-
Requires-Dist: torch>=2.
|
|
14
|
+
Requires-Dist: torch>=2.2.0
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
|
|
17
17
|
# RCSB Embedding Model: A Deep Learning Approach for 3D Structure Embeddings
|
|
@@ -70,7 +70,7 @@ res_embedding = model.residue_embedding(
|
|
|
70
70
|
)
|
|
71
71
|
```
|
|
72
72
|
|
|
73
|
-
### **Generating Protein Structure**
|
|
73
|
+
### **Generating Protein Structure Embeddings**
|
|
74
74
|
Protein 3D structure embedding can be calculated as:
|
|
75
75
|
|
|
76
76
|
```python
|
|
@@ -4,7 +4,7 @@ import torch
|
|
|
4
4
|
from biotite.structure import chain_iter, get_residues, filter_amino_acids
|
|
5
5
|
from biotite.structure.io.pdb import PDBFile
|
|
6
6
|
from biotite.structure.io.pdbx import CIFFile, get_structure, BinaryCIFFile
|
|
7
|
-
from esm.models.esm3 import ESM3, ESM3_OPEN_SMALL
|
|
7
|
+
from esm.models.esm3 import ESM3, ESM3_OPEN_SMALL
|
|
8
8
|
from esm.sdk.api import ESMProtein, SamplingConfig
|
|
9
9
|
from esm.utils.structure.protein_chain import ProteinChain
|
|
10
10
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "rcsb-embedding-model"
|
|
3
|
-
version = "0.0.2"
|
|
3
|
+
version = "0.0.4"
|
|
4
4
|
authors = [
|
|
5
5
|
{ name="Joan Segura", email="joan.segura@rcsb.org" },
|
|
6
6
|
]
|
|
@@ -15,7 +15,7 @@ license = "BSD-3-Clause"
|
|
|
15
15
|
license-files = ["LICEN[CS]E*"]
|
|
16
16
|
dependencies=[
|
|
17
17
|
"esm >= 3.2.0",
|
|
18
|
-
"torch >= 2.
|
|
18
|
+
"torch >= 2.2.0"
|
|
19
19
|
]
|
|
20
20
|
[project.urls]
|
|
21
21
|
Homepage = "https://github.com/rcsb/rcsb-embedding-model"
|
|
@@ -29,10 +29,7 @@ class RcsbStructureEmbedding:
|
|
|
29
29
|
def load_residue_embedding(self, device=None):
|
|
30
30
|
if not device:
|
|
31
31
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
32
|
-
self.__residue_embedding = ESM3.from_pretrained(
|
|
33
|
-
ESM3_OPEN_SMALL,
|
|
34
|
-
device
|
|
35
|
-
)
|
|
32
|
+
self.__residue_embedding = _load_res_model(device)
|
|
36
33
|
|
|
37
34
|
def load_aggregator_embedding(self, device=None):
|
|
38
35
|
if not device:
|
|
@@ -69,6 +66,25 @@ class RcsbStructureEmbedding:
|
|
|
69
66
|
dim=0
|
|
70
67
|
)
|
|
71
68
|
|
|
69
|
+
def sequence_embedding(self, sequence):
|
|
70
|
+
self.__check_residue_embedding()
|
|
71
|
+
|
|
72
|
+
if sequence.startswith(">"):
|
|
73
|
+
sequence = "".join(line.strip() for line in sequence.splitlines() if not line.startswith(">"))
|
|
74
|
+
|
|
75
|
+
if len(sequence) < RcsbStructureEmbedding.MIN_RES:
|
|
76
|
+
raise ValueError(f"Sequence too short for embedding (min {RcsbStructureEmbedding.MIN_RES} residues)")
|
|
77
|
+
|
|
78
|
+
protein = ESMProtein(sequence=sequence)
|
|
79
|
+
protein_tensor = self.__residue_embedding.encode(protein)
|
|
80
|
+
|
|
81
|
+
result = self.__residue_embedding.forward_and_sample(
|
|
82
|
+
protein_tensor,
|
|
83
|
+
SamplingConfig(return_per_residue_embeddings=True)
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
return result.per_residue_embedding
|
|
87
|
+
|
|
72
88
|
def aggregator_embedding(self, residue_embedding):
|
|
73
89
|
self.__check_aggregator_embedding()
|
|
74
90
|
return self.__aggregator_embedding(residue_embedding)
|
|
@@ -146,3 +162,10 @@ def _load_model(model_path, device=None):
|
|
|
146
162
|
aggregator_model.to(device)
|
|
147
163
|
aggregator_model.eval()
|
|
148
164
|
return aggregator_model
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _load_res_model(device=None):
|
|
168
|
+
return ESM3.from_pretrained(
|
|
169
|
+
ESM3_OPEN_SMALL,
|
|
170
|
+
device
|
|
171
|
+
)
|
|
@@ -18,6 +18,16 @@ class TestEmbeddingModel(unittest.TestCase):
|
|
|
18
18
|
)
|
|
19
19
|
self.assertEqual(list(res_embedding.shape), [243, 1536])
|
|
20
20
|
|
|
21
|
+
def test_sequence_embedding(self):
|
|
22
|
+
|
|
23
|
+
model = RcsbStructureEmbedding()
|
|
24
|
+
res_embedding = model.sequence_embedding(
|
|
25
|
+
sequence="CGVPAIQPVLSGLSRIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSK"
|
|
26
|
+
"YNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYTNANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGAS"
|
|
27
|
+
"GVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN"
|
|
28
|
+
)
|
|
29
|
+
self.assertEqual(list(res_embedding.shape), [247, 1536])
|
|
30
|
+
|
|
21
31
|
def test_aggregator_embedding(self):
|
|
22
32
|
|
|
23
33
|
model = RcsbStructureEmbedding()
|
|
File without changes
|
|
File without changes
|
{rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/assets/embedding-model-architecture.png
RENAMED
|
File without changes
|
{rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/__init__.py
RENAMED
|
File without changes
|
{rcsb_embedding_model-0.0.2 → rcsb_embedding_model-0.0.4}/src/rcsb_embedding_model/model/layers.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|