rcsb-embedding-model 0.0.2__tar.gz → 0.0.4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rcsb-embedding-model might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb-embedding-model
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Protein Embedding Model for Structure Search
5
5
  Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
6
6
  Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -11,7 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Requires-Python: >=3.10
13
13
  Requires-Dist: esm>=3.2.0
14
- Requires-Dist: torch>=2.6.0
14
+ Requires-Dist: torch>=2.2.0
15
15
  Description-Content-Type: text/markdown
16
16
 
17
17
  # RCSB Embedding Model: A Deep Learning Approach for 3D Structure Embeddings
@@ -70,7 +70,7 @@ res_embedding = model.residue_embedding(
70
70
  )
71
71
  ```
72
72
 
73
- ### **Generating Protein Structure**
73
+ ### **Generating Protein Structure Embeddings**
74
74
  Protein 3D structure embedding can be calculated as:
75
75
 
76
76
  ```python
@@ -54,7 +54,7 @@ res_embedding = model.residue_embedding(
54
54
  )
55
55
  ```
56
56
 
57
- ### **Generating Protein Structure**
57
+ ### **Generating Protein Structure Embeddings**
58
58
  Protein 3D structure embedding can be calculated as:
59
59
 
60
60
  ```python
@@ -4,7 +4,7 @@ import torch
4
4
  from biotite.structure import chain_iter, get_residues, filter_amino_acids
5
5
  from biotite.structure.io.pdb import PDBFile
6
6
  from biotite.structure.io.pdbx import CIFFile, get_structure, BinaryCIFFile
7
- from esm.models.esm3 import ESM3, ESM3_OPEN_SMALL, ESM3InferenceClient
7
+ from esm.models.esm3 import ESM3, ESM3_OPEN_SMALL
8
8
  from esm.sdk.api import ESMProtein, SamplingConfig
9
9
  from esm.utils.structure.protein_chain import ProteinChain
10
10
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "rcsb-embedding-model"
3
- version = "0.0.2"
3
+ version = "0.0.4"
4
4
  authors = [
5
5
  { name="Joan Segura", email="joan.segura@rcsb.org" },
6
6
  ]
@@ -15,7 +15,7 @@ license = "BSD-3-Clause"
15
15
  license-files = ["LICEN[CS]E*"]
16
16
  dependencies=[
17
17
  "esm >= 3.2.0",
18
- "torch >= 2.6.0"
18
+ "torch >= 2.2.0"
19
19
  ]
20
20
  [project.urls]
21
21
  Homepage = "https://github.com/rcsb/rcsb-embedding-model"
@@ -29,10 +29,7 @@ class RcsbStructureEmbedding:
29
29
  def load_residue_embedding(self, device=None):
30
30
  if not device:
31
31
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
- self.__residue_embedding = ESM3.from_pretrained(
33
- ESM3_OPEN_SMALL,
34
- device
35
- )
32
+ self.__residue_embedding = _load_res_model(device)
36
33
 
37
34
  def load_aggregator_embedding(self, device=None):
38
35
  if not device:
@@ -69,6 +66,25 @@ class RcsbStructureEmbedding:
69
66
  dim=0
70
67
  )
71
68
 
69
+ def sequence_embedding(self, sequence):
70
+ self.__check_residue_embedding()
71
+
72
+ if sequence.startswith(">"):
73
+ sequence = "".join(line.strip() for line in sequence.splitlines() if not line.startswith(">"))
74
+
75
+ if len(sequence) < RcsbStructureEmbedding.MIN_RES:
76
+ raise ValueError(f"Sequence too short for embedding (min {RcsbStructureEmbedding.MIN_RES} residues)")
77
+
78
+ protein = ESMProtein(sequence=sequence)
79
+ protein_tensor = self.__residue_embedding.encode(protein)
80
+
81
+ result = self.__residue_embedding.forward_and_sample(
82
+ protein_tensor,
83
+ SamplingConfig(return_per_residue_embeddings=True)
84
+ )
85
+
86
+ return result.per_residue_embedding
87
+
72
88
  def aggregator_embedding(self, residue_embedding):
73
89
  self.__check_aggregator_embedding()
74
90
  return self.__aggregator_embedding(residue_embedding)
@@ -146,3 +162,10 @@ def _load_model(model_path, device=None):
146
162
  aggregator_model.to(device)
147
163
  aggregator_model.eval()
148
164
  return aggregator_model
165
+
166
+
167
+ def _load_res_model(device=None):
168
+ return ESM3.from_pretrained(
169
+ ESM3_OPEN_SMALL,
170
+ device
171
+ )
@@ -18,6 +18,16 @@ class TestEmbeddingModel(unittest.TestCase):
18
18
  )
19
19
  self.assertEqual(list(res_embedding.shape), [243, 1536])
20
20
 
21
+ def test_sequence_embedding(self):
22
+
23
+ model = RcsbStructureEmbedding()
24
+ res_embedding = model.sequence_embedding(
25
+ sequence="CGVPAIQPVLSGLSRIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSK"
26
+ "YNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYTNANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGAS"
27
+ "GVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN"
28
+ )
29
+ self.assertEqual(list(res_embedding.shape), [247, 1536])
30
+
21
31
  def test_aggregator_embedding(self):
22
32
 
23
33
  model = RcsbStructureEmbedding()