rcsb-embedding-model 0.0.36__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rcsb-embedding-model might be problematic. Click here for more details.

@@ -95,4 +95,4 @@ def adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location):
95
95
  def __parse_row(row):
96
96
  r = row.split(",")
97
97
  return os.path.join(res_embedding_location, f"{r[0]}.{r[2]}.pt"), f"{r[0]}.{r[2]}"
98
- return tuple([__parse_row(r.strip()) for r in open(src_file)])
98
+ return tuple([__parse_row(r.strip()) for r in open(src_file) if len(r.split(",")) > 2])
@@ -0,0 +1,65 @@
1
+ from pathlib import Path
2
+ import torch
3
+
4
+ from esm.models.esm3 import ESM3
5
+ from esm.models.vqvae import StructureTokenEncoder
6
+ from esm.tokenization import TokenizerCollection, EsmSequenceTokenizer, StructureTokenizer, SecondaryStructureTokenizer, \
7
+ SASADiscretizingTokenizer, InterProQuantizedTokenizer, ResidueAnnotationsTokenizer
8
+
9
+ from huggingface_hub import snapshot_download
10
+
11
+ def data_root():
12
+ path = Path(snapshot_download(repo_id="rcsb/rcsb-esm"))
13
+ return path
14
+
15
+
16
+ def structure_encoder(device: torch.device | str = "cpu"):
17
+ with torch.device(device):
18
+ model = StructureTokenEncoder(
19
+ d_model=1024, n_heads=1, v_heads=128, n_layers=2, d_out=128, n_codes=4096
20
+ ).eval()
21
+ state_dict = torch.load(
22
+ data_root() / "data/weights/esm3_structure_encoder_v0.pth", map_location=device
23
+ )
24
+ model.load_state_dict(state_dict)
25
+ return model
26
+
27
+
28
+
29
+ def get_model_tokenizers():
30
+
31
+ class CustomAnnotationsTokenizer(ResidueAnnotationsTokenizer):
32
+ def __init__(self, csv_path: str | None = None, max_annotations: int = 16):
33
+ from esm.utils.constants import esm3 as C
34
+ super().__init__("none", max_annotations)
35
+ if csv_path is None:
36
+ csv_path = str(data_root() / C.RESID_CSV)
37
+ self.csv_path = csv_path
38
+
39
+ return TokenizerCollection(
40
+ sequence=EsmSequenceTokenizer(),
41
+ structure=StructureTokenizer(),
42
+ secondary_structure=SecondaryStructureTokenizer(kind="ss8"),
43
+ sasa=SASADiscretizingTokenizer(),
44
+ function=InterProQuantizedTokenizer(),
45
+ residue_annotations=CustomAnnotationsTokenizer(),
46
+ )
47
+
48
+
49
+ def esm_open(device: torch.device | str = "cpu"):
50
+ with torch.device(device):
51
+ model = ESM3(
52
+ d_model=1536,
53
+ n_heads=24,
54
+ v_heads=256,
55
+ n_layers=48,
56
+ structure_encoder_fn=structure_encoder,
57
+ structure_decoder_fn=lambda x: x,
58
+ function_decoder_fn=lambda x: x,
59
+ tokenizers=get_model_tokenizers(),
60
+ ).eval()
61
+ state_dict = torch.load(
62
+ data_root() / "data/weights/esm3_sm_open_v1.pth", map_location=device
63
+ )
64
+ model.load_state_dict(state_dict)
65
+ return model
@@ -1,9 +1,8 @@
1
1
  import torch
2
- from esm.models.esm3 import ESM3
3
- from esm.utils.constants.models import ESM3_OPEN_SMALL
4
- from huggingface_hub import hf_hub_download
5
2
 
3
+ from huggingface_hub import hf_hub_download
6
4
  from rcsb_embedding_model.model.residue_embedding_aggregator import ResidueEmbeddingAggregator
5
+ from rcsb_embedding_model.utils.esm.loaders import esm_open
7
6
 
8
7
  REPO_ID = "rcsb/rcsb-embedding-model"
9
8
  FILE_NAME = "rcsb-embedding-model.pt"
@@ -25,7 +24,5 @@ def get_aggregator_model(device=None):
25
24
 
26
25
 
27
26
  def get_residue_model(device=None):
28
- return ESM3.from_pretrained(
29
- ESM3_OPEN_SMALL,
30
- device
31
- )
27
+ return esm_open(device)
28
+
@@ -1,16 +1,23 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb-embedding-model
3
- Version: 0.0.36
3
+ Version: 0.0.38
4
4
  Summary: Protein Embedding Model for Structure Search
5
5
  Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
6
6
  Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
7
7
  Author-email: Joan Segura <joan.segura@rcsb.org>
8
- License-Expression: BSD-3-Clause
8
+ License: # Cambrian Non-Commercial License Agreement
9
+
10
+ This project is licensed under the EvolutionaryScale Cambrian Non-Commercial License Agreement.
11
+ See: https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement
9
12
  License-File: LICENSE.md
10
13
  Classifier: Operating System :: OS Independent
11
14
  Classifier: Programming Language :: Python :: 3
12
- Requires-Python: >=3.10
15
+ Requires-Python: >=3.11
16
+ Requires-Dist: biotite>=1.5.0
13
17
  Requires-Dist: esm>=3.2.0
18
+ Requires-Dist: hf-xet>=1.1.10
19
+ Requires-Dist: httpx>=0.28.1
20
+ Requires-Dist: huggingface-hub>=0.30.2
14
21
  Requires-Dist: importlib-metadata>=8.7.0
15
22
  Requires-Dist: lightning>=2.5.0
16
23
  Requires-Dist: typer>=0.15.0
@@ -18,7 +25,7 @@ Description-Content-Type: text/markdown
18
25
 
19
26
  # RCSB Embedding Model
20
27
 
21
- **Version** 0.0.26
28
+ **Version** 0.0.38
22
29
 
23
30
 
24
31
  ## Overview
@@ -125,4 +132,5 @@ Segura, J., Bittrich, S., et al. (2024). *Multi-scale structural similarity embe
125
132
 
126
133
  ## License
127
134
 
128
- This project is licensed under the BSD 3-Clause License. See [LICENSE.md](LICENSE.md) for details.
135
+ This project uses the EvolutionaryScale ESM-3 model and is distributed under the
136
+ [Cambrian Non-Commercial License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement).
@@ -20,13 +20,14 @@ rcsb_embedding_model/modules/chain_module.py,sha256=KsZw2uagO4rpAKWv6ivqEMxIEzgt
20
20
  rcsb_embedding_model/modules/esm_module.py,sha256=otJRbCb319nCCob_4E1W_UClhkex9eDqcCyzWQO-vIs,740
21
21
  rcsb_embedding_model/modules/structure_module.py,sha256=4js02XzKvhc_G26ELsGhJ9SCi_wlvtVolObxfWt3BhE,1077
22
22
  rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
23
- rcsb_embedding_model/utils/data.py,sha256=ThrcYycIizsV_Ycn6PPxF12JRr1m2K-v8TsIaVqx10A,3816
24
- rcsb_embedding_model/utils/model.py,sha256=xr3p02ohOgJ5UInwdIupN68Oq4yvNFhxobZRacS1adg,953
23
+ rcsb_embedding_model/utils/data.py,sha256=p7sbskLPBFtpZ-XM18wFY5Kei02Xso4wTWYTqHxJvVw,3841
24
+ rcsb_embedding_model/utils/model.py,sha256=Xi6bSUsB2-IsQS9610gXnbAEvYlK2V7eJC-cDE-JBTA,875
25
25
  rcsb_embedding_model/utils/structure_parser.py,sha256=fSIbq_a_aEigCWY_1dUcW9d9Law0ZDOcZAxJlZL0Rt8,3377
26
26
  rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
27
+ rcsb_embedding_model/utils/esm/loaders.py,sha256=V7CADr7RReoztYmBQb2tjA8RBQIwFEjxBcocKAB_ea4,2221
27
28
  rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
28
- rcsb_embedding_model-0.0.36.dist-info/METADATA,sha256=spFNxlrrwMORe5Su0559-997by2cgkuk9-yEQlhew60,5351
29
- rcsb_embedding_model-0.0.36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
30
- rcsb_embedding_model-0.0.36.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
31
- rcsb_embedding_model-0.0.36.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
32
- rcsb_embedding_model-0.0.36.dist-info/RECORD,,
29
+ rcsb_embedding_model-0.0.38.dist-info/METADATA,sha256=uBAVCA8bTzGUza-i1Dr6_WpDPVCSZam4pvqg_ZT37ms,5820
30
+ rcsb_embedding_model-0.0.38.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
31
+ rcsb_embedding_model-0.0.38.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
32
+ rcsb_embedding_model-0.0.38.dist-info/licenses/LICENSE.md,sha256=XyzxQe9PLJQlOmOOrqwmBaAfo0PAenOQ5NsgnApuVH4,230
33
+ rcsb_embedding_model-0.0.38.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ # Cambrian Non-Commercial License Agreement
2
+
3
+ This project is licensed under the EvolutionaryScale Cambrian Non-Commercial License Agreement.
4
+ See: https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement
@@ -1,28 +0,0 @@
1
- BSD 3-Clause License
2
-
3
- Copyright (c) 2024, RCSB Protein Data Bank, UC San Diego
4
-
5
- Redistribution and use in source and binary forms, with or without
6
- modification, are permitted provided that the following conditions are met:
7
-
8
- 1. Redistributions of source code must retain the above copyright notice, this
9
- list of conditions and the following disclaimer.
10
-
11
- 2. Redistributions in binary form must reproduce the above copyright notice,
12
- this list of conditions and the following disclaimer in the documentation
13
- and/or other materials provided with the distribution.
14
-
15
- 3. Neither the name of the copyright holder nor the names of its
16
- contributors may be used to endorse or promote products derived from
17
- this software without specific prior written permission.
18
-
19
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.