PyPI - rcsb-embedding-model - Versions diffs - 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl - Mend

rcsb-embedding-model 0.0.35 (py3-none-any.whl) → 0.0.37 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (22) hide show

rcsb_embedding_model/cli/inference.py CHANGED Viewed

@@ -6,7 +6,7 @@ import typer
 from rcsb_embedding_model import __version__
 from rcsb_embedding_model.cli.args_utils import arg_devices
 from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, SrcLocation, SrcProteinFrom, \
-    StructureLocation, SrcAssemblyFrom, SrcTensorFrom, OutFormat
+    SrcAssemblyFrom, SrcTensorFrom, OutFormat
 from rcsb_embedding_model.utils.data import adapt_csv_to_embedding_chain_stream
 import os
@@ -42,9 +42,6 @@ def residue_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        structure_location: Annotated[StructureLocation, typer.Option(
-            help='Structure file location.'
-        )] = StructureLocation.local,
         structure_format: Annotated[StructureFormat, typer.Option(
             help='Structure file format.'
         )] = StructureFormat.mmcif,
@@ -72,7 +69,6 @@ def residue_embedding(
         src_stream=src_file,
         src_location=SrcLocation.file,
         src_from=SrcProteinFrom.chain,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size,
@@ -108,9 +104,6 @@ def structure_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file.'
         )] = 'inference',
-        structure_location: Annotated[StructureLocation, typer.Option(
-            help='Structure file location.'
-        )] = StructureLocation.local,
         structure_format: Annotated[StructureFormat, typer.Option(
             help='Structure file format.'
         )] = StructureFormat.mmcif,
@@ -138,7 +131,6 @@ def structure_embedding(
         src_stream=src_file,
         src_location=SrcLocation.file,
         src_from=SrcProteinFrom.chain,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size,
@@ -183,9 +175,6 @@ def chain_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        structure_location: Annotated[StructureLocation, typer.Option(
-            help='Structure file location.'
-        )] = StructureLocation.local,
         structure_format: Annotated[StructureFormat, typer.Option(
             help='Structure file format.'
         )] = StructureFormat.mmcif,
@@ -214,7 +203,6 @@ def chain_embedding(
         res_embedding_location=res_embedding_location,
         src_location=SrcLocation.stream,
         src_from=SrcTensorFrom.file,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size,
@@ -259,9 +247,6 @@ def assembly_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        structure_location: Annotated[StructureLocation, typer.Option(
-            help='Structure file location.'
-        )] = StructureLocation.local,
         structure_format: Annotated[StructureFormat, typer.Option(
             help='Structure file format.'
         )] = StructureFormat.mmcif,
@@ -293,7 +278,6 @@ def assembly_embedding(
         res_embedding_location=res_embedding_location,
         src_location=SrcLocation.file,
         src_from=SrcAssemblyFrom.assembly,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         max_res_n=max_res_n,
@@ -356,9 +340,6 @@ def complete_embedding(
         output_assembly_name: Annotated[str, typer.Option(
             help='File name for storing chain embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'chain-inference',
-        structure_location: Annotated[StructureLocation, typer.Option(
-            help='Structure file location.'
-        )] = StructureLocation.local,
         structure_format: Annotated[StructureFormat, typer.Option(
             help='Structure file format.'
         )] = StructureFormat.mmcif,
@@ -397,7 +378,6 @@ def complete_embedding(
         src_file=src_chain_file,
         output_path=output_res_path,
         output_format=OutFormat.separated,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size_res,
@@ -412,7 +392,6 @@ def complete_embedding(
         output_format=output_format,
         output_name=output_chain_name,
         res_embedding_location=output_res_path,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size_chain,
@@ -427,7 +406,6 @@ def complete_embedding(
         output_format=output_format,
         output_name=output_assembly_name,
         res_embedding_location=output_res_path,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         batch_size=batch_size_assembly,

rcsb_embedding_model/dataset/esm_prot_from_chain.py CHANGED Viewed

@@ -9,6 +9,7 @@ from esm.utils.structure.protein_chain import ProteinChain
 from torch.utils.data import Dataset, DataLoader
 import pandas as pd
+from rcsb_embedding_model.dataset.untils import get_structure_location
 from rcsb_embedding_model.types.api_types import StructureFormat, StructureLocation, SrcLocation
 from rcsb_embedding_model.utils.data import stringio_from_url
 from rcsb_embedding_model.utils.structure_parser import rename_atom_attr,filter_residues
@@ -28,14 +29,12 @@ class EsmProtFromChain(Dataset):
         self,
         src_stream,
         src_location=SrcLocation.file,
-        structure_location=StructureLocation.local,
         structure_format=StructureFormat.mmcif,
         structure_provider=StructureProvider()
     ):
         super().__init__()
         self.__structure_provider = structure_provider
         self.src_location = src_location
-        self.structure_location = structure_location
         self.structure_format = structure_format
         self.data = pd.DataFrame()
         self.__load_stream(src_stream)
@@ -65,7 +64,7 @@ class EsmProtFromChain(Dataset):
         item_name = self.data.iloc[idx][EsmProtFromChain.ITEM_NAME_ATTR]
         structure = self.__structure_provider.get_structure(
             src_name=src_name,
-            src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
+            src_structure=stringio_from_url(src_structure) if get_structure_location(src_structure) == StructureLocation.remote else src_structure,
             structure_format=self.structure_format,
             chain_id=chain_id
         )
@@ -96,7 +95,6 @@ if __name__ == '__main__':
     dataset = EsmProtFromChain(
         src_stream=args.file_list,
         src_location=SrcLocation.file,
-        structure_location=StructureLocation.remote,
         structure_format=StructureFormat.bciff,
     )

rcsb_embedding_model/dataset/esm_prot_from_structure.py CHANGED Viewed

@@ -2,6 +2,7 @@
 import pandas as pd
 from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
+from rcsb_embedding_model.dataset.untils import get_structure_location
 from rcsb_embedding_model.types.api_types import StructureLocation, StructureFormat, SrcLocation
 from rcsb_embedding_model.utils.data import stringio_from_url
 from rcsb_embedding_model.utils.structure_parser import get_protein_chains
@@ -20,20 +21,17 @@ class EsmProtFromStructure(EsmProtFromChain):
             self,
             src_stream,
             src_location=SrcLocation.file,
-            structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
             structure_provider=StructureProvider()
     ):
         self.min_res_n = min_res_n
         self.src_location = src_location
-        self.structure_location = structure_location
         self.structure_format = structure_format
         self.__structure_provider = structure_provider
         super().__init__(
             src_stream=self.__get_chains(src_stream),
             src_location=SrcLocation.stream,
-            structure_location=structure_location,
             structure_format=structure_format,
             structure_provider=structure_provider
         )
@@ -58,7 +56,7 @@ class EsmProtFromStructure(EsmProtFromChain):
             item_name = row[EsmProtFromStructure.ITEM_NAME_ATTR]
             structure = self.__structure_provider.get_structure(
                 src_name=src_name,
-                src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
+                src_structure=stringio_from_url(src_structure) if get_structure_location(src_structure) == StructureLocation.remote else src_structure,
                 structure_format=self.structure_format
             )
             for ch in get_protein_chains(structure, self.min_res_n):

rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py CHANGED Viewed

@@ -3,6 +3,7 @@ import sys
 import pandas as pd
 from rcsb_embedding_model.dataset.residue_assembly_embedding_from_tensor_file import ResidueAssemblyEmbeddingFromTensorFile
+from rcsb_embedding_model.dataset.untils import get_structure_location
 from rcsb_embedding_model.types.api_types import SrcLocation, StructureLocation, StructureFormat
 from rcsb_embedding_model.utils.data import stringio_from_url
 from rcsb_embedding_model.utils.structure_parser import get_assemblies
@@ -22,14 +23,12 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
             src_stream,
             res_embedding_location,
             src_location=SrcLocation.file,
-            structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
             max_res_n=sys.maxsize,
             structure_provider=StructureProvider()
     ):
         self.src_location = src_location
-        self.structure_location = structure_location
         self.structure_format = structure_format
         self.min_res_n = min_res_n
         self.max_res_n = max_res_n
@@ -37,7 +36,6 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
             src_stream=self.__get_assemblies(src_stream),
             res_embedding_location=res_embedding_location,
             src_location=SrcLocation.stream,
-            structure_location=structure_location,
             structure_format=structure_format,
             min_res_n=min_res_n,
             max_res_n=max_res_n,
@@ -61,7 +59,7 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
         for idx, row in data.iterrows():
             src_name = row[ResidueAssemblyDatasetFromStructure.STREAM_NAME_ATTR]
             src_structure = row[ResidueAssemblyDatasetFromStructure.STREAM_ATTR]
-            structure = stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure
+            structure = stringio_from_url(src_structure) if get_structure_location(src_structure) == StructureLocation.remote else src_structure
             item_name = row[ResidueAssemblyDatasetFromStructure.ITEM_NAME_ATTR]
             for assembly_id in get_assemblies(structure=structure, structure_format=self.structure_format):
                 assemblies.append((src_name, src_structure, str(assembly_id), f"{item_name}-{assembly_id}"))

rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py CHANGED Viewed

@@ -4,6 +4,7 @@ import sys
 import pandas as pd
 from torch.utils.data import Dataset, DataLoader
+from rcsb_embedding_model.dataset.untils import get_structure_location
 from rcsb_embedding_model.types.api_types import StructureLocation, StructureFormat, SrcLocation
 from rcsb_embedding_model.utils.data import stringio_from_url, concatenate_tensors
 from rcsb_embedding_model.utils.structure_parser import get_protein_chains
@@ -24,7 +25,6 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
             src_stream,
             res_embedding_location,
             src_location=SrcLocation.file,
-            structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
             max_res_n=sys.maxsize,
@@ -33,7 +33,6 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
         super().__init__()
         self.res_embedding_location = res_embedding_location
         self.src_location = src_location
-        self.structure_location = structure_location
         self.structure_format = structure_format
         self.min_res_n = min_res_n
         self.max_res_n = max_res_n
@@ -65,7 +64,7 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
         item_name = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.ITEM_NAME_ATTR]
         structure = self.__structure_provider.get_structure(
             src_name=src_name,
-            src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
+            src_structure=stringio_from_url(src_structure) if get_structure_location(src_structure) == StructureLocation.remote else src_structure,
             structure_format=self.structure_format,
             assembly_id=assembly_id
         )
@@ -86,7 +85,6 @@ if __name__ == "__main__":
         src_stream=args.file_list,
         res_embedding_location=args.res_embeddings_path,
         src_location=SrcLocation.file,
-        structure_location=StructureLocation.remote,
         structure_format=StructureFormat.bciff
     )

rcsb_embedding_model/dataset/residue_embedding_from_structure.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os
 import pandas as pd
 from rcsb_embedding_model.dataset.residue_embedding_from_tensor_file import ResidueEmbeddingFromTensorFile
+from rcsb_embedding_model.dataset.untils import get_structure_location
 from rcsb_embedding_model.types.api_types import SrcLocation, StructureLocation, StructureFormat
 from rcsb_embedding_model.utils.data import stringio_from_url
 from rcsb_embedding_model.utils.structure_parser import get_protein_chains
@@ -22,7 +23,6 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
             src_stream,
             res_embedding_location,
             src_location=SrcLocation.file,
-            structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
             structure_provider=StructureProvider()
@@ -31,7 +31,6 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
             raise FileNotFoundError(f"Folder {res_embedding_location} does not exist")
         self.res_embedding_location = res_embedding_location
         self.src_location = src_location
-        self.structure_location = structure_location
         self.structure_format = structure_format
         self.min_res_n = min_res_n
         self.__structure_provider = structure_provider
@@ -60,7 +59,7 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
             item_name = row[ResidueEmbeddingFromStructure.ITEM_NAME_ATTR]
             structure = self.__structure_provider.get_structure(
                 src_name=src_name,
-                src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
+                src_structure=stringio_from_url(src_structure) if get_structure_location(src_structure) == StructureLocation.remote else src_structure,
                 structure_format=self.structure_format
             )
             for ch in get_protein_chains(structure, self.min_res_n):

rcsb_embedding_model/dataset/untils/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from rcsb_embedding_model.dataset.untils.utils import get_structure_location
+__all__ = ["get_structure_location"]

rcsb_embedding_model/dataset/untils/utils.py ADDED Viewed

@@ -0,0 +1,17 @@
+import os
+from urllib.parse import urlparse
+from rcsb_embedding_model.types.api_types import StructureLocation
+def get_structure_location(s: str) -> str:
+    # First, attempt to parse as URL
+    parsed = urlparse(s)
+    if parsed.scheme.lower() in {'http', 'https', 'ftp'} and parsed.netloc:
+        return StructureLocation.remote
+    # Next, test for an existing file or directory
+    if os.path.exists(s):
+        return StructureLocation.local
+    # Neither URL nor existing file
+    raise ValueError(f"Structure file source is neither a recognized URL nor file: {s!r}")

rcsb_embedding_model/inference/assembly_inferece.py CHANGED Viewed

@@ -3,7 +3,7 @@ import sys
 from rcsb_embedding_model.dataset.resdiue_assembly_embedding_from_structure import ResidueAssemblyDatasetFromStructure
 from rcsb_embedding_model.dataset.residue_assembly_embedding_from_tensor_file import ResidueAssemblyEmbeddingFromTensorFile
 from rcsb_embedding_model.types.api_types import FileOrStreamTuple, SrcLocation, Accelerator, Devices, OptionalPath, \
-    EmbeddingPath, StructureLocation, StructureFormat, SrcAssemblyFrom, OutFormat
+    EmbeddingPath, StructureFormat, SrcAssemblyFrom, OutFormat
 from rcsb_embedding_model.inference.chain_inference import predict as chain_predict
@@ -12,7 +12,6 @@ def predict(
         res_embedding_location: EmbeddingPath,
         src_location: SrcLocation = SrcLocation.file,
         src_from: SrcAssemblyFrom = SrcAssemblyFrom.assembly,
-        structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
         min_res_n: int = 0,
         max_res_n: int = sys.maxsize,
@@ -29,7 +28,6 @@ def predict(
         src_stream=src_stream,
         res_embedding_location=res_embedding_location,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         max_res_n=max_res_n
@@ -37,7 +35,6 @@ def predict(
         src_stream=src_stream,
         res_embedding_location=res_embedding_location,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
         max_res_n=max_res_n

rcsb_embedding_model/inference/chain_inference.py CHANGED Viewed

@@ -6,7 +6,7 @@ from rcsb_embedding_model.dataset.residue_embedding_from_structure import Residu
 from rcsb_embedding_model.dataset.residue_embedding_from_tensor_file import ResidueEmbeddingFromTensorFile
 from rcsb_embedding_model.modules.chain_module import ChainModule
 from rcsb_embedding_model.types.api_types import Accelerator, Devices, OptionalPath, FileOrStreamTuple, SrcLocation, \
-    SrcTensorFrom, StructureLocation, StructureFormat, OutFormat
+    SrcTensorFrom, StructureFormat, OutFormat
 from rcsb_embedding_model.utils.data import collate_seq_embeddings
 from rcsb_embedding_model.utils.model import get_aggregator_model
 from rcsb_embedding_model.writer.batch_writer import CsvBatchWriter, JsonStorage
@@ -17,7 +17,6 @@ def predict(
         res_embedding_location: OptionalPath = None,
         src_location: SrcLocation = SrcLocation.file,
         src_from: SrcTensorFrom = SrcTensorFrom.file,
-        structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
         min_res_n: int = 0,
         batch_size: int = 1,
@@ -39,7 +38,6 @@ def predict(
             src_stream=src_stream,
             res_embedding_location=res_embedding_location,
             src_location=src_location,
-            structure_location=structure_location,
             structure_format=structure_format,
             min_res_n=min_res_n
         )

rcsb_embedding_model/inference/esm_inference.py CHANGED Viewed

@@ -5,7 +5,7 @@ from lightning import Trainer
 from rcsb_embedding_model.dataset.esm_prot_from_structure import EsmProtFromStructure
 from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.esm_module import EsmModule
-from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
+from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, \
     SrcProteinFrom, FileOrStreamTuple, SrcLocation, OutFormat
 from rcsb_embedding_model.utils.model import get_residue_model
 from rcsb_embedding_model.writer.batch_writer import TensorBatchWriter, JsonStorage
@@ -15,7 +15,6 @@ def predict(
         src_stream: FileOrStreamTuple,
         src_location: SrcLocation = SrcLocation.file,
         src_from: SrcProteinFrom = SrcProteinFrom.chain,
-        structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
         min_res_n: int = 0,
         batch_size: int = 1,
@@ -31,12 +30,10 @@ def predict(
     inference_set = EsmProtFromChain(
         src_stream=src_stream,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format
     ) if src_from == SrcProteinFrom.chain else EsmProtFromStructure(
         src_stream=src_stream,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n
     )

rcsb_embedding_model/inference/structure_inference.py CHANGED Viewed

@@ -5,7 +5,7 @@ from lightning import Trainer
 from rcsb_embedding_model.dataset.esm_prot_from_structure import EsmProtFromStructure
 from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.structure_module import StructureModule
-from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
+from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, \
     SrcProteinFrom, FileOrStreamTuple, SrcLocation
 from rcsb_embedding_model.utils.model import get_residue_model, get_aggregator_model
 from rcsb_embedding_model.writer.batch_writer import JsonStorage
@@ -15,7 +15,6 @@ def predict(
         src_stream: FileOrStreamTuple,
         src_location: SrcLocation = SrcLocation.file,
         src_from: SrcProteinFrom = SrcProteinFrom.chain,
-        structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
         min_res_n: int = 0,
         batch_size: int = 1,
@@ -30,12 +29,10 @@ def predict(
     inference_set = EsmProtFromChain(
         src_stream=src_stream,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format
     ) if src_from == SrcProteinFrom.chain else EsmProtFromStructure(
         src_stream=src_stream,
         src_location=src_location,
-        structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n
     )

rcsb_embedding_model/utils/data.py CHANGED Viewed

@@ -95,4 +95,4 @@ def adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location):
     def __parse_row(row):
         r = row.split(",")
         return os.path.join(res_embedding_location, f"{r[0]}.{r[2]}.pt"), f"{r[0]}.{r[2]}"
-    return tuple([__parse_row(r.strip()) for r in open(src_file)])
+    return tuple([__parse_row(r.strip()) for r in open(src_file) if len(r.split(",")) > 2])

rcsb_embedding_model/utils/esm/loaders.py ADDED Viewed

@@ -0,0 +1,65 @@
+from pathlib import Path
+import torch
+from esm.models.esm3 import ESM3
+from esm.models.vqvae import StructureTokenEncoder
+from esm.tokenization import TokenizerCollection, EsmSequenceTokenizer, StructureTokenizer, SecondaryStructureTokenizer, \
+    SASADiscretizingTokenizer, InterProQuantizedTokenizer, ResidueAnnotationsTokenizer
+from huggingface_hub import  snapshot_download
+def data_root():
+    path = Path(snapshot_download(repo_id="rcsb/rcsb-esm"))
+    return path
+def structure_encoder(device: torch.device | str = "cpu"):
+    with torch.device(device):
+        model = StructureTokenEncoder(
+            d_model=1024, n_heads=1, v_heads=128, n_layers=2, d_out=128, n_codes=4096
+        ).eval()
+    state_dict = torch.load(
+        data_root() / "data/weights/esm3_structure_encoder_v0.pth", map_location=device
+    )
+    model.load_state_dict(state_dict)
+    return model
+def get_model_tokenizers():
+    class CustomAnnotationsTokenizer(ResidueAnnotationsTokenizer):
+        def __init__(self, csv_path: str | None = None, max_annotations: int = 16):
+            from esm.utils.constants import esm3 as C
+            super().__init__("none", max_annotations)
+            if csv_path is None:
+                csv_path = str(data_root() / C.RESID_CSV)
+            self.csv_path = csv_path
+    return TokenizerCollection(
+        sequence=EsmSequenceTokenizer(),
+        structure=StructureTokenizer(),
+        secondary_structure=SecondaryStructureTokenizer(kind="ss8"),
+        sasa=SASADiscretizingTokenizer(),
+        function=InterProQuantizedTokenizer(),
+        residue_annotations=CustomAnnotationsTokenizer(),
+    )
+def esm_open(device: torch.device | str = "cpu"):
+    with torch.device(device):
+        model = ESM3(
+            d_model=1536,
+            n_heads=24,
+            v_heads=256,
+            n_layers=48,
+            structure_encoder_fn=structure_encoder,
+            structure_decoder_fn=lambda x: x,
+            function_decoder_fn=lambda x: x,
+            tokenizers=get_model_tokenizers(),
+        ).eval()
+    state_dict = torch.load(
+        data_root() / "data/weights/esm3_sm_open_v1.pth", map_location=device
+    )
+    model.load_state_dict(state_dict)
+    return model

rcsb_embedding_model/utils/model.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import torch
-from esm.models.esm3 import ESM3
-from esm.utils.constants.models import ESM3_OPEN_SMALL
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download
 from rcsb_embedding_model.model.residue_embedding_aggregator import ResidueEmbeddingAggregator
+from rcsb_embedding_model.utils.esm.loaders import esm_open
 REPO_ID = "rcsb/rcsb-embedding-model"
 FILE_NAME = "rcsb-embedding-model.pt"
@@ -25,7 +24,5 @@ def get_aggregator_model(device=None):
 def get_residue_model(device=None):
-    return ESM3.from_pretrained(
-        ESM3_OPEN_SMALL,
-        device
-    )
+    return esm_open(device)

{rcsb_embedding_model-0.0.35.dist-info → rcsb_embedding_model-0.0.37.dist-info}/METADATA RENAMED Viewed

@@ -1,16 +1,23 @@
 Metadata-Version: 2.4
 Name: rcsb-embedding-model
-Version: 0.0.35
+Version: 0.0.37
 Summary: Protein Embedding Model for Structure Search
 Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
 Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
 Author-email: Joan Segura <joan.segura@rcsb.org>
-License-Expression: BSD-3-Clause
+License: # Cambrian Non-Commercial License Agreement
+        This project is licensed under the EvolutionaryScale Cambrian Non-Commercial License Agreement.
+        See: https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement
 License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.10
+Requires-Python: >=3.11
+Requires-Dist: biotite>=1.5.0
 Requires-Dist: esm>=3.2.0
+Requires-Dist: hf-xet>=1.1.10
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: huggingface-hub>=0.30.2
 Requires-Dist: importlib-metadata>=8.7.0
 Requires-Dist: lightning>=2.5.0
 Requires-Dist: typer>=0.15.0
@@ -18,7 +25,7 @@ Description-Content-Type: text/markdown
 # RCSB Embedding Model
-**Version** 0.0.26
+**Version** 0.0.37
 ## Overview
@@ -125,4 +132,5 @@ Segura, J., Bittrich, S., et al. (2024). *Multi-scale structural similarity embe
 ## License
-This project is licensed under the BSD 3-Clause License. See [LICENSE.md](LICENSE.md) for details.
+This project uses the EvolutionaryScale ESM-3 model and is distributed under the
+[Cambrian Non-Commercial License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement).

rcsb_embedding_model-0.0.37.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,33 @@
+rcsb_embedding_model/__init__.py,sha256=7YfYO-V-u__19eAZfQ3t5Gf2qrhd_gwQB8rHO0J0puw,306
+rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
+rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
+rcsb_embedding_model/cli/inference.py,sha256=cXYaais4A3rVAkiucMdJxrYVxezKti8hL3DogBU0_2c,18788
+rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=_DYWLDEc492nhUdFRAQjwh0romF9iMwydFNi43-r0TY,4345
+rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=VU9BxNUApZ-pus_vmFGEU4eplcCH0fO7KBdic6X_NOM,2546
+rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=9iO7ZUcxl0TIBiwNieqjZFfnM7-7V3pl5abYiLzIY0I,2794
+rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=6bMjb0hfNbrTOqstnUVHbegw0xeUo7s6INnRsvP7V3I,3663
+rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=tFHiXqGceZjAoYfVkeXG3sa2mz0gd5XBfm9EpJswcWI,2830
+rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=4OPaw55yGKHjY2iPpCnemcfwfmTZ4j5VrGQ2oIMQw6A,1343
+rcsb_embedding_model/dataset/untils/__init__.py,sha256=O3WOukwvaKJvHUTALD3eYNHRacJo8o5BW7-ZulLZ65g,116
+rcsb_embedding_model/dataset/untils/utils.py,sha256=SPiQ9aO2WLictO4R2JiNlo2ChhlANNMeIhbN0kq11kQ,578
+rcsb_embedding_model/inference/assembly_inferece.py,sha256=b-mAfOJOO-s6gilOedZpaM90OTbhm_RQVqh2zKFG4dQ,2143
+rcsb_embedding_model/inference/chain_inference.py,sha256=0HkV4EnLwg4ttQhf-xwOuSksZwEYDEChnHU4_A0xUXM,2782
+rcsb_embedding_model/inference/esm_inference.py,sha256=nmHJYfSGjEqRPgb3l9s5fqtlyzdbAsiPz-OxHXBTgcI,2360
+rcsb_embedding_model/inference/structure_inference.py,sha256=b44mY7VcCbjbtB35Mi9EhZoM18yyMaF579MKmzwB564,2405
+rcsb_embedding_model/model/layers.py,sha256=lhKaWC4gTS_T5lHOP0mgnnP8nKTPEOm4MrjhESA4hE8,743
+rcsb_embedding_model/model/residue_embedding_aggregator.py,sha256=k3UW63Ax8DtjCMdD3O5xNxtyAu28l2n3-Ab6nS0atm0,1967
+rcsb_embedding_model/modules/chain_module.py,sha256=KsZw2uagO4rpAKWv6ivqEMxIEzgtfQFliHV_vX8kqtc,435
+rcsb_embedding_model/modules/esm_module.py,sha256=otJRbCb319nCCob_4E1W_UClhkex9eDqcCyzWQO-vIs,740
+rcsb_embedding_model/modules/structure_module.py,sha256=4js02XzKvhc_G26ELsGhJ9SCi_wlvtVolObxfWt3BhE,1077
+rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
+rcsb_embedding_model/utils/data.py,sha256=p7sbskLPBFtpZ-XM18wFY5Kei02Xso4wTWYTqHxJvVw,3841
+rcsb_embedding_model/utils/model.py,sha256=Xi6bSUsB2-IsQS9610gXnbAEvYlK2V7eJC-cDE-JBTA,875
+rcsb_embedding_model/utils/structure_parser.py,sha256=fSIbq_a_aEigCWY_1dUcW9d9Law0ZDOcZAxJlZL0Rt8,3377
+rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
+rcsb_embedding_model/utils/esm/loaders.py,sha256=V7CADr7RReoztYmBQb2tjA8RBQIwFEjxBcocKAB_ea4,2221
+rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
+rcsb_embedding_model-0.0.37.dist-info/METADATA,sha256=s_as4M_J_P6Pkcca8eWsRSx7FHBAV1Z-PMI_rnhFZ0A,5820
+rcsb_embedding_model-0.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+rcsb_embedding_model-0.0.37.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
+rcsb_embedding_model-0.0.37.dist-info/licenses/LICENSE.md,sha256=XyzxQe9PLJQlOmOOrqwmBaAfo0PAenOQ5NsgnApuVH4,230
+rcsb_embedding_model-0.0.37.dist-info/RECORD,,

rcsb_embedding_model-0.0.37.dist-info/licenses/LICENSE.md ADDED Viewed

@@ -0,0 +1,4 @@
+# Cambrian Non-Commercial License Agreement
+This project is licensed under the EvolutionaryScale Cambrian Non-Commercial License Agreement.
+See: https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement

rcsb_embedding_model-0.0.35.dist-info/RECORD DELETED Viewed

@@ -1,30 +0,0 @@
-rcsb_embedding_model/__init__.py,sha256=7YfYO-V-u__19eAZfQ3t5Gf2qrhd_gwQB8rHO0J0puw,306
-rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
-rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
-rcsb_embedding_model/cli/inference.py,sha256=67_Tr3LWeA3T4KS5mkjq6tw77Ypy0R8IwMxEG2FwVqQ,19901
-rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=sLbBapgchxciq4RgwHkw9yoNokGlOv2Z5PSaiWV5G64,4418
-rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=3HzXCCc-UqmZNbJaeXHyUsSIZZxMc2erbxAPGIxSmfE,2621
-rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=69h1VkrIXesHZi1cG3BOMMytSDeRzcBBP0_Z3Xz3dM8,2869
-rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=Hd9oH-IVgY6d7Dxy5VfiwHvSaK-Wwhk6ccUBgOwl0TU,3740
-rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=1jmeEcCK41cAi2ZnqQkd667NWCAIGS3k6jGDF-WxtTk,2854
-rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=4OPaw55yGKHjY2iPpCnemcfwfmTZ4j5VrGQ2oIMQw6A,1343
-rcsb_embedding_model/inference/assembly_inferece.py,sha256=8fPJjEXy1WsM5XB5U7KfdO5-Du6nEsawsaAjmWoXA9I,2329
-rcsb_embedding_model/inference/chain_inference.py,sha256=6f5wVzjtRtHU3BPMTe5k3nH_Nl440Am8BL8h1vmK1jI,2925
-rcsb_embedding_model/inference/esm_inference.py,sha256=rn6H43D8BYzMZbMu7UPsLYg2dgERmmpci5weNItrG5Q,2546
-rcsb_embedding_model/inference/structure_inference.py,sha256=0wqCW5wee_UQ8WJo9KG6SBHmosdNRzoJYEm7rMn4veA,2591
-rcsb_embedding_model/model/layers.py,sha256=lhKaWC4gTS_T5lHOP0mgnnP8nKTPEOm4MrjhESA4hE8,743
-rcsb_embedding_model/model/residue_embedding_aggregator.py,sha256=k3UW63Ax8DtjCMdD3O5xNxtyAu28l2n3-Ab6nS0atm0,1967
-rcsb_embedding_model/modules/chain_module.py,sha256=KsZw2uagO4rpAKWv6ivqEMxIEzgtfQFliHV_vX8kqtc,435
-rcsb_embedding_model/modules/esm_module.py,sha256=otJRbCb319nCCob_4E1W_UClhkex9eDqcCyzWQO-vIs,740
-rcsb_embedding_model/modules/structure_module.py,sha256=4js02XzKvhc_G26ELsGhJ9SCi_wlvtVolObxfWt3BhE,1077
-rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
-rcsb_embedding_model/utils/data.py,sha256=ThrcYycIizsV_Ycn6PPxF12JRr1m2K-v8TsIaVqx10A,3816
-rcsb_embedding_model/utils/model.py,sha256=xr3p02ohOgJ5UInwdIupN68Oq4yvNFhxobZRacS1adg,953
-rcsb_embedding_model/utils/structure_parser.py,sha256=fSIbq_a_aEigCWY_1dUcW9d9Law0ZDOcZAxJlZL0Rt8,3377
-rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
-rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
-rcsb_embedding_model-0.0.35.dist-info/METADATA,sha256=h5uREe5bIKpY4o-ZUzXF9tObRb_eewm9GJ44vgijdig,5351
-rcsb_embedding_model-0.0.35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-rcsb_embedding_model-0.0.35.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
-rcsb_embedding_model-0.0.35.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
-rcsb_embedding_model-0.0.35.dist-info/RECORD,,

rcsb_embedding_model-0.0.35.dist-info/licenses/LICENSE.md DELETED Viewed

@@ -1,28 +0,0 @@
-BSD 3-Clause License
-Copyright (c) 2024, RCSB Protein Data Bank, UC San Diego
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-3. Neither the name of the copyright holder nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

{rcsb_embedding_model-0.0.35.dist-info → rcsb_embedding_model-0.0.37.dist-info}/WHEEL RENAMED Viewed

File without changes

{rcsb_embedding_model-0.0.35.dist-info → rcsb_embedding_model-0.0.37.dist-info}/entry_points.txt RENAMED Viewed

File without changes

rcsb-embedding-model 0.0.35-py3-none-any.whl → 0.0.37-py3-none-any.whl

Potentially problematic release.

rcsb-embedding-model 0.0.35-py3-none-any.whl → 0.0.37-py3-none-any.whl