PyPI - rcsb-embedding-model - Versions diffs - 0.0.20__tar.gz → 0.0.22__tar.gz - Mend

rcsb-embedding-model 0.0.20.tar.gz → 0.0.22.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (46)

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rcsb-embedding-model
-Version: 0.0.20
+Version: 0.0.22
 Summary: Protein Embedding Model for Structure Search
 Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
 Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -17,7 +17,7 @@ Description-Content-Type: text/markdown
 # RCSB Embedding Model
-**Version** 0.0.20
+**Version** 0.0.22
 ## Overview
@@ -48,7 +48,7 @@ If you are interested in training the model with a new dataset, visit the [rcsb-
 **Requirements:**
 - Python ≥ 3.10
-- ESM == 3.1.1
+- ESM >= 3.2.0
 - Lightning ≥ 2.5.0
 - Typer ≥ 0.15.0

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
 # RCSB Embedding Model
-**Version** 0.0.20
+**Version** 0.0.22
 ## Overview
@@ -31,7 +31,7 @@ If you are interested in training the model with a new dataset, visit the [rcsb-
 **Requirements:**
 - Python ≥ 3.10
-- ESM == 3.1.1
+- ESM >= 3.2.0
 - Lightning ≥ 2.5.0
 - Typer ≥ 0.15.0

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "rcsb-embedding-model"
-version = "0.0.20"
+version = "0.0.22"
 authors = [
     { name="Joan Segura", email="joan.segura@rcsb.org" },
 ]

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/cli/inference.py RENAMED Viewed

@@ -8,6 +8,9 @@ from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, S
     StructureLocation, SrcAssemblyFrom, SrcTensorFrom, OutFormat
 from rcsb_embedding_model.utils.data import adapt_csv_to_embedding_chain_stream
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 app = typer.Typer(
     add_completion=False
 )
@@ -322,26 +325,36 @@ def complete_embedding(
             resolve_path=True,
             help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Assembly Id | Output embedding name.'
         )],
-        output_path: Annotated[typer.FileText, typer.Option(
+        output_res_path: Annotated[typer.FileText, typer.Option(
             exists=True,
             file_okay=False,
             dir_okay=True,
             resolve_path=True,
-            help='Output path to store predictions. Embeddings are stored as a single DataFrame file (see output_name).'
+            help='Output path to store residue embeddings. Residue embeddings are stored in separated files'
         )],
-        res_embedding_location: Annotated[typer.FileText, typer.Option(
+        output_chain_path: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=False,
+            dir_okay=True,
+            resolve_path=True,
+            help='Output path to store chain embeddings. Embeddings are stored as a single JSON file (see output_chain_name).'
+        )],
+        output_assembly_path: Annotated[typer.FileText, typer.Option(
             exists=True,
             file_okay=False,
             dir_okay=True,
             resolve_path=True,
-            help='Output path to store ESM predictions.'
+            help='Output path to store assembly embeddings. Embeddings are stored as a single JSON file (see output_assembly_name).'
         )],
         output_format: Annotated[OutFormat, typer.Option(
             help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
         )] = OutFormat.separated,
-        output_name: Annotated[str, typer.Option(
-            help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
-        )] = 'inference',
+        output_chain_name: Annotated[str, typer.Option(
+            help='File name for storing chain embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'chain-inference',
+        output_assembly_name: Annotated[str, typer.Option(
+            help='File name for storing chain embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'chain-inference',
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -351,10 +364,22 @@ def complete_embedding(
         min_res_n: Annotated[int, typer.Option(
             help='When using all chains in a structure, consider only chains with more than <min_res_n> residues.'
         )] = 0,
-        batch_size: Annotated[int, typer.Option(
+        batch_size_res: Annotated[int, typer.Option(
             help='Number of samples processed together in one iteration.'
         )] = 1,
-        num_workers: Annotated[int, typer.Option(
+        num_workers_res: Annotated[int, typer.Option(
+            help='Number of subprocesses to use for data loading.'
+        )] = 0,
+        batch_size_chain: Annotated[int, typer.Option(
+            help='Number of samples processed together in one iteration.'
+        )] = 1,
+        num_workers_chain: Annotated[int, typer.Option(
+            help='Number of subprocesses to use for data loading.'
+        )] = 0,
+        batch_size_assembly: Annotated[int, typer.Option(
+            help='Number of samples processed together in one iteration.'
+        )] = 1,
+        num_workers_assembly: Annotated[int, typer.Option(
             help='Number of subprocesses to use for data loading.'
         )] = 0,
         num_nodes: Annotated[int, typer.Option(
@@ -369,43 +394,43 @@ def complete_embedding(
 ):
     residue_embedding(
         src_file=src_chain_file,
-        output_path=res_embedding_location,
+        output_path=output_res_path,
         output_format=OutFormat.separated,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
-        batch_size=batch_size,
-        num_workers=num_workers,
+        batch_size=batch_size_res,
+        num_workers=num_workers_res,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices,
     )
     chain_embedding(
         src_file=src_chain_file,
-        output_path=output_path,
+        output_path=output_chain_path,
         output_format=output_format,
-        output_name=f"{output_name}-chain",
-        res_embedding_location=res_embedding_location,
+        output_name=output_chain_name,
+        res_embedding_location=output_res_path,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
-        batch_size=batch_size,
-        num_workers=num_workers,
+        batch_size=batch_size_chain,
+        num_workers=num_workers_chain,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices
     )
     assembly_embedding(
         src_file=src_assembly_file,
-        output_path=output_path,
+        output_path=output_assembly_path,
         output_format=output_format,
-        output_name=f"{output_name}-assembly",
-        res_embedding_location=res_embedding_location,
+        output_name=output_assembly_name,
+        res_embedding_location=output_res_path,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
-        batch_size=batch_size,
-        num_workers=num_workers,
+        batch_size=batch_size_assembly,
+        num_workers=num_workers_assembly,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/inference/chain_inference.py RENAMED Viewed

@@ -7,6 +7,7 @@ from rcsb_embedding_model.modules.chain_module import ChainModule
 from rcsb_embedding_model.types.api_types import Accelerator, Devices, OptionalPath, FileOrStreamTuple, SrcLocation, \
     SrcTensorFrom, StructureLocation, StructureFormat, OutFormat
 from rcsb_embedding_model.utils.data import collate_seq_embeddings
+from rcsb_embedding_model.utils.model import get_aggregator_model
 from rcsb_embedding_model.writer.batch_writer import CsvBatchWriter, JsonStorage
@@ -52,13 +53,17 @@ def predict(
         )
     )
-    module = ChainModule()
+    aggregator_model = get_aggregator_model()
+    module = ChainModule(
+        model=aggregator_model
+    )
     inference_writer = (JsonStorage(out_path, out_name) if out_format == OutFormat.grouped else CsvBatchWriter(out_path)) if out_path is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices,
+        strategy="ddp",
         logger=False
     )

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/inference/esm_inference.py RENAMED Viewed

@@ -6,6 +6,7 @@ from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.esm_module import EsmModule
 from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
     SrcProteinFrom, FileOrStreamTuple, SrcLocation, OutFormat
+from rcsb_embedding_model.utils.model import get_residue_model
 from rcsb_embedding_model.writer.batch_writer import TensorBatchWriter, JsonStorage
@@ -46,13 +47,17 @@ def predict(
         collate_fn=lambda _: _
     )
-    module = EsmModule()
+    esm_model = get_residue_model()
+    module = EsmModule(
+        model=esm_model
+    )
     inference_writer = (JsonStorage(out_path, out_name) if out_format == OutFormat.grouped else TensorBatchWriter(out_path)) if out_path is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices,
+        strategy="ddp",
         logger=False
     )

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/inference/structure_inference.py RENAMED Viewed

@@ -6,6 +6,7 @@ from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.structure_module import StructureModule
 from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
     SrcProteinFrom, FileOrStreamTuple, SrcLocation
+from rcsb_embedding_model.utils.model import get_residue_model, get_aggregator_model
 from rcsb_embedding_model.writer.batch_writer import JsonStorage
@@ -45,13 +46,19 @@ def predict(
         collate_fn=lambda _: _
     )
-    module = StructureModule()
+    res_model = get_residue_model()
+    aggregator_model = get_aggregator_model()
+    module = StructureModule(
+        res_model=res_model,
+        aggregator_model=aggregator_model
+    )
     inference_writer = JsonStorage(out_path, out_name) if out_path is not None and out_name is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices,
+        strategy="ddp",
         logger=False
     )

rcsb_embedding_model-0.0.22/src/rcsb_embedding_model/modules/chain_module.py ADDED Viewed

@@ -0,0 +1,19 @@
+import logging
+from lightning import LightningModule
+logger = logging.getLogger(__name__)
+class ChainModule(LightningModule):
+    def __init__(
+            self,
+            model
+    ):
+        super().__init__()
+        logger.info(f"Using device: {self.device}")
+        self.aggregator = model
+    def predict_step(self, batch, batch_idx):
+        (x, x_mask), dom_id = batch
+        return self.aggregator(x, x_mask), dom_id

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/modules/esm_module.py RENAMED Viewed

@@ -1,16 +1,19 @@
+import logging
 from esm.sdk.api import SamplingConfig
 from lightning import LightningModule
-from rcsb_embedding_model.utils.model import get_residue_model
+logger = logging.getLogger(__name__)
 class EsmModule(LightningModule):
     def __init__(
-            self
+            self,
+            model
     ):
         super().__init__()
-        self.esm3 = get_residue_model(self.device)
+        logger.info(f"Using device: {self.device}")
+        self.esm3 = model
     def predict_step(self, prot_batch, batch_idx):
         return tuple([self.__compute_embeddings(esm_prot) for esm_prot, name in prot_batch]), tuple([name for esm_prot, name in prot_batch])

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/modules/structure_module.py RENAMED Viewed

@@ -1,18 +1,23 @@
+import logging
 from esm.sdk.api import SamplingConfig
 from lightning import LightningModule
 from rcsb_embedding_model.utils.data import collate_seq_embeddings
-from rcsb_embedding_model.utils.model import get_residue_model, get_aggregator_model
+logger = logging.getLogger(__name__)
 class StructureModule(LightningModule):
     def __init__(
-            self
+            self,
+            res_model,
+            aggregator_model
     ):
         super().__init__()
-        self.esm3 = get_residue_model(self.device)
-        self.aggregator = get_aggregator_model(device=self.device)
+        logger.info(f"Using device: {self.device}")
+        self.esm3 = res_model
+        self.aggregator =  aggregator_model
     def predict_step(self, prot_batch, batch_idx):
         prot_embeddings = []

{rcsb_embedding_model-0.0.20 → rcsb_embedding_model-0.0.22}/src/rcsb_embedding_model/utils/model.py RENAMED Viewed

@@ -16,6 +16,8 @@ def get_aggregator_model(device=None):
         filename=FILE_NAME,
         revision=REVISION
     )
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     weights = torch.load(model_path, weights_only=True, map_location=device)
     aggregator_model = ResidueEmbeddingAggregator()
     aggregator_model.load_state_dict(weights)

rcsb_embedding_model-0.0.20/src/rcsb_embedding_model/modules/chain_module.py DELETED Viewed

@@ -1,16 +0,0 @@
-from lightning import LightningModule
-from rcsb_embedding_model.utils.model import get_aggregator_model
-class ChainModule(LightningModule):
-    def __init__(
-            self
-    ):
-        super().__init__()
-        self.model = get_aggregator_model(device=self.device)
-    def predict_step(self, batch, batch_idx):
-        (x, x_mask), dom_id = batch
-        return self.model(x, x_mask), dom_id