PyPI - rcsb-embedding-model - Versions diffs - 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl - Mend

rcsb-embedding-model 0.0.16 (py3-none-any.whl) → 0.0.18 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (21)

rcsb_embedding_model/cli/inference.py CHANGED Viewed

@@ -5,7 +5,8 @@ import typer
 from rcsb_embedding_model.cli.args_utils import arg_devices
 from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, SrcLocation, SrcProteinFrom, \
-    StructureLocation, SrcAssemblyFrom, SrcTensorFrom
+    StructureLocation, SrcAssemblyFrom, SrcTensorFrom, OutFormat
+from rcsb_embedding_model.utils.data import adapt_csv_to_embedding_chain_stream
 app = typer.Typer(
     add_completion=False
@@ -22,7 +23,7 @@ def residue_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='CSV file 4 (or 3) columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files. This field is required if src-from=chain) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -31,9 +32,12 @@ def residue_embedding(
             resolve_path=True,
             help='Output path to store predictions. Embeddings are stored as torch tensor files.'
         )],
-        src_from: Annotated[SrcProteinFrom, typer.Option(
-            help='Use specific chains or all chains in a structure.'
-        )] = SrcProteinFrom.chain,
+        output_format: Annotated[OutFormat, typer.Option(
+            help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
+        )] = OutFormat.separated,
+        output_name: Annotated[str, typer.Option(
+            help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'inference',
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -62,8 +66,8 @@ def residue_embedding(
     from rcsb_embedding_model.inference.esm_inference import predict
     predict(
         src_stream=src_file,
-        src_location=SrcLocation.local,
-        src_from=src_from,
+        src_location=SrcLocation.file,
+        src_from=SrcProteinFrom.chain,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -72,6 +76,8 @@ def residue_embedding(
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=arg_devices(devices),
+        out_format=output_format,
+        out_name=output_name,
         out_path=output_path
     )
@@ -86,7 +92,7 @@ def structure_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='CSV file 4 (or 3) columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files. This field is required if src-from=chain) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -95,12 +101,9 @@ def structure_embedding(
             resolve_path=True,
             help='Output path to store predictions. Embeddings are stored as a single DataFrame file (see out-df-name).'
         )],
-        out_df_name: Annotated[str, typer.Option(
-            help='File name (without extension) for storing embeddings as a pandas DataFrame pickle (.pkl). The DataFrame contains 2 columns: Id | Embedding'
-        )],
-        src_from: Annotated[SrcProteinFrom, typer.Option(
-            help='Use specific chains or all chains in a structure.'
-        )] = SrcProteinFrom.chain,
+        output_name: Annotated[str, typer.Option(
+            help='File name for storing embeddings as a single JSON file.'
+        )] = 'inference',
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -129,8 +132,8 @@ def structure_embedding(
     from rcsb_embedding_model.inference.structure_inference import predict
     predict(
         src_stream=src_file,
-        src_location=SrcLocation.local,
-        src_from=src_from,
+        src_location=SrcLocation.file,
+        src_from=SrcProteinFrom.chain,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -140,7 +143,7 @@ def structure_embedding(
         accelerator=accelerator,
         devices=arg_devices(devices),
         out_path=output_path,
-        out_df_name=out_df_name
+        out_name=output_name
     )
@@ -154,7 +157,7 @@ def chain_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='Option 1 (src-from=file) - CSV file 2 columns: Residue Embedding Torch Tensor File | Output Embedding Name. Option 2 (src-from=structure) - CSV file 3 columns: Structure Name | Structure File Path or URL (switch structure-location) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -168,11 +171,14 @@ def chain_embedding(
             file_okay=False,
             dir_okay=True,
             resolve_path=True,
-            help='Path where residue level embeddings are located. This argument is required if src-from=structure.'
-        )] = None,
-        src_from: Annotated[SrcTensorFrom, typer.Option(
-            help='Use file names or all chains in a structure.'
-        )] = SrcTensorFrom.file,
+            help='Path where residue level embeddings are located.'
+        )],
+        output_format: Annotated[OutFormat, typer.Option(
+            help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
+        )] = OutFormat.separated,
+        output_name: Annotated[str, typer.Option(
+            help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'inference',
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -200,10 +206,10 @@ def chain_embedding(
 ):
     from rcsb_embedding_model.inference.chain_inference import predict
     predict(
-        src_stream=src_file,
+        src_stream=adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location),
         res_embedding_location=res_embedding_location,
-        src_location=SrcLocation.local,
-        src_from=src_from,
+        src_location=SrcLocation.stream,
+        src_from=SrcTensorFrom.file,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -212,7 +218,9 @@ def chain_embedding(
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=arg_devices(devices),
-        out_path=output_path
+        out_path=output_path,
+        out_format=output_format,
+        out_name=output_name
     )
 @app.command(
@@ -241,9 +249,12 @@ def assembly_embedding(
             resolve_path=True,
             help='Output path to store predictions. Embeddings are stored as csv files.'
         )],
-        src_from: Annotated[SrcAssemblyFrom, typer.Option(
-            help='Use specific assembly or all assemblies in a structure.'
-        )] = SrcAssemblyFrom.assembly,
+        output_format: Annotated[OutFormat, typer.Option(
+            help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
+        )] = OutFormat.separated,
+        output_name: Annotated[str, typer.Option(
+            help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'inference',
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -276,8 +287,8 @@ def assembly_embedding(
     predict(
         src_stream=src_file,
         res_embedding_location=res_embedding_location,
-        src_location=SrcLocation.local,
-        src_from=src_from,
+        src_location=SrcLocation.file,
+        src_from=SrcAssemblyFrom.assembly,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -287,7 +298,117 @@ def assembly_embedding(
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=arg_devices(devices),
-        out_path=output_path
+        out_path=output_path,
+        out_format=output_format,
+        out_name=output_name
+    )
+@app.command(
+    name="complete-embedding",
+    help="Calculate chain and assembly embeddings from structural files. Predictions are stored as csv files."
+)
+def complete_embedding(
+        src_chain_file: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            resolve_path=True,
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
+        )],
+        src_assembly_file: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            resolve_path=True,
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Assembly Id | Output embedding name.'
+        )],
+        output_path: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=False,
+            dir_okay=True,
+            resolve_path=True,
+            help='Output path to store predictions. Embeddings are stored as a single DataFrame file (see output_name).'
+        )],
+        res_embedding_location: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=False,
+            dir_okay=True,
+            resolve_path=True,
+            help='Output path to store ESM predictions.'
+        )],
+        output_format: Annotated[OutFormat, typer.Option(
+            help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
+        )] = OutFormat.separated,
+        output_name: Annotated[str, typer.Option(
+            help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
+        )] = 'inference',
+        structure_location: Annotated[StructureLocation, typer.Option(
+            help='Structure file location.'
+        )] = StructureLocation.local,
+        structure_format: Annotated[StructureFormat, typer.Option(
+            help='Structure file format.'
+        )] = StructureFormat.mmcif,
+        min_res_n: Annotated[int, typer.Option(
+            help='When using all chains in a structure, consider only chains with more than <min_res_n> residues.'
+        )] = 0,
+        batch_size: Annotated[int, typer.Option(
+            help='Number of samples processed together in one iteration.'
+        )] = 1,
+        num_workers: Annotated[int, typer.Option(
+            help='Number of subprocesses to use for data loading.'
+        )] = 0,
+        num_nodes: Annotated[int, typer.Option(
+            help='Number of nodes to use for inference.'
+        )] = 1,
+        accelerator: Annotated[Accelerator, typer.Option(
+            help='Device used for inference.'
+        )] = Accelerator.auto,
+        devices: Annotated[List[str], typer.Option(
+            help='The devices to use. Can be set to a positive number or "auto". Repeat this argument to indicate multiple indices of devices. "auto" for automatic selection based on the chosen accelerator.'
+        )] = tuple(['auto'])
+):
+    residue_embedding(
+        src_file=src_chain_file,
+        output_path=res_embedding_location,
+        output_format=OutFormat.separated,
+        structure_location=structure_location,
+        structure_format=structure_format,
+        min_res_n=min_res_n,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        num_nodes=num_nodes,
+        accelerator=accelerator,
+        devices=devices,
+    )
+    chain_embedding(
+        src_file=src_chain_file,
+        output_path=output_path,
+        output_format=output_format,
+        output_name=f"{output_name}-chain",
+        res_embedding_location=res_embedding_location,
+        structure_location=structure_location,
+        structure_format=structure_format,
+        min_res_n=min_res_n,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        num_nodes=num_nodes,
+        accelerator=accelerator,
+        devices=devices
+    )
+    assembly_embedding(
+        src_file=src_assembly_file,
+        output_path=output_path,
+        output_format=output_format,
+        output_name=f"{output_name}-assembly",
+        res_embedding_location=res_embedding_location,
+        structure_location=structure_location,
+        structure_format=structure_format,
+        min_res_n=min_res_n,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        num_nodes=num_nodes,
+        accelerator=accelerator,
+        devices=devices
     )

rcsb_embedding_model/dataset/esm_prot_from_chain.py CHANGED Viewed

@@ -27,7 +27,7 @@ class EsmProtFromChain(Dataset):
     def __init__(
         self,
         src_stream,
-        src_location=SrcLocation.local,
+        src_location=SrcLocation.file,
         structure_location=StructureLocation.local,
         structure_format=StructureFormat.mmcif,
         structure_provider=StructureProvider()
@@ -70,6 +70,7 @@ class EsmProtFromChain(Dataset):
         for atom_ch in chain_iter(structure):
             protein_chain = ProteinChain.from_atomarray(rename_atom_ch(atom_ch))
             return ESMProtein.from_protein_chain(protein_chain), item_name
+        return None
 if __name__ == '__main__':

rcsb_embedding_model/dataset/esm_prot_from_structure.py CHANGED Viewed

@@ -19,7 +19,7 @@ class EsmProtFromStructure(EsmProtFromChain):
     def __init__(
             self,
             src_stream,
-            src_location=SrcLocation.local,
+            src_location=SrcLocation.file,
             structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,

rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py CHANGED Viewed

@@ -21,7 +21,7 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
             self,
             src_stream,
             res_embedding_location,
-            src_location=SrcLocation.local,
+            src_location=SrcLocation.file,
             structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
@@ -63,6 +63,6 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
             structure = stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure
             item_name = row[ResidueAssemblyDatasetFromStructure.ITEM_NAME_ATTR]
             for assembly_id in get_assemblies(structure=structure, structure_format=self.structure_format):
-                assemblies.append((src_name, src_structure, str(assembly_id), f"{item_name}.{assembly_id}"))
+                assemblies.append((src_name, src_structure, str(assembly_id), f"{item_name}-{assembly_id}"))
         return tuple(assemblies)

rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py CHANGED Viewed

@@ -22,7 +22,7 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
             self,
             src_stream,
             res_embedding_location,
-            src_location=SrcLocation.local,
+            src_location=SrcLocation.file,
             structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,
@@ -79,7 +79,7 @@ if __name__ == "__main__":
     dataset = ResidueAssemblyEmbeddingFromTensorFile(
         src_stream="/Users/joan/tmp/assembly-test.csv",
         res_embedding_location="/Users/joan/tmp",
-        src_location=SrcLocation.local,
+        src_location=SrcLocation.file,
         structure_location=StructureLocation.local,
         structure_format=StructureFormat.mmcif
     )

rcsb_embedding_model/dataset/residue_embedding_from_structure.py CHANGED Viewed

@@ -21,7 +21,7 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
             self,
             src_stream,
             res_embedding_location,
-            src_location=SrcLocation.local,
+            src_location=SrcLocation.file,
             structure_location=StructureLocation.local,
             structure_format=StructureFormat.mmcif,
             min_res_n=0,

rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py CHANGED Viewed

@@ -15,7 +15,7 @@ class ResidueEmbeddingFromTensorFile(Dataset):
     def __init__(
             self,
             src_stream,
-            src_location=SrcLocation.local
+            src_location=SrcLocation.file
     ):
         super().__init__()
         self.src_location = src_location

rcsb_embedding_model/inference/assembly_inferece.py CHANGED Viewed

@@ -2,14 +2,15 @@ import sys
 from rcsb_embedding_model.dataset.resdiue_assembly_embedding_from_structure import ResidueAssemblyDatasetFromStructure
 from rcsb_embedding_model.dataset.residue_assembly_embedding_from_tensor_file import ResidueAssemblyEmbeddingFromTensorFile
-from rcsb_embedding_model.types.api_types import FileOrStreamTuple, SrcLocation, Accelerator, Devices, OptionalPath, EmbeddingPath, StructureLocation, StructureFormat, SrcAssemblyFrom
+from rcsb_embedding_model.types.api_types import FileOrStreamTuple, SrcLocation, Accelerator, Devices, OptionalPath, \
+    EmbeddingPath, StructureLocation, StructureFormat, SrcAssemblyFrom, OutFormat
 from rcsb_embedding_model.inference.chain_inference import predict as chain_predict
 def predict(
         src_stream: FileOrStreamTuple,
         res_embedding_location: EmbeddingPath,
-        src_location: SrcLocation = SrcLocation.local,
+        src_location: SrcLocation = SrcLocation.file,
         src_from: SrcAssemblyFrom = SrcAssemblyFrom.assembly,
         structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
@@ -20,6 +21,8 @@ def predict(
         num_nodes: int = 1,
         accelerator: Accelerator = Accelerator.auto,
         devices: Devices = 'auto',
+        out_format: OutFormat = OutFormat.separated,
+        out_name: str = 'inference',
         out_path: OptionalPath = None
 ):
     inference_set = ResidueAssemblyEmbeddingFromTensorFile(
@@ -48,6 +51,8 @@ def predict(
         num_nodes=num_nodes,
         accelerator=accelerator,
         devices=devices,
+        out_format=out_format,
+        out_name=out_name,
         out_path=out_path,
         inference_set=inference_set
     )

rcsb_embedding_model/inference/chain_inference.py CHANGED Viewed

@@ -5,15 +5,15 @@ from rcsb_embedding_model.dataset.residue_embedding_from_structure import Residu
 from rcsb_embedding_model.dataset.residue_embedding_from_tensor_file import ResidueEmbeddingFromTensorFile
 from rcsb_embedding_model.modules.chain_module import ChainModule
 from rcsb_embedding_model.types.api_types import Accelerator, Devices, OptionalPath, FileOrStreamTuple, SrcLocation, \
-    SrcTensorFrom, StructureLocation, StructureFormat
+    SrcTensorFrom, StructureLocation, StructureFormat, OutFormat
 from rcsb_embedding_model.utils.data import collate_seq_embeddings
-from rcsb_embedding_model.writer.batch_writer import CsvBatchWriter
+from rcsb_embedding_model.writer.batch_writer import CsvBatchWriter, JsonStorage
 def predict(
         src_stream: FileOrStreamTuple,
         res_embedding_location: OptionalPath = None,
-        src_location: SrcLocation = SrcLocation.local,
+        src_location: SrcLocation = SrcLocation.file,
         src_from: SrcTensorFrom = SrcTensorFrom.file,
         structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
@@ -23,6 +23,8 @@ def predict(
         num_nodes: int = 1,
         accelerator: Accelerator = Accelerator.auto,
         devices: Devices = 'auto',
+        out_format: OutFormat = OutFormat.separated,
+        out_name: str = 'inference',
         out_path: OptionalPath = None,
         inference_set=None
 ):
@@ -51,13 +53,13 @@ def predict(
     )
     module = ChainModule()
-    inference_writer = CsvBatchWriter(out_path) if out_path is not None else None
+    inference_writer = (JsonStorage(out_path, out_name) if out_format == OutFormat.grouped else CsvBatchWriter(out_path)) if out_path is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
-        devices=devices
+        devices=devices,
+        logger=False
     )
     prediction = trainer.predict(

rcsb_embedding_model/inference/esm_inference.py CHANGED Viewed

@@ -4,13 +4,14 @@ from lightning import Trainer
 from rcsb_embedding_model.dataset.esm_prot_from_structure import EsmProtFromStructure
 from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.esm_module import EsmModule
-from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, SrcProteinFrom, FileOrStreamTuple, SrcLocation
-from rcsb_embedding_model.writer.batch_writer import TensorBatchWriter
+from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
+    SrcProteinFrom, FileOrStreamTuple, SrcLocation, OutFormat
+from rcsb_embedding_model.writer.batch_writer import TensorBatchWriter, JsonStorage
 def predict(
         src_stream: FileOrStreamTuple,
-        src_location: SrcLocation = SrcLocation.local,
+        src_location: SrcLocation = SrcLocation.file,
         src_from: SrcProteinFrom = SrcProteinFrom.chain,
         structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
@@ -20,6 +21,8 @@ def predict(
         num_nodes: int = 1,
         accelerator: Accelerator = Accelerator.auto,
         devices: Devices = 'auto',
+        out_format: OutFormat = OutFormat.separated,
+        out_name: str = 'inference',
         out_path: OptionalPath = None
 ):
@@ -44,12 +47,13 @@ def predict(
     )
     module = EsmModule()
-    inference_writer = TensorBatchWriter(out_path) if out_path is not None else None
+    inference_writer = (JsonStorage(out_path, out_name) if out_format == OutFormat.grouped else TensorBatchWriter(out_path)) if out_path is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
-        devices=devices
+        devices=devices,
+        logger=False
     )
     prediction = trainer.predict(

rcsb_embedding_model/inference/structure_inference.py CHANGED Viewed

@@ -4,13 +4,14 @@ from lightning import Trainer
 from rcsb_embedding_model.dataset.esm_prot_from_structure import EsmProtFromStructure
 from rcsb_embedding_model.dataset.esm_prot_from_chain import EsmProtFromChain
 from rcsb_embedding_model.modules.structure_module import StructureModule
-from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, SrcProteinFrom, FileOrStreamTuple, SrcLocation
-from rcsb_embedding_model.writer.batch_writer import DataFrameStorage
+from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, Devices, OptionalPath, StructureLocation, \
+    SrcProteinFrom, FileOrStreamTuple, SrcLocation
+from rcsb_embedding_model.writer.batch_writer import JsonStorage
 def predict(
         src_stream: FileOrStreamTuple,
-        src_location: SrcLocation = SrcLocation.local,
+        src_location: SrcLocation = SrcLocation.file,
         src_from: SrcProteinFrom = SrcProteinFrom.chain,
         structure_location: StructureLocation = StructureLocation.local,
         structure_format: StructureFormat = StructureFormat.mmcif,
@@ -20,8 +21,8 @@ def predict(
         num_nodes: int = 1,
         accelerator: Accelerator = Accelerator.auto,
         devices: Devices = 'auto',
-        out_path: OptionalPath = None,
-        out_df_name: str = None
+        out_name: str = 'inference',
+        out_path: OptionalPath = None
 ):
     inference_set = EsmProtFromChain(
@@ -45,12 +46,13 @@ def predict(
     )
     module = StructureModule()
-    inference_writer = DataFrameStorage(out_path, out_df_name) if out_path is not None and out_df_name is not None else None
+    inference_writer = JsonStorage(out_path, out_name) if out_path is not None and out_name is not None else None
     trainer = Trainer(
         callbacks=[inference_writer] if inference_writer is not None else None,
         num_nodes=num_nodes,
         accelerator=accelerator,
-        devices=devices
+        devices=devices,
+        logger=False
     )
     prediction = trainer.predict(

rcsb_embedding_model/modules/esm_module.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from esm.sdk.api import SamplingConfig
+from esm.sdk import batch_executor
 from lightning import LightningModule
 from rcsb_embedding_model.utils.model import get_residue_model
@@ -14,11 +15,13 @@ class EsmModule(LightningModule):
     def predict_step(self, prot_batch, batch_idx):
         prot_embeddings = []
-        prot_names = []
-        for esm_prot, name in prot_batch:
-            embeddings = self.esm3.forward_and_sample(
+        def __batch_embedding(esm_prot):
+            return self.esm3.forward_and_sample(
                 self.esm3.encode(esm_prot), SamplingConfig(return_per_residue_embeddings=True)
             ).per_residue_embedding
-            prot_embeddings.append(embeddings)
-            prot_names.append(name)
-        return tuple(prot_embeddings), tuple(prot_names)
+        with batch_executor() as executor:
+            prot_embeddings = executor.execute_batch(
+                user_func=__batch_embedding,
+                esm_prot=[esm_prot for esm_prot, name in prot_batch]
+            )
+        return tuple(prot_embeddings), tuple([name for esm_prot, name in prot_batch])

rcsb_embedding_model/types/api_types.py CHANGED Viewed

@@ -32,7 +32,7 @@ class Accelerator(str, Enum):
 class SrcLocation(str, Enum):
-    local = "local"
+    file = "file"
     stream = "stream"
@@ -54,3 +54,7 @@ class SrcAssemblyFrom(str, Enum):
 class SrcTensorFrom(str, Enum):
     file = "file"
     structure = "structure"
+class OutFormat(str, Enum):
+    separated = "separated"
+    grouped = "grouped"

rcsb_embedding_model/utils/data.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import os
 from io import StringIO
 import requests
@@ -76,3 +77,9 @@ def concatenate_tensors(file_list, max_residues, dim=0):
         return tensor_cat
     else:
         raise ValueError("No valid tensors were loaded to concatenate.")
+def adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location):
+    def __parse_row(row):
+        r = row.split(",")
+        return os.path.join(res_embedding_location, f"{r[0]}.{r[2]}.pt"), f"{r[0]}.{r[2]}"
+    return tuple([__parse_row(r) for r in open(src_file)])

rcsb_embedding_model/writer/batch_writer.py CHANGED Viewed

@@ -111,3 +111,21 @@ class DataFrameStorage(CoreBatchWriter, ABC):
             f"{self.out_path}/{self.df_id}.pkl.gz",
             compression='gzip'
         )
+class JsonStorage(DataFrameStorage, ABC):
+    def __init__(
+            self,
+            output_path,
+            df_id,
+            postfix="pkl",
+            write_interval="batch"
+    ):
+        super().__init__(output_path, df_id, postfix, write_interval)
+    def on_predict_end(self, trainer, pl_module):
+        self.embedding.to_json(
+            f"{self.out_path}/{self.df_id}.json.gz",
+            orient='records',
+            compression='gzip'
+        )

{rcsb_embedding_model-0.0.16.dist-info → rcsb_embedding_model-0.0.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rcsb-embedding-model
-Version: 0.0.16
+Version: 0.0.18
 Summary: Protein Embedding Model for Structure Search
 Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
 Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -12,13 +12,12 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
 Requires-Dist: esm>=3.2.0
 Requires-Dist: lightning>=2.5.0
-Requires-Dist: torch>=2.2.0
 Requires-Dist: typer>=0.15.0
 Description-Content-Type: text/markdown
 # RCSB Embedding Model
-**Version** 0.0.16
+**Version** 0.0.18
 ## Overview
@@ -48,11 +47,10 @@ If you are interested in training the model with a new dataset, visit the [rcsb-
 **Requirements:**
-- Python ≥ 3.10
-- ESM ≥ 3.2.0
-- PyTorch ≥ 2.2.0
-- Lightning ≥ 2.5.0
-- Typer ≥ 0.15.0
+- Python ≥ 3.10
+- ESM == 3.1.1
+- Lightning ≥ 2.5.0
+- Typer ≥ 0.15.0
 ---

rcsb_embedding_model-0.0.18.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,30 @@
+rcsb_embedding_model/__init__.py,sha256=r3gLdeBIXkQEQA_K6QcRPO-TtYuAQSutk6pXRUE_nas,120
+rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
+rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
+rcsb_embedding_model/cli/inference.py,sha256=PE36a1d6nfhNsuqCCJbos2JpZE0oCJmIf2mNw7Nz8GI,18231
+rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=3hWo2nWunFZNTfYCTiPvVoJlkWQbRmvlehFw-6B4z6A,3506
+rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=TeITPdi1uc3qLQ-Pgn807oH6eM0LYv-67RE50ZT4dLI,2551
+rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=worRiNqOJRjyr693TaillsS65bdTdGOoHfwyT9yE1O4,2866
+rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=JG4rrhziIUtdTmbuTbMbEYHrvlda4m5VWvdJXe_Sv3c,3449
+rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=dxfUNcVmdl8LrtQf1UJQ4E79e7R9LRsL0fjsq2GJQRk,2796
+rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=ehHQuLI2TrE5l4_4n6p3e30i17O1pXW92KOCn7bGtcg,1274
+rcsb_embedding_model/inference/assembly_inferece.py,sha256=8fPJjEXy1WsM5XB5U7KfdO5-Du6nEsawsaAjmWoXA9I,2329
+rcsb_embedding_model/inference/chain_inference.py,sha256=zTV_glkoErSYjVy0xfDRNtT8bVS0NGBnaNSUqp-CnoY,2700
+rcsb_embedding_model/inference/esm_inference.py,sha256=3ny9vvHDSI7jpybDfMVXour52qiZ_av-2SL6h2yygEI,2341
+rcsb_embedding_model/inference/structure_inference.py,sha256=lqbDBPSea8IoNyQXl83OcfXgLq4hmbD1DNvAwjetiPc,2231
+rcsb_embedding_model/model/layers.py,sha256=lhKaWC4gTS_T5lHOP0mgnnP8nKTPEOm4MrjhESA4hE8,743
+rcsb_embedding_model/model/residue_embedding_aggregator.py,sha256=k3UW63Ax8DtjCMdD3O5xNxtyAu28l2n3-Ab6nS0atm0,1967
+rcsb_embedding_model/modules/chain_module.py,sha256=sDSPXJmWuU2C3lt1NorlbUVWZvRSLzumPdFQk01h3VI,403
+rcsb_embedding_model/modules/esm_module.py,sha256=4IQgrNQlGThxl0PhobVzyp7N3FcyAbvek_KxJozGImQ,945
+rcsb_embedding_model/modules/structure_module.py,sha256=dEtDNdWo1j2sSDa0JiOHQfEfQzIWqSLEKpvOX0GrXZ4,1048
+rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
+rcsb_embedding_model/utils/data.py,sha256=FVb6tzoX4SrJf3Fr6UFbxZJQsUr9xp5RbkK6nqXhcuQ,3222
+rcsb_embedding_model/utils/model.py,sha256=rpZa-gfm3cEtbBd7UXMHrZv3x6f0AC8TJT3gtrSxr5I,852
+rcsb_embedding_model/utils/structure_parser.py,sha256=IWMQ8brlEMe6_ND-DBESOli8vlqHxladTssjbM9RSKw,2751
+rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
+rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
+rcsb_embedding_model-0.0.18.dist-info/METADATA,sha256=PzSnwGeAeUbYxhpRBgEiZZdj2bGdLrT8QAy0uB_BxNQ,5310
+rcsb_embedding_model-0.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+rcsb_embedding_model-0.0.18.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
+rcsb_embedding_model-0.0.18.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
+rcsb_embedding_model-0.0.18.dist-info/RECORD,,

rcsb_embedding_model-0.0.16.dist-info/RECORD DELETED Viewed

@@ -1,30 +0,0 @@
-rcsb_embedding_model/__init__.py,sha256=r3gLdeBIXkQEQA_K6QcRPO-TtYuAQSutk6pXRUE_nas,120
-rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
-rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
-rcsb_embedding_model/cli/inference.py,sha256=0DZHw4QeAi2f6xdfoEPzYb_gQhCWc_IPA1QgnckcUIg,12916
-rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=dBD2N0Y-GoN6p3z2yLnOvv6JGn-skAxwgbOYhXKDngc,3487
-rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=9IvurGr7PGjfAABoGoMlG08zn6mC6iVAjgExGSrDVdQ,2552
-rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=10NUHnjTE5xSXPFVTfeuL8MpOhqk-f3ZIG7EbWR49B4,2867
-rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=KXiohnPjjfZEFbPZQ46HGE8eEYWrVX8bfbTz4zPlo7o,3451
-rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=9MfgKvFAxYr9RU8kwvHnEZBH35gukx8hRPeoBXfyNXo,2797
-rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=mDCqJrpnu2GXmp75zOPTH8ogL3GWDqc3iEH62JuyHVs,1275
-rcsb_embedding_model/inference/assembly_inferece.py,sha256=MPssN5bsOqOU-LGwa6AKX99cv5LD43Mnbaqhuuww1Tw,2165
-rcsb_embedding_model/inference/chain_inference.py,sha256=N92Wfu-UNkhmSlQ0153BA1idECj1NgEcl35Zis9Q2js,2492
-rcsb_embedding_model/inference/esm_inference.py,sha256=oVN4r9_6V8TS0pYoNn7GR92Xo0Zn7eBsnt_OfDSaH6g,2126
-rcsb_embedding_model/inference/structure_inference.py,sha256=QIUEo8eEc-kTSYKGdlX2rxT74huw4ZAw6U8Px9kYajE,2216
-rcsb_embedding_model/model/layers.py,sha256=lhKaWC4gTS_T5lHOP0mgnnP8nKTPEOm4MrjhESA4hE8,743
-rcsb_embedding_model/model/residue_embedding_aggregator.py,sha256=k3UW63Ax8DtjCMdD3O5xNxtyAu28l2n3-Ab6nS0atm0,1967
-rcsb_embedding_model/modules/chain_module.py,sha256=sDSPXJmWuU2C3lt1NorlbUVWZvRSLzumPdFQk01h3VI,403
-rcsb_embedding_model/modules/esm_module.py,sha256=CTHGOATXiarqZsBsZ8oxGJBj20A73186Slpr0EzMJsE,770
-rcsb_embedding_model/modules/structure_module.py,sha256=dEtDNdWo1j2sSDa0JiOHQfEfQzIWqSLEKpvOX0GrXZ4,1048
-rcsb_embedding_model/types/api_types.py,sha256=JSHd5Rq7dm6uWNzy1UZnLkWKxfjsKB7gRRTCSqS4r7c,1156
-rcsb_embedding_model/utils/data.py,sha256=ODz6GG6IAhgAlLh3tcIP6-JVHX8Bb_-E745Lvc_oR84,2934
-rcsb_embedding_model/utils/model.py,sha256=rpZa-gfm3cEtbBd7UXMHrZv3x6f0AC8TJT3gtrSxr5I,852
-rcsb_embedding_model/utils/structure_parser.py,sha256=IWMQ8brlEMe6_ND-DBESOli8vlqHxladTssjbM9RSKw,2751
-rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
-rcsb_embedding_model/writer/batch_writer.py,sha256=ekgzFZyoKpcnZ3IDP9hfOWBpuHxUQ31P35ViDAi-Edw,2843
-rcsb_embedding_model-0.0.16.dist-info/METADATA,sha256=xxH5oEOiNEWHBf2AuKoaK9FTLf4gh-3EXHRUtPGGgUc,5368
-rcsb_embedding_model-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-rcsb_embedding_model-0.0.16.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
-rcsb_embedding_model-0.0.16.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
-rcsb_embedding_model-0.0.16.dist-info/RECORD,,

{rcsb_embedding_model-0.0.16.dist-info → rcsb_embedding_model-0.0.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{rcsb_embedding_model-0.0.16.dist-info → rcsb_embedding_model-0.0.18.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{rcsb_embedding_model-0.0.16.dist-info → rcsb_embedding_model-0.0.18.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

rcsb-embedding-model 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl

Potentially problematic release.

rcsb-embedding-model 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl