PyPI - rcsb-embedding-model - Versions diffs - 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl - Mend

rcsb-embedding-model 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (9) hide show

rcsb_embedding_model/cli/inference.py CHANGED Viewed

@@ -6,6 +6,7 @@ import typer
 from rcsb_embedding_model.cli.args_utils import arg_devices
 from rcsb_embedding_model.types.api_types import StructureFormat, Accelerator, SrcLocation, SrcProteinFrom, \
     StructureLocation, SrcAssemblyFrom, SrcTensorFrom, OutFormat
+from rcsb_embedding_model.utils.data import adapt_csv_to_embedding_chain_stream
 app = typer.Typer(
     add_completion=False
@@ -22,7 +23,7 @@ def residue_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='CSV file 4 (or 3) columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files. This field is required if src-from=chain) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -37,9 +38,6 @@ def residue_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        src_from: Annotated[SrcProteinFrom, typer.Option(
-            help='Use specific chains or all chains in a structure.'
-        )] = SrcProteinFrom.chain,
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -69,7 +67,7 @@ def residue_embedding(
     predict(
         src_stream=src_file,
         src_location=SrcLocation.file,
-        src_from=src_from,
+        src_from=SrcProteinFrom.chain,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -94,7 +92,7 @@ def structure_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='CSV file 4 (or 3) columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files. This field is required if src-from=chain) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -106,9 +104,6 @@ def structure_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file.'
         )] = 'inference',
-        src_from: Annotated[SrcProteinFrom, typer.Option(
-            help='Use specific chains or all chains in a structure.'
-        )] = SrcProteinFrom.chain,
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -138,7 +133,7 @@ def structure_embedding(
     predict(
         src_stream=src_file,
         src_location=SrcLocation.file,
-        src_from=src_from,
+        src_from=SrcProteinFrom.chain,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -162,7 +157,7 @@ def chain_embedding(
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='Option 1 (src-from=file) - CSV file 2 columns: Residue Embedding Torch Tensor File | Output Embedding Name. Option 2 (src-from=structure) - CSV file 3 columns: Structure Name | Structure File Path or URL (switch structure-location) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -171,22 +166,19 @@ def chain_embedding(
             resolve_path=True,
             help='Output path to store predictions. Embeddings are stored as csv files.'
         )],
+        res_embedding_location: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=False,
+            dir_okay=True,
+            resolve_path=True,
+            help='Path where residue level embeddings are located.'
+        )],
         output_format: Annotated[OutFormat, typer.Option(
             help='Format of the output. Options: separated (predictions are stored in single files) or grouped (predictions are stored in a single JSON file).'
         )] = OutFormat.separated,
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        res_embedding_location: Annotated[typer.FileText, typer.Option(
-            exists=True,
-            file_okay=False,
-            dir_okay=True,
-            resolve_path=True,
-            help='Path where residue level embeddings are located. This argument is required if src-from=structure.'
-        )] = None,
-        src_from: Annotated[SrcTensorFrom, typer.Option(
-            help='Use file names or all chains in a structure.'
-        )] = SrcTensorFrom.file,
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -214,10 +206,10 @@ def chain_embedding(
 ):
     from rcsb_embedding_model.inference.chain_inference import predict
     predict(
-        src_stream=src_file,
+        src_stream=adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location),
         res_embedding_location=res_embedding_location,
-        src_location=SrcLocation.file,
-        src_from=src_from,
+        src_location=SrcLocation.stream,
+        src_from=SrcTensorFrom.file,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -263,9 +255,6 @@ def assembly_embedding(
         output_name: Annotated[str, typer.Option(
             help='File name for storing embeddings as a single JSON file. Used when output-format=grouped.'
         )] = 'inference',
-        src_from: Annotated[SrcAssemblyFrom, typer.Option(
-            help='Use specific assembly or all assemblies in a structure.'
-        )] = SrcAssemblyFrom.assembly,
         structure_location: Annotated[StructureLocation, typer.Option(
             help='Structure file location.'
         )] = StructureLocation.local,
@@ -299,7 +288,7 @@ def assembly_embedding(
         src_stream=src_file,
         res_embedding_location=res_embedding_location,
         src_location=SrcLocation.file,
-        src_from=src_from,
+        src_from=SrcAssemblyFrom.assembly,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -319,12 +308,19 @@ def assembly_embedding(
     help="Calculate chain and assembly embeddings from structural files. Predictions are stored as csv files."
 )
 def complete_embedding(
-        src_file: Annotated[typer.FileText, typer.Option(
+        src_chain_file: Annotated[typer.FileText, typer.Option(
             exists=True,
             file_okay=True,
             dir_okay=False,
             resolve_path=True,
-            help='CSV file 3 columns: Structure Name | Structure File Path or URL | Chain Id (asym_i for cif files. This field is required if src-from=chain) | Output Embedding Name.'
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Chain Id (asym_i for cif files) | Output Embedding Name.'
+        )],
+        src_assembly_file: Annotated[typer.FileText, typer.Option(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            resolve_path=True,
+            help='CSV file 4 columns: Structure Name | Structure File Path or URL (switch structure-location) | Assembly Id | Output embedding name.'
         )],
         output_path: Annotated[typer.FileText, typer.Option(
             exists=True,
@@ -333,7 +329,7 @@ def complete_embedding(
             resolve_path=True,
             help='Output path to store predictions. Embeddings are stored as a single DataFrame file (see output_name).'
         )],
-        esm_output_path: Annotated[typer.FileText, typer.Option(
+        res_embedding_location: Annotated[typer.FileText, typer.Option(
             exists=True,
             file_okay=False,
             dir_okay=True,
@@ -372,9 +368,8 @@ def complete_embedding(
         )] = tuple(['auto'])
 ):
     residue_embedding(
-        src_file=src_file,
-        src_from=SrcProteinFrom.structure,
-        output_path=esm_output_path,
+        src_file=src_chain_file,
+        output_path=res_embedding_location,
         output_format=OutFormat.separated,
         structure_location=structure_location,
         structure_format=structure_format,
@@ -386,12 +381,11 @@ def complete_embedding(
         devices=devices,
     )
     chain_embedding(
-        src_file=src_file,
-        src_from=SrcTensorFrom.structure,
+        src_file=src_chain_file,
         output_path=output_path,
         output_format=output_format,
         output_name=f"{output_name}-chain",
-        res_embedding_location=esm_output_path,
+        res_embedding_location=res_embedding_location,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,
@@ -402,12 +396,11 @@ def complete_embedding(
         devices=devices
     )
     assembly_embedding(
-        src_file=src_file,
-        src_from=SrcAssemblyFrom.structure,
+        src_file=src_assembly_file,
         output_path=output_path,
         output_format=output_format,
         output_name=f"{output_name}-assembly",
-        res_embedding_location=esm_output_path,
+        res_embedding_location=res_embedding_location,
         structure_location=structure_location,
         structure_format=structure_format,
         min_res_n=min_res_n,

rcsb_embedding_model/dataset/esm_prot_from_chain.py CHANGED Viewed

@@ -70,6 +70,7 @@ class EsmProtFromChain(Dataset):
         for atom_ch in chain_iter(structure):
             protein_chain = ProteinChain.from_atomarray(rename_atom_ch(atom_ch))
             return ESMProtein.from_protein_chain(protein_chain), item_name
+        return None
 if __name__ == '__main__':

rcsb_embedding_model/modules/esm_module.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from esm.sdk.api import SamplingConfig
+from esm.sdk import batch_executor
 from lightning import LightningModule
 from rcsb_embedding_model.utils.model import get_residue_model
@@ -14,11 +15,13 @@ class EsmModule(LightningModule):
     def predict_step(self, prot_batch, batch_idx):
         prot_embeddings = []
-        prot_names = []
-        for esm_prot, name in prot_batch:
-            embeddings = self.esm3.forward_and_sample(
+        def __batch_embedding(esm_prot):
+            return self.esm3.forward_and_sample(
                 self.esm3.encode(esm_prot), SamplingConfig(return_per_residue_embeddings=True)
             ).per_residue_embedding
-            prot_embeddings.append(embeddings)
-            prot_names.append(name)
-        return tuple(prot_embeddings), tuple(prot_names)
+        with batch_executor() as executor:
+            prot_embeddings = executor.execute_batch(
+                user_func=__batch_embedding,
+                esm_prot=[esm_prot for esm_prot, name in prot_batch]
+            )
+        return tuple(prot_embeddings), tuple([name for esm_prot, name in prot_batch])

rcsb_embedding_model/utils/data.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import os
 from io import StringIO
 import requests
@@ -76,3 +77,9 @@ def concatenate_tensors(file_list, max_residues, dim=0):
         return tensor_cat
     else:
         raise ValueError("No valid tensors were loaded to concatenate.")
+def adapt_csv_to_embedding_chain_stream(src_file, res_embedding_location):
+    def __parse_row(row):
+        r = row.split(",")
+        return os.path.join(res_embedding_location, f"{r[0]}.{r[2]}.pt"), f"{r[0]}.{r[2]}"
+    return tuple([__parse_row(r) for r in open(src_file)])

{rcsb_embedding_model-0.0.17.dist-info → rcsb_embedding_model-0.0.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rcsb-embedding-model
-Version: 0.0.17
+Version: 0.0.18
 Summary: Protein Embedding Model for Structure Search
 Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
 Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -12,13 +12,12 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10
 Requires-Dist: esm>=3.2.0
 Requires-Dist: lightning>=2.5.0
-Requires-Dist: torch>=2.2.0
 Requires-Dist: typer>=0.15.0
 Description-Content-Type: text/markdown
 # RCSB Embedding Model
-**Version** 0.0.17
+**Version** 0.0.18
 ## Overview
@@ -48,11 +47,10 @@ If you are interested in training the model with a new dataset, visit the [rcsb-
 **Requirements:**
-- Python ≥ 3.10
-- ESM ≥ 3.2.0
-- PyTorch ≥ 2.2.0
-- Lightning ≥ 2.5.0
-- Typer ≥ 0.15.0
+- Python ≥ 3.10
+- ESM == 3.1.1
+- Lightning ≥ 2.5.0
+- Typer ≥ 0.15.0
 ---

{rcsb_embedding_model-0.0.17.dist-info → rcsb_embedding_model-0.0.18.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 rcsb_embedding_model/__init__.py,sha256=r3gLdeBIXkQEQA_K6QcRPO-TtYuAQSutk6pXRUE_nas,120
 rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
 rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
-rcsb_embedding_model/cli/inference.py,sha256=XmnRwygWYQkPqeJi4I1H2jjo24IxXzt_EihdYZ7LLqA,18696
-rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=u6vu_2CN6vaYAk6kpvHAOgHuEHjXJl3fukMk-tDr_6E,3486
+rcsb_embedding_model/cli/inference.py,sha256=PE36a1d6nfhNsuqCCJbos2JpZE0oCJmIf2mNw7Nz8GI,18231
+rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=3hWo2nWunFZNTfYCTiPvVoJlkWQbRmvlehFw-6B4z6A,3506
 rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=TeITPdi1uc3qLQ-Pgn807oH6eM0LYv-67RE50ZT4dLI,2551
 rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=worRiNqOJRjyr693TaillsS65bdTdGOoHfwyT9yE1O4,2866
 rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=JG4rrhziIUtdTmbuTbMbEYHrvlda4m5VWvdJXe_Sv3c,3449
@@ -15,16 +15,16 @@ rcsb_embedding_model/inference/structure_inference.py,sha256=lqbDBPSea8IoNyQXl83
 rcsb_embedding_model/model/layers.py,sha256=lhKaWC4gTS_T5lHOP0mgnnP8nKTPEOm4MrjhESA4hE8,743
 rcsb_embedding_model/model/residue_embedding_aggregator.py,sha256=k3UW63Ax8DtjCMdD3O5xNxtyAu28l2n3-Ab6nS0atm0,1967
 rcsb_embedding_model/modules/chain_module.py,sha256=sDSPXJmWuU2C3lt1NorlbUVWZvRSLzumPdFQk01h3VI,403
-rcsb_embedding_model/modules/esm_module.py,sha256=CTHGOATXiarqZsBsZ8oxGJBj20A73186Slpr0EzMJsE,770
+rcsb_embedding_model/modules/esm_module.py,sha256=4IQgrNQlGThxl0PhobVzyp7N3FcyAbvek_KxJozGImQ,945
 rcsb_embedding_model/modules/structure_module.py,sha256=dEtDNdWo1j2sSDa0JiOHQfEfQzIWqSLEKpvOX0GrXZ4,1048
 rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
-rcsb_embedding_model/utils/data.py,sha256=ODz6GG6IAhgAlLh3tcIP6-JVHX8Bb_-E745Lvc_oR84,2934
+rcsb_embedding_model/utils/data.py,sha256=FVb6tzoX4SrJf3Fr6UFbxZJQsUr9xp5RbkK6nqXhcuQ,3222
 rcsb_embedding_model/utils/model.py,sha256=rpZa-gfm3cEtbBd7UXMHrZv3x6f0AC8TJT3gtrSxr5I,852
 rcsb_embedding_model/utils/structure_parser.py,sha256=IWMQ8brlEMe6_ND-DBESOli8vlqHxladTssjbM9RSKw,2751
 rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
 rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
-rcsb_embedding_model-0.0.17.dist-info/METADATA,sha256=ZIV-WqJGsSmZd79Ks455CTxSgN2J2JSD7vqr3XPx3nE,5368
-rcsb_embedding_model-0.0.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-rcsb_embedding_model-0.0.17.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
-rcsb_embedding_model-0.0.17.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
-rcsb_embedding_model-0.0.17.dist-info/RECORD,,
+rcsb_embedding_model-0.0.18.dist-info/METADATA,sha256=PzSnwGeAeUbYxhpRBgEiZZdj2bGdLrT8QAy0uB_BxNQ,5310
+rcsb_embedding_model-0.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+rcsb_embedding_model-0.0.18.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
+rcsb_embedding_model-0.0.18.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
+rcsb_embedding_model-0.0.18.dist-info/RECORD,,

{rcsb_embedding_model-0.0.17.dist-info → rcsb_embedding_model-0.0.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{rcsb_embedding_model-0.0.17.dist-info → rcsb_embedding_model-0.0.18.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{rcsb_embedding_model-0.0.17.dist-info → rcsb_embedding_model-0.0.18.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

rcsb-embedding-model 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

Potentially problematic release.

rcsb-embedding-model 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl