rcsb-embedding-model 0.0.28__tar.gz → 0.0.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (50)
  1. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/.gitignore +2 -1
  2. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/PKG-INFO +1 -1
  3. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/pyproject.toml +1 -1
  4. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/esm_prot_from_chain.py +5 -5
  5. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/esm_prot_from_structure.py +4 -2
  6. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py +7 -6
  7. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py +14 -9
  8. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/residue_embedding_from_structure.py +7 -5
  9. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py +2 -2
  10. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/utils/structure_parser.py +1 -1
  11. rcsb_embedding_model-0.0.30/tests/resources/src_stream/assembly-complete-test.csv +7 -0
  12. rcsb_embedding_model-0.0.30/tests/resources/src_stream/instance-complete-test.csv +10 -0
  13. rcsb_embedding_model-0.0.30/tests/resources/src_stream/instance.csv +2 -0
  14. rcsb_embedding_model-0.0.30/tests/test_cli_inference.py +53 -0
  15. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/test_remote_inference.py +18 -0
  16. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/.dockerignore +0 -0
  17. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/.github/workflows/_workflow-docker.yaml +0 -0
  18. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/.github/workflows/publish.yaml +0 -0
  19. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/Dockerfile +0 -0
  20. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/LICENSE.md +0 -0
  21. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/README.md +0 -0
  22. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/assets/embedding-model-architecture.png +0 -0
  23. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/examples/esm_embeddings.py +0 -0
  24. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/__init__.py +0 -0
  25. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/cli/args_utils.py +0 -0
  26. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/cli/inference.py +0 -0
  27. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/inference/assembly_inferece.py +0 -0
  28. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/inference/chain_inference.py +0 -0
  29. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/inference/esm_inference.py +0 -0
  30. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/inference/structure_inference.py +0 -0
  31. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/model/layers.py +0 -0
  32. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/model/residue_embedding_aggregator.py +0 -0
  33. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/modules/chain_module.py +0 -0
  34. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/modules/esm_module.py +0 -0
  35. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/modules/structure_module.py +0 -0
  36. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/rcsb_structure_embedding.py +0 -0
  37. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/types/api_types.py +0 -0
  38. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/utils/data.py +0 -0
  39. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/utils/model.py +0 -0
  40. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/utils/structure_provider.py +0 -0
  41. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/src/rcsb_embedding_model/writer/batch_writer.py +0 -0
  42. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/embeddings/1acb.A.pt +0 -0
  43. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/embeddings/1acb.B.pt +0 -0
  44. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/embeddings/2uzi.A.pt +0 -0
  45. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/embeddings/2uzi.B.pt +0 -0
  46. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/embeddings/2uzi.C.pt +0 -0
  47. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/pdb/1acb.cif +0 -0
  48. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/resources/pdb/2uzi.cif +0 -0
  49. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/test_embedding_model.py +0 -0
  50. {rcsb_embedding_model-0.0.28 → rcsb_embedding_model-0.0.30}/tests/test_inference.py +0 -0
@@ -2,4 +2,5 @@
2
2
  /rcsb-embedding-model.iml
3
3
  /dist/
4
4
  /.pypi.rc
5
- __pycache__
5
+ __pycache__
6
+ /tests/resources/tmp
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb-embedding-model
3
- Version: 0.0.28
3
+ Version: 0.0.30
4
4
  Summary: Protein Embedding Model for Structure Search
5
5
  Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
6
6
  Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "rcsb-embedding-model"
3
- version = "0.0.28"
3
+ version = "0.0.30"
4
4
  authors = [
5
5
  { name="Joan Segura", email="joan.segura@rcsb.org" },
6
6
  ]
@@ -59,10 +59,10 @@ class EsmProtFromChain(Dataset):
59
59
  return len(self.data)
60
60
 
61
61
  def __getitem__(self, idx):
62
- src_name = self.data.loc[idx, EsmProtFromChain.STREAM_NAME_ATTR]
63
- src_structure = self.data.loc[idx, EsmProtFromChain.STREAM_ATTR]
64
- chain_id = self.data.loc[idx, EsmProtFromChain.CH_ATTR]
65
- item_name = self.data.loc[idx, EsmProtFromChain.ITEM_NAME_ATTR]
62
+ src_name = self.data.iloc[idx][EsmProtFromChain.STREAM_NAME_ATTR]
63
+ src_structure = self.data.iloc[idx][EsmProtFromChain.STREAM_ATTR]
64
+ chain_id = self.data.iloc[idx][EsmProtFromChain.CH_ATTR]
65
+ item_name = self.data.iloc[idx][EsmProtFromChain.ITEM_NAME_ATTR]
66
66
  structure = self.__structure_provider.get_structure(
67
67
  src_name=src_name,
68
68
  src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
@@ -87,7 +87,7 @@ if __name__ == '__main__':
87
87
  src_stream=args.file_list,
88
88
  src_location=SrcLocation.file,
89
89
  structure_location=StructureLocation.remote,
90
- structure_format=StructureFormat.mmcif,
90
+ structure_format=StructureFormat.bciff,
91
91
  )
92
92
 
93
93
  esm3 = ESM3.from_pretrained(
@@ -40,7 +40,7 @@ class EsmProtFromStructure(EsmProtFromChain):
40
40
 
41
41
  def __get_chains(self, src_stream):
42
42
  chains = []
43
- for idx, row in (pd.DataFrame(
43
+ data = pd.DataFrame(
44
44
  src_stream,
45
45
  dtype=str,
46
46
  columns=EsmProtFromStructure.COLUMNS
@@ -50,7 +50,9 @@ class EsmProtFromStructure(EsmProtFromChain):
50
50
  index_col=None,
51
51
  dtype=str,
52
52
  names=EsmProtFromStructure.COLUMNS
53
- )).iterrows():
53
+ )
54
+ data = data.sort_values(by=data.columns[0])
55
+ for idx, row in data.iterrows():
54
56
  src_name = row[EsmProtFromStructure.STREAM_NAME_ATTR]
55
57
  src_structure = row[EsmProtFromStructure.STREAM_ATTR]
56
58
  item_name = row[EsmProtFromStructure.ITEM_NAME_ATTR]
@@ -33,7 +33,6 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
33
33
  self.structure_format = structure_format
34
34
  self.min_res_n = min_res_n
35
35
  self.max_res_n = max_res_n
36
- self.__structure_provider = structure_provider
37
36
  super().__init__(
38
37
  src_stream=self.__get_assemblies(src_stream),
39
38
  res_embedding_location=res_embedding_location,
@@ -47,17 +46,19 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
47
46
 
48
47
  def __get_assemblies(self, src_stream):
49
48
  assemblies = []
50
- for idx, row in (pd.DataFrame(
51
- src_stream,
52
- dtype=str,
53
- columns=ResidueAssemblyDatasetFromStructure.COLUMNS
49
+ data = pd.DataFrame(
50
+ src_stream,
51
+ dtype=str,
52
+ columns=ResidueAssemblyDatasetFromStructure.COLUMNS
54
53
  ) if self.src_location == SrcLocation.stream else pd.read_csv(
55
54
  src_stream,
56
55
  header=None,
57
56
  index_col=None,
58
57
  dtype=str,
59
58
  names=ResidueAssemblyDatasetFromStructure.COLUMNS
60
- )).iterrows():
59
+ )
60
+ data = data.sort_values(by=data.columns[0])
61
+ for idx, row in data.iterrows():
61
62
  src_name = row[ResidueAssemblyDatasetFromStructure.STREAM_NAME_ATTR]
62
63
  src_structure = row[ResidueAssemblyDatasetFromStructure.STREAM_ATTR]
63
64
  structure = stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure
@@ -1,3 +1,4 @@
1
+ import argparse
1
2
  import sys
2
3
 
3
4
  import pandas as pd
@@ -58,11 +59,10 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
58
59
  return len(self.data)
59
60
 
60
61
  def __getitem__(self, idx):
61
- src_name = self.data.loc[idx, ResidueAssemblyEmbeddingFromTensorFile.STREAM_NAME_ATTR]
62
- src_structure = self.data.loc[idx, ResidueAssemblyEmbeddingFromTensorFile.STREAM_ATTR]
63
- assembly_id = self.data.loc[idx, ResidueAssemblyEmbeddingFromTensorFile.ASSEMBLY_ATTR]
64
- item_name = self.data.loc[idx, ResidueAssemblyEmbeddingFromTensorFile.ITEM_NAME_ATTR]
65
-
62
+ src_name = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.STREAM_NAME_ATTR]
63
+ src_structure = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.STREAM_ATTR]
64
+ assembly_id = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.ASSEMBLY_ATTR]
65
+ item_name = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.ITEM_NAME_ATTR]
66
66
  structure = self.__structure_provider.get_structure(
67
67
  src_name=src_name,
68
68
  src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
@@ -77,12 +77,17 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
77
77
 
78
78
  if __name__ == "__main__":
79
79
 
80
+ parser = argparse.ArgumentParser()
81
+ parser.add_argument('--file_list', type=argparse.FileType('r'), required=True)
82
+ parser.add_argument('--res_embeddings_path', required=True)
83
+ args = parser.parse_args()
84
+
80
85
  dataset = ResidueAssemblyEmbeddingFromTensorFile(
81
- src_stream="/Users/joan/tmp/assembly-test.csv",
82
- res_embedding_location="/Users/joan/tmp",
86
+ src_stream=args.file_list,
87
+ res_embedding_location=args.res_embeddings_path,
83
88
  src_location=SrcLocation.file,
84
- structure_location=StructureLocation.local,
85
- structure_format=StructureFormat.mmcif
89
+ structure_location=StructureLocation.remote,
90
+ structure_format=StructureFormat.bciff
86
91
  )
87
92
 
88
93
  dataloader = DataLoader(
@@ -42,17 +42,19 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
42
42
 
43
43
  def __get_chains(self, src_stream):
44
44
  chains = []
45
- for idx, row in (pd.DataFrame(
46
- src_stream,
47
- dtype=str,
48
- columns=ResidueEmbeddingFromStructure.COLUMNS
45
+ data = pd.DataFrame(
46
+ src_stream,
47
+ dtype=str,
48
+ columns=ResidueEmbeddingFromStructure.COLUMNS
49
49
  ) if self.src_location == SrcLocation.stream else pd.read_csv(
50
50
  src_stream,
51
51
  header=None,
52
52
  index_col=None,
53
53
  dtype=str,
54
54
  names=ResidueEmbeddingFromStructure.COLUMNS
55
- )).iterrows():
55
+ )
56
+ data = data.sort_values(by=data.columns[0])
57
+ for idx, row in data.iterrows():
56
58
  src_name = row[ResidueEmbeddingFromStructure.STREAM_NAME_ATTR]
57
59
  src_structure = row[ResidueEmbeddingFromStructure.STREAM_ATTR]
58
60
  item_name = row[ResidueEmbeddingFromStructure.ITEM_NAME_ATTR]
@@ -39,6 +39,6 @@ class ResidueEmbeddingFromTensorFile(Dataset):
39
39
  return len(self.data)
40
40
 
41
41
  def __getitem__(self, idx):
42
- embedding_src = self.data.loc[idx, ResidueEmbeddingFromTensorFile.FILE_ATTR]
43
- item_name = self.data.loc[idx, ResidueEmbeddingFromTensorFile.ITEM_NAME_ATTR]
42
+ embedding_src = self.data.iloc[idx][ResidueEmbeddingFromTensorFile.FILE_ATTR]
43
+ item_name = self.data.iloc[idx][ResidueEmbeddingFromTensorFile.ITEM_NAME_ATTR]
44
44
  return torch.load(embedding_src, map_location=torch.device('cpu')), item_name
@@ -32,7 +32,7 @@ def get_protein_chains(structure, min_res_n=0):
32
32
  for atom_ch in chain_iter(structure):
33
33
  atom_res = atom_ch[filter_polymer(atom_ch)]
34
34
  atom_res = atom_res[filter_amino_acids(atom_res)]
35
- if len(atom_res) > 0 and len(get_residues(atom_res)) > min_res_n:
35
+ if len(atom_res) > 0 and len(get_residues(atom_res)[0]) > min_res_n:
36
36
  chain_ids.append(str(get_chains(atom_res)[0]))
37
37
  return tuple(chain_ids)
38
38
 
@@ -0,0 +1,7 @@
1
+ 1A21,https://models.rcsb.org/1A21.bcif.gz,1,1A21-1
2
+ 1A21,https://models.rcsb.org/1A21.bcif.gz,2,1A21-2
3
+ 1A3J,https://models.rcsb.org/1A3J.bcif.gz,1,1A3J-1
4
+ 1A3X,https://models.rcsb.org/1A3X.bcif.gz,1,1A3X-1
5
+ 1A3X,https://models.rcsb.org/1A3X.bcif.gz,2,1A3X-2
6
+ 1AIV,https://models.rcsb.org/1AIV.bcif.gz,1,1AIV-1
7
+ 1AU1,https://models.rcsb.org/1AU1.bcif.gz,1,1AU1-1
@@ -0,0 +1,10 @@
1
+ 1A21,https://models.rcsb.org/1A21.bcif.gz,A,1A21.A
2
+ 1A21,https://models.rcsb.org/1A21.bcif.gz,B,1A21.B
3
+ 1A3J,https://models.rcsb.org/1A3J.bcif.gz,A,1A3J.A
4
+ 1A3J,https://models.rcsb.org/1A3J.bcif.gz,B,1A3J.B
5
+ 1A3J,https://models.rcsb.org/1A3J.bcif.gz,C,1A3J.C
6
+ 1A3X,https://models.rcsb.org/1A3X.bcif.gz,A,1A3X.A
7
+ 1A3X,https://models.rcsb.org/1A3X.bcif.gz,B,1A3X.B
8
+ 1AIV,https://models.rcsb.org/1AIV.bcif.gz,A,1AIV.A
9
+ 1AU1,https://models.rcsb.org/1AU1.bcif.gz,A,1AU1.A
10
+ 1AU1,https://models.rcsb.org/1AU1.bcif.gz,B,1AU1.B
@@ -0,0 +1,2 @@
1
+ 1acb,https://models.rcsb.org/1acb.bcif.gz,A,1acb.A
2
+ 2uzi,https://models.rcsb.org/2uzi.bcif.gz,A,2uzi.A
@@ -0,0 +1,53 @@
1
+
2
+ import os
3
+ import shutil
4
+ import unittest
5
+
6
+ from rcsb_embedding_model.types.api_types import OutFormat, StructureLocation, StructureFormat, Accelerator
7
+
8
+
9
+
10
+
11
+
12
class TestCliInference(unittest.TestCase):
    """Exercise the ``complete_embedding`` CLI entry point end to end."""

    __test_path = os.path.dirname(__file__)

    def test_complete_inference(self):
        """Run the complete pipeline and check both grouped outputs exist.

        Residue, chain and assembly outputs all go to the same scratch
        directory, which is emptied before the run.
        """
        scratch_dir = f"{self.__test_path}/resources/tmp"
        chain_csv = f"{self.__test_path}/resources/src_stream/instance-complete-test.csv"
        assembly_csv = f"{self.__test_path}/resources/src_stream/assembly-complete-test.csv"

        # Start from an empty scratch directory before running inference.
        _remove_files_in_directory(f"{self.__test_path}/resources/tmp")

        from rcsb_embedding_model.cli.inference import complete_embedding

        complete_embedding(
            src_chain_file=chain_csv,
            src_assembly_file=assembly_csv,
            output_res_path=scratch_dir,
            output_chain_path=scratch_dir,
            output_assembly_path=scratch_dir,
            output_format=OutFormat.grouped,
            output_chain_name="instance-inference",
            output_assembly_name="assembly-inference",
            structure_location=StructureLocation.remote,
            structure_format=StructureFormat.bciff,
            min_res_n=0,
            batch_size_res=1,
            num_workers_res=0,
            batch_size_chain=1,
            num_workers_chain=0,
            batch_size_assembly=1,
            num_workers_assembly=0,
            num_nodes=1,
            accelerator=Accelerator.cpu,
        )

        # One archive is expected for chains and one for assemblies.
        expected_outputs = (
            f"{self.__test_path}/resources/tmp/instance-inference.json.gz",
            f"{self.__test_path}/resources/tmp/assembly-inference.json.gz",
        )
        for output_file in expected_outputs:
            self.assertTrue(os.path.exists(output_file))
41
+
42
+
43
def _remove_files_in_directory(directory_path):
    """Ensure ``directory_path`` exists and remove everything inside it.

    The directory is created when missing. Real sub-directories are removed
    recursively; every other entry (regular files, symlinks of any kind,
    including broken ones) is unlinked. Deletion is best effort: a failure
    on one entry is reported on stdout and the remaining entries are still
    processed.

    Args:
        directory_path: Path of the directory to create and/or empty.
    """
    os.makedirs(directory_path, exist_ok=True)
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            # shutil.rmtree refuses to operate on symlinks, and a link must
            # never cause its target to be deleted, so only real directories
            # go through rmtree; links are unlinked like plain files.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            else:
                os.unlink(file_path)
        except OSError as e:
            # Keep going so one undeletable entry does not block the rest.
            print(f"Failed to delete {file_path}. Reason: {e}")
@@ -50,6 +50,24 @@ class TestRemoteInference(unittest.TestCase):
50
50
  self.assertEqual(tuple(esm_embeddings[idx][0][0].shape), shape)
51
51
 
52
52
 
53
def test_esm_inference_from_csv_bcif_gz(self):
    """Check ESM inference driven by a CSV file listing remote structures.

    The call passes ``StructureLocation.remote`` and ``StructureFormat.bciff``;
    two results are expected, each with the embedding shape listed below.
    """
    from rcsb_embedding_model.inference.esm_inference import predict

    expected_shapes = ((243, 1536), (116, 1536))

    predictions = predict(
        src_stream=f"{self.__test_path}/resources/src_stream/instance.csv",
        src_location=SrcLocation.file,
        src_from=SrcProteinFrom.chain,
        structure_location=StructureLocation.remote,
        structure_format=StructureFormat.bciff,
        accelerator=Accelerator.cpu,
    )

    self.assertEqual(len(predictions), 2)
    for position, expected in enumerate(expected_shapes):
        observed = tuple(predictions[position][0][0].shape)
        self.assertEqual(observed, expected)
69
+
70
+
53
71
  def test_esm_inference_from_cif_gz(self):
54
72
  from rcsb_embedding_model.inference.esm_inference import predict
55
73