rcsb-embedding-model 0.0.28__py3-none-any.whl → 0.0.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rcsb-embedding-model might be problematic. Click here for more details.
- rcsb_embedding_model/dataset/esm_prot_from_chain.py +5 -5
- rcsb_embedding_model/dataset/esm_prot_from_structure.py +4 -2
- rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py +7 -6
- rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py +14 -9
- rcsb_embedding_model/dataset/residue_embedding_from_structure.py +7 -5
- rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py +2 -2
- rcsb_embedding_model/utils/structure_parser.py +1 -1
- {rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/METADATA +1 -1
- {rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/RECORD +12 -12
- {rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/WHEEL +0 -0
- {rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/entry_points.txt +0 -0
- {rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -59,10 +59,10 @@ class EsmProtFromChain(Dataset):
|
|
|
59
59
|
return len(self.data)
|
|
60
60
|
|
|
61
61
|
def __getitem__(self, idx):
|
|
62
|
-
src_name = self.data.
|
|
63
|
-
src_structure = self.data.
|
|
64
|
-
chain_id = self.data.
|
|
65
|
-
item_name = self.data.
|
|
62
|
+
src_name = self.data.iloc[idx][EsmProtFromChain.STREAM_NAME_ATTR]
|
|
63
|
+
src_structure = self.data.iloc[idx][EsmProtFromChain.STREAM_ATTR]
|
|
64
|
+
chain_id = self.data.iloc[idx][EsmProtFromChain.CH_ATTR]
|
|
65
|
+
item_name = self.data.iloc[idx][EsmProtFromChain.ITEM_NAME_ATTR]
|
|
66
66
|
structure = self.__structure_provider.get_structure(
|
|
67
67
|
src_name=src_name,
|
|
68
68
|
src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
|
|
@@ -87,7 +87,7 @@ if __name__ == '__main__':
|
|
|
87
87
|
src_stream=args.file_list,
|
|
88
88
|
src_location=SrcLocation.file,
|
|
89
89
|
structure_location=StructureLocation.remote,
|
|
90
|
-
structure_format=StructureFormat.
|
|
90
|
+
structure_format=StructureFormat.bciff,
|
|
91
91
|
)
|
|
92
92
|
|
|
93
93
|
esm3 = ESM3.from_pretrained(
|
|
@@ -40,7 +40,7 @@ class EsmProtFromStructure(EsmProtFromChain):
|
|
|
40
40
|
|
|
41
41
|
def __get_chains(self, src_stream):
|
|
42
42
|
chains = []
|
|
43
|
-
|
|
43
|
+
data = pd.DataFrame(
|
|
44
44
|
src_stream,
|
|
45
45
|
dtype=str,
|
|
46
46
|
columns=EsmProtFromStructure.COLUMNS
|
|
@@ -50,7 +50,9 @@ class EsmProtFromStructure(EsmProtFromChain):
|
|
|
50
50
|
index_col=None,
|
|
51
51
|
dtype=str,
|
|
52
52
|
names=EsmProtFromStructure.COLUMNS
|
|
53
|
-
)
|
|
53
|
+
)
|
|
54
|
+
data = data.sort_values(by=data.columns[0])
|
|
55
|
+
for idx, row in data.iterrows():
|
|
54
56
|
src_name = row[EsmProtFromStructure.STREAM_NAME_ATTR]
|
|
55
57
|
src_structure = row[EsmProtFromStructure.STREAM_ATTR]
|
|
56
58
|
item_name = row[EsmProtFromStructure.ITEM_NAME_ATTR]
|
|
@@ -33,7 +33,6 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
|
|
|
33
33
|
self.structure_format = structure_format
|
|
34
34
|
self.min_res_n = min_res_n
|
|
35
35
|
self.max_res_n = max_res_n
|
|
36
|
-
self.__structure_provider = structure_provider
|
|
37
36
|
super().__init__(
|
|
38
37
|
src_stream=self.__get_assemblies(src_stream),
|
|
39
38
|
res_embedding_location=res_embedding_location,
|
|
@@ -47,17 +46,19 @@ class ResidueAssemblyDatasetFromStructure(ResidueAssemblyEmbeddingFromTensorFile
|
|
|
47
46
|
|
|
48
47
|
def __get_assemblies(self, src_stream):
|
|
49
48
|
assemblies = []
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
49
|
+
data = pd.DataFrame(
|
|
50
|
+
src_stream,
|
|
51
|
+
dtype=str,
|
|
52
|
+
columns=ResidueAssemblyDatasetFromStructure.COLUMNS
|
|
54
53
|
) if self.src_location == SrcLocation.stream else pd.read_csv(
|
|
55
54
|
src_stream,
|
|
56
55
|
header=None,
|
|
57
56
|
index_col=None,
|
|
58
57
|
dtype=str,
|
|
59
58
|
names=ResidueAssemblyDatasetFromStructure.COLUMNS
|
|
60
|
-
)
|
|
59
|
+
)
|
|
60
|
+
data = data.sort_values(by=data.columns[0])
|
|
61
|
+
for idx, row in data.iterrows():
|
|
61
62
|
src_name = row[ResidueAssemblyDatasetFromStructure.STREAM_NAME_ATTR]
|
|
62
63
|
src_structure = row[ResidueAssemblyDatasetFromStructure.STREAM_ATTR]
|
|
63
64
|
structure = stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import argparse
|
|
1
2
|
import sys
|
|
2
3
|
|
|
3
4
|
import pandas as pd
|
|
@@ -58,11 +59,10 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
|
|
|
58
59
|
return len(self.data)
|
|
59
60
|
|
|
60
61
|
def __getitem__(self, idx):
|
|
61
|
-
src_name = self.data.
|
|
62
|
-
src_structure = self.data.
|
|
63
|
-
assembly_id = self.data.
|
|
64
|
-
item_name = self.data.
|
|
65
|
-
|
|
62
|
+
src_name = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.STREAM_NAME_ATTR]
|
|
63
|
+
src_structure = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.STREAM_ATTR]
|
|
64
|
+
assembly_id = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.ASSEMBLY_ATTR]
|
|
65
|
+
item_name = self.data.iloc[idx][ResidueAssemblyEmbeddingFromTensorFile.ITEM_NAME_ATTR]
|
|
66
66
|
structure = self.__structure_provider.get_structure(
|
|
67
67
|
src_name=src_name,
|
|
68
68
|
src_structure=stringio_from_url(src_structure) if self.structure_location == StructureLocation.remote else src_structure,
|
|
@@ -77,12 +77,17 @@ class ResidueAssemblyEmbeddingFromTensorFile(Dataset):
|
|
|
77
77
|
|
|
78
78
|
if __name__ == "__main__":
|
|
79
79
|
|
|
80
|
+
parser = argparse.ArgumentParser()
|
|
81
|
+
parser.add_argument('--file_list', type=argparse.FileType('r'), required=True)
|
|
82
|
+
parser.add_argument('--res_embeddings_path', required=True)
|
|
83
|
+
args = parser.parse_args()
|
|
84
|
+
|
|
80
85
|
dataset = ResidueAssemblyEmbeddingFromTensorFile(
|
|
81
|
-
src_stream=
|
|
82
|
-
res_embedding_location=
|
|
86
|
+
src_stream=args.file_list,
|
|
87
|
+
res_embedding_location=args.res_embeddings_path,
|
|
83
88
|
src_location=SrcLocation.file,
|
|
84
|
-
structure_location=StructureLocation.
|
|
85
|
-
structure_format=StructureFormat.
|
|
89
|
+
structure_location=StructureLocation.remote,
|
|
90
|
+
structure_format=StructureFormat.bciff
|
|
86
91
|
)
|
|
87
92
|
|
|
88
93
|
dataloader = DataLoader(
|
|
@@ -42,17 +42,19 @@ class ResidueEmbeddingFromStructure(ResidueEmbeddingFromTensorFile):
|
|
|
42
42
|
|
|
43
43
|
def __get_chains(self, src_stream):
|
|
44
44
|
chains = []
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
data = pd.DataFrame(
|
|
46
|
+
src_stream,
|
|
47
|
+
dtype=str,
|
|
48
|
+
columns=ResidueEmbeddingFromStructure.COLUMNS
|
|
49
49
|
) if self.src_location == SrcLocation.stream else pd.read_csv(
|
|
50
50
|
src_stream,
|
|
51
51
|
header=None,
|
|
52
52
|
index_col=None,
|
|
53
53
|
dtype=str,
|
|
54
54
|
names=ResidueEmbeddingFromStructure.COLUMNS
|
|
55
|
-
)
|
|
55
|
+
)
|
|
56
|
+
data = data.sort_values(by=data.columns[0])
|
|
57
|
+
for idx, row in data.iterrows():
|
|
56
58
|
src_name = row[ResidueEmbeddingFromStructure.STREAM_NAME_ATTR]
|
|
57
59
|
src_structure = row[ResidueEmbeddingFromStructure.STREAM_ATTR]
|
|
58
60
|
item_name = row[ResidueEmbeddingFromStructure.ITEM_NAME_ATTR]
|
|
@@ -39,6 +39,6 @@ class ResidueEmbeddingFromTensorFile(Dataset):
|
|
|
39
39
|
return len(self.data)
|
|
40
40
|
|
|
41
41
|
def __getitem__(self, idx):
|
|
42
|
-
embedding_src = self.data.
|
|
43
|
-
item_name = self.data.
|
|
42
|
+
embedding_src = self.data.iloc[idx][ResidueEmbeddingFromTensorFile.FILE_ATTR]
|
|
43
|
+
item_name = self.data.iloc[idx][ResidueEmbeddingFromTensorFile.ITEM_NAME_ATTR]
|
|
44
44
|
return torch.load(embedding_src, map_location=torch.device('cpu')), item_name
|
|
@@ -32,7 +32,7 @@ def get_protein_chains(structure, min_res_n=0):
|
|
|
32
32
|
for atom_ch in chain_iter(structure):
|
|
33
33
|
atom_res = atom_ch[filter_polymer(atom_ch)]
|
|
34
34
|
atom_res = atom_res[filter_amino_acids(atom_res)]
|
|
35
|
-
if len(atom_res) > 0 and len(get_residues(atom_res)) > min_res_n:
|
|
35
|
+
if len(atom_res) > 0 and len(get_residues(atom_res)[0]) > min_res_n:
|
|
36
36
|
chain_ids.append(str(get_chains(atom_res)[0]))
|
|
37
37
|
return tuple(chain_ids)
|
|
38
38
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb-embedding-model
|
|
3
|
-
Version: 0.0.28
|
|
3
|
+
Version: 0.0.30
|
|
4
4
|
Summary: Protein Embedding Model for Structure Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
|
|
6
6
|
Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
|
|
@@ -2,12 +2,12 @@ rcsb_embedding_model/__init__.py,sha256=r3gLdeBIXkQEQA_K6QcRPO-TtYuAQSutk6pXRUE_
|
|
|
2
2
|
rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
|
|
3
3
|
rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
|
|
4
4
|
rcsb_embedding_model/cli/inference.py,sha256=tfMvHAhkUIzJ2RbTtQjq7eWmOUrSyVfH5bjTkCCSIS8,19500
|
|
5
|
-
rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=
|
|
6
|
-
rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=
|
|
7
|
-
rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=
|
|
8
|
-
rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=
|
|
9
|
-
rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=
|
|
10
|
-
rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=
|
|
5
|
+
rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=l8pRgRpz-8gEzR6QsNcmdnX_KkgvgF1vdqNAsP8Lrc8,3960
|
|
6
|
+
rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=3HzXCCc-UqmZNbJaeXHyUsSIZZxMc2erbxAPGIxSmfE,2621
|
|
7
|
+
rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=69h1VkrIXesHZi1cG3BOMMytSDeRzcBBP0_Z3Xz3dM8,2869
|
|
8
|
+
rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=Hd9oH-IVgY6d7Dxy5VfiwHvSaK-Wwhk6ccUBgOwl0TU,3740
|
|
9
|
+
rcsb_embedding_model/dataset/residue_embedding_from_structure.py,sha256=1jmeEcCK41cAi2ZnqQkd667NWCAIGS3k6jGDF-WxtTk,2854
|
|
10
|
+
rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py,sha256=4OPaw55yGKHjY2iPpCnemcfwfmTZ4j5VrGQ2oIMQw6A,1343
|
|
11
11
|
rcsb_embedding_model/inference/assembly_inferece.py,sha256=8fPJjEXy1WsM5XB5U7KfdO5-Du6nEsawsaAjmWoXA9I,2329
|
|
12
12
|
rcsb_embedding_model/inference/chain_inference.py,sha256=6f5wVzjtRtHU3BPMTe5k3nH_Nl440Am8BL8h1vmK1jI,2925
|
|
13
13
|
rcsb_embedding_model/inference/esm_inference.py,sha256=rn6H43D8BYzMZbMu7UPsLYg2dgERmmpci5weNItrG5Q,2546
|
|
@@ -20,11 +20,11 @@ rcsb_embedding_model/modules/structure_module.py,sha256=4js02XzKvhc_G26ELsGhJ9SC
|
|
|
20
20
|
rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
|
|
21
21
|
rcsb_embedding_model/utils/data.py,sha256=BOjYdIRHrFqk8qFuKGrgCtVyfDupzgOVmH_0C-ecMvg,3813
|
|
22
22
|
rcsb_embedding_model/utils/model.py,sha256=xr3p02ohOgJ5UInwdIupN68Oq4yvNFhxobZRacS1adg,953
|
|
23
|
-
rcsb_embedding_model/utils/structure_parser.py,sha256=
|
|
23
|
+
rcsb_embedding_model/utils/structure_parser.py,sha256=eq1Jpmeo5oqqo_kzYbyIrqDysBAZR3II0mZK3ygaYhs,2754
|
|
24
24
|
rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
|
|
25
25
|
rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
|
|
26
|
-
rcsb_embedding_model-0.0.
|
|
27
|
-
rcsb_embedding_model-0.0.
|
|
28
|
-
rcsb_embedding_model-0.0.
|
|
29
|
-
rcsb_embedding_model-0.0.
|
|
30
|
-
rcsb_embedding_model-0.0.
|
|
26
|
+
rcsb_embedding_model-0.0.30.dist-info/METADATA,sha256=IjxTnTFbYAIGu99NNAH2MII46PvFMZXEkim5STOCNTU,5310
|
|
27
|
+
rcsb_embedding_model-0.0.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
28
|
+
rcsb_embedding_model-0.0.30.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
|
|
29
|
+
rcsb_embedding_model-0.0.30.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
|
|
30
|
+
rcsb_embedding_model-0.0.30.dist-info/RECORD,,
|
|
File without changes
|
{rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{rcsb_embedding_model-0.0.28.dist-info → rcsb_embedding_model-0.0.30.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|