rcsb-embedding-model 0.0.32__py3-none-any.whl → 0.0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rcsb-embedding-model might be problematic. Click here for more details.
- rcsb_embedding_model/dataset/esm_prot_from_chain.py +10 -6
- rcsb_embedding_model/utils/data.py +4 -6
- rcsb_embedding_model/utils/structure_parser.py +42 -34
- {rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/METADATA +1 -1
- {rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/RECORD +8 -8
- {rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/WHEEL +0 -0
- {rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/entry_points.txt +0 -0
- {rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -11,7 +11,7 @@ import pandas as pd
|
|
|
11
11
|
|
|
12
12
|
from rcsb_embedding_model.types.api_types import StructureFormat, StructureLocation, SrcLocation
|
|
13
13
|
from rcsb_embedding_model.utils.data import stringio_from_url
|
|
14
|
-
from rcsb_embedding_model.utils.structure_parser import
|
|
14
|
+
from rcsb_embedding_model.utils.structure_parser import rename_atom_attr,filter_residues
|
|
15
15
|
from rcsb_embedding_model.utils.structure_provider import StructureProvider
|
|
16
16
|
|
|
17
17
|
|
|
@@ -72,13 +72,17 @@ class EsmProtFromChain(Dataset):
|
|
|
72
72
|
for atom_ch in chain_iter(structure):
|
|
73
73
|
if len(atom_ch) == 0:
|
|
74
74
|
raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
75
|
+
atom_ch = filter_residues(atom_ch)
|
|
76
|
+
atom_ch = rename_atom_attr(atom_ch)
|
|
77
|
+
try:
|
|
78
|
+
protein_chain = ProteinChain.from_atomarray(atom_ch)
|
|
79
|
+
protein_chain = ESMProtein.from_protein_chain(protein_chain)
|
|
80
|
+
except Exception as e:
|
|
81
|
+
raise IOError(f"Error while creating ESMProtein from structure chain {src_name}.{chain_id}: {e}")
|
|
82
|
+
|
|
80
83
|
if len(protein_chain) == 0:
|
|
81
84
|
raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
|
|
85
|
+
|
|
82
86
|
return protein_chain, item_name
|
|
83
87
|
raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
|
|
84
88
|
|
|
@@ -4,6 +4,7 @@ import gzip
|
|
|
4
4
|
from io import StringIO, BytesIO
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
+
from requests import RequestException
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
def collate_seq_embeddings(batch_list):
|
|
@@ -52,13 +53,10 @@ def stringio_from_url(url):
|
|
|
52
53
|
return StringIO(f.read())
|
|
53
54
|
else:
|
|
54
55
|
return StringIO(response.text)
|
|
55
|
-
except
|
|
56
|
-
|
|
57
|
-
return None
|
|
56
|
+
except RequestException as e:
|
|
57
|
+
raise RuntimeError(f"Error fetching URL: {e}")
|
|
58
58
|
except (OSError, gzip.BadGzipFile) as e:
|
|
59
|
-
|
|
60
|
-
return None
|
|
61
|
-
|
|
59
|
+
raise RuntimeError(f"Error decompressing gzip file: {e}")
|
|
62
60
|
|
|
63
61
|
|
|
64
62
|
def concatenate_tensors(file_list, max_residues, dim=0):
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import numpy as np
|
|
2
|
+
from biotite.structure import filter_amino_acids, filter_polymer, chain_iter, get_chains, get_residues, \
|
|
3
|
+
filter_peptide_backbone, residue_iter, array
|
|
2
4
|
from biotite.structure.io.pdb import PDBFile, get_structure as get_pdb_structure, get_assembly as get_pdb_assembly, list_assemblies as list_pdb_assemblies
|
|
3
5
|
from biotite.structure.io.pdbx import CIFFile, get_structure, get_assembly, BinaryCIFFile, list_assemblies
|
|
4
6
|
|
|
@@ -9,17 +11,20 @@ def get_structure_from_src(
|
|
|
9
11
|
chain_id=None,
|
|
10
12
|
assembly_id=None
|
|
11
13
|
):
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
14
|
+
try:
|
|
15
|
+
if structure_format == "pdb":
|
|
16
|
+
pdb_file = PDBFile.read(src_structure)
|
|
17
|
+
structure = __get_pdb_structure(pdb_file, assembly_id)
|
|
18
|
+
elif structure_format == "mmcif":
|
|
19
|
+
cif_file = CIFFile.read(src_structure)
|
|
20
|
+
structure = __get_structure(cif_file, assembly_id)
|
|
21
|
+
elif structure_format == "binarycif":
|
|
22
|
+
cif_file = BinaryCIFFile.read(src_structure)
|
|
23
|
+
structure = __get_structure(cif_file, assembly_id)
|
|
24
|
+
else:
|
|
25
|
+
raise RuntimeError(f"Unknown file format {structure_format}")
|
|
26
|
+
except Exception as e:
|
|
27
|
+
raise RuntimeError(f"Error reading structure from {src_structure}: {e}")
|
|
23
28
|
|
|
24
29
|
if chain_id is not None:
|
|
25
30
|
return structure[structure.chain_id == chain_id]
|
|
@@ -38,34 +43,29 @@ def get_protein_chains(structure, min_res_n=0):
|
|
|
38
43
|
|
|
39
44
|
|
|
40
45
|
def get_assemblies(structure, structure_format="mmcif"):
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
try:
|
|
47
|
+
if structure_format == "pdb":
|
|
48
|
+
return tuple(list_pdb_assemblies(PDBFile.read(structure)))
|
|
49
|
+
elif structure_format == "mmcif":
|
|
50
|
+
return tuple(list_assemblies(CIFFile.read(structure)).keys())
|
|
51
|
+
elif structure_format == "binarycif":
|
|
52
|
+
return tuple(list_assemblies(BinaryCIFFile.read(structure)))
|
|
53
|
+
else:
|
|
54
|
+
raise RuntimeError(f"Unknown file format {structure_format}")
|
|
55
|
+
except Exception as e:
|
|
56
|
+
raise RuntimeError(f"Error reading assemblies from {structure}: {e}")
|
|
49
57
|
|
|
50
58
|
|
|
51
|
-
def
|
|
52
|
-
|
|
53
|
-
for idx, atom in enumerate(atom_ch):
|
|
54
|
-
atom.chain_id = ch
|
|
55
|
-
renamed_atom_ch[idx] = atom
|
|
56
|
-
return renamed_atom_ch
|
|
59
|
+
def rename_atom_attr(atom_ch):
|
|
60
|
+
return array([__rename_atom(a) for a in atom_ch])
|
|
57
61
|
|
|
58
62
|
|
|
59
|
-
def
|
|
60
|
-
|
|
61
|
-
for idx, atom in enumerate(atom_ch):
|
|
62
|
-
atom.hetero = False
|
|
63
|
-
renamed_atom_ch[idx] = atom
|
|
64
|
-
return renamed_atom_ch
|
|
63
|
+
def filter_residues(atom_ch):
|
|
64
|
+
return atom_ch[filter_amino_acids(atom_ch)]
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def
|
|
68
|
-
return
|
|
67
|
+
def get_backbone_atoms(atom_ch):
|
|
68
|
+
return np.array([(lambda x: [a.coord for a in x])(r) for r in residue_iter(atom_ch[filter_peptide_backbone(atom_ch)])])
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
def __get_pdb_structure(pdb_file, assembly_id=None):
|
|
@@ -79,6 +79,14 @@ def __get_pdb_structure(pdb_file, assembly_id=None):
|
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
|
|
82
|
+
def __rename_atom(atom):
|
|
83
|
+
atom.chain_id = "A"
|
|
84
|
+
atom.hetero = False
|
|
85
|
+
if len(atom.res_name) > 3:
|
|
86
|
+
atom.res_name = 'UNK'
|
|
87
|
+
return atom
|
|
88
|
+
|
|
89
|
+
|
|
82
90
|
def __get_structure(cif_file, assembly_id=None):
|
|
83
91
|
return get_structure(
|
|
84
92
|
cif_file,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb-embedding-model
|
|
3
|
-
Version: 0.0.32
|
|
3
|
+
Version: 0.0.34
|
|
4
4
|
Summary: Protein Embedding Model for Structure Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
|
|
6
6
|
Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
|
|
@@ -2,7 +2,7 @@ rcsb_embedding_model/__init__.py,sha256=7YfYO-V-u__19eAZfQ3t5Gf2qrhd_gwQB8rHO0J0
|
|
|
2
2
|
rcsb_embedding_model/rcsb_structure_embedding.py,sha256=dKp9hXQO0JAnO4SEfjJ_mG_jHu3UxAPguv6jkOjp-BI,4487
|
|
3
3
|
rcsb_embedding_model/cli/args_utils.py,sha256=7nP2q8pL5dWK_U7opxtWmoFcYVwasky6elHk-dASFaI,165
|
|
4
4
|
rcsb_embedding_model/cli/inference.py,sha256=67_Tr3LWeA3T4KS5mkjq6tw77Ypy0R8IwMxEG2FwVqQ,19901
|
|
5
|
-
rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=
|
|
5
|
+
rcsb_embedding_model/dataset/esm_prot_from_chain.py,sha256=p0IqLjGj1v-0sKGbrpWfkyYZXSurF3OOVkVk0g3zxyE,4410
|
|
6
6
|
rcsb_embedding_model/dataset/esm_prot_from_structure.py,sha256=3HzXCCc-UqmZNbJaeXHyUsSIZZxMc2erbxAPGIxSmfE,2621
|
|
7
7
|
rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py,sha256=69h1VkrIXesHZi1cG3BOMMytSDeRzcBBP0_Z3Xz3dM8,2869
|
|
8
8
|
rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py,sha256=Hd9oH-IVgY6d7Dxy5VfiwHvSaK-Wwhk6ccUBgOwl0TU,3740
|
|
@@ -18,13 +18,13 @@ rcsb_embedding_model/modules/chain_module.py,sha256=KsZw2uagO4rpAKWv6ivqEMxIEzgt
|
|
|
18
18
|
rcsb_embedding_model/modules/esm_module.py,sha256=otJRbCb319nCCob_4E1W_UClhkex9eDqcCyzWQO-vIs,740
|
|
19
19
|
rcsb_embedding_model/modules/structure_module.py,sha256=4js02XzKvhc_G26ELsGhJ9SCi_wlvtVolObxfWt3BhE,1077
|
|
20
20
|
rcsb_embedding_model/types/api_types.py,sha256=SCwALwvEb0KRKaoWKbuN7JyfOH-1whsI0Z4ki41dht8,1235
|
|
21
|
-
rcsb_embedding_model/utils/data.py,sha256=
|
|
21
|
+
rcsb_embedding_model/utils/data.py,sha256=ThrcYycIizsV_Ycn6PPxF12JRr1m2K-v8TsIaVqx10A,3816
|
|
22
22
|
rcsb_embedding_model/utils/model.py,sha256=xr3p02ohOgJ5UInwdIupN68Oq4yvNFhxobZRacS1adg,953
|
|
23
|
-
rcsb_embedding_model/utils/structure_parser.py,sha256=
|
|
23
|
+
rcsb_embedding_model/utils/structure_parser.py,sha256=fSIbq_a_aEigCWY_1dUcW9d9Law0ZDOcZAxJlZL0Rt8,3377
|
|
24
24
|
rcsb_embedding_model/utils/structure_provider.py,sha256=eWtxjkPpmRfmil_DKR1J6miaXR3lQ28DF5O0qrqSgGA,786
|
|
25
25
|
rcsb_embedding_model/writer/batch_writer.py,sha256=rTFNasB0Xp4-XCNTXKeEWZxSrb7lvZytoRldJUWn9Jg,3312
|
|
26
|
-
rcsb_embedding_model-0.0.32.dist-info/METADATA,sha256=
|
|
27
|
-
rcsb_embedding_model-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
28
|
-
rcsb_embedding_model-0.0.32.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
|
|
29
|
-
rcsb_embedding_model-0.0.32.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
|
|
30
|
-
rcsb_embedding_model-0.0.32.dist-info/RECORD,,
|
|
26
|
+
rcsb_embedding_model-0.0.34.dist-info/METADATA,sha256=zL59HhwjvhQAOrh_hJAMQ_997sXCnVzboStFC-j5qog,5351
|
|
27
|
+
rcsb_embedding_model-0.0.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
28
|
+
rcsb_embedding_model-0.0.34.dist-info/entry_points.txt,sha256=MK11jTIEmaV-x4CkPX5IymDaVs7Ky_f2xxU8BJVZ_9Q,69
|
|
29
|
+
rcsb_embedding_model-0.0.34.dist-info/licenses/LICENSE.md,sha256=oUaHiKgfBkChth_Sm67WemEvatO1U0Go8LHjaskXY0w,1522
|
|
30
|
+
rcsb_embedding_model-0.0.34.dist-info/RECORD,,
|
|
File without changes
|
{rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{rcsb_embedding_model-0.0.32.dist-info → rcsb_embedding_model-0.0.34.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|