gemmi-protools 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gemmi-protools might be problematic. Click here for more details.
- gemmi_protools/io/reader.py +52 -79
- gemmi_protools/tools/pdb_annot.py +23 -0
- {gemmi_protools-1.0.2.dist-info → gemmi_protools-1.0.4.dist-info}/METADATA +2 -2
- {gemmi_protools-1.0.2.dist-info → gemmi_protools-1.0.4.dist-info}/RECORD +7 -7
- {gemmi_protools-1.0.2.dist-info → gemmi_protools-1.0.4.dist-info}/WHEEL +0 -0
- {gemmi_protools-1.0.2.dist-info → gemmi_protools-1.0.4.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-1.0.2.dist-info → gemmi_protools-1.0.4.dist-info}/top_level.txt +0 -0
gemmi_protools/io/reader.py
CHANGED
|
@@ -1,19 +1,28 @@
|
|
|
1
1
|
import gzip
|
|
2
2
|
import io
|
|
3
|
-
import itertools
|
|
4
3
|
import pathlib
|
|
5
4
|
import random
|
|
6
5
|
import string
|
|
7
6
|
from collections import defaultdict
|
|
8
|
-
from copy import deepcopy
|
|
9
7
|
from typing import Dict, Optional, List
|
|
10
8
|
|
|
11
9
|
import gemmi
|
|
12
10
|
import numpy as np
|
|
13
11
|
import pandas as pd
|
|
14
|
-
from joblib import Parallel, delayed
|
|
15
12
|
from scipy.spatial import cKDTree
|
|
16
13
|
|
|
14
|
+
ATOM = [("chain_name", "U5"),
|
|
15
|
+
("residue_num", "i4"),
|
|
16
|
+
("residue_icode", "U3"),
|
|
17
|
+
("residue_name", "U5"),
|
|
18
|
+
("atom_name", "U5"),
|
|
19
|
+
("element", "U3"),
|
|
20
|
+
("charge", "i1"),
|
|
21
|
+
("b_factor", "f4"),
|
|
22
|
+
("occupancy", "f4"),
|
|
23
|
+
("coordinate", ("f4", (3,)))
|
|
24
|
+
]
|
|
25
|
+
|
|
17
26
|
|
|
18
27
|
def is_pdb(path: str) -> bool:
|
|
19
28
|
"""
|
|
@@ -111,6 +120,23 @@ def parse_cif(path: str) -> dict:
|
|
|
111
120
|
|
|
112
121
|
info_map = dict(st.info)
|
|
113
122
|
pdb_code = info_map.get("_entry.id", "").lower()
|
|
123
|
+
|
|
124
|
+
v1 = block.find_value("_refine.ls_d_res_high")
|
|
125
|
+
v2 = block.find_value("_em_3d_reconstruction.resolution")
|
|
126
|
+
|
|
127
|
+
resolution = 0.0
|
|
128
|
+
if v1 not in [".", "?", None]:
|
|
129
|
+
resolution = v1
|
|
130
|
+
elif v2 not in [".", "?", None]:
|
|
131
|
+
resolution = v2
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
resolution = float(resolution)
|
|
135
|
+
except:
|
|
136
|
+
resolution = 0.0
|
|
137
|
+
|
|
138
|
+
st.resolution = resolution
|
|
139
|
+
|
|
114
140
|
info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
|
|
115
141
|
source=entityid2src,
|
|
116
142
|
resolution=st.resolution,
|
|
@@ -301,8 +327,6 @@ class StructureParser(object):
|
|
|
301
327
|
del self.STRUCT[idx]
|
|
302
328
|
|
|
303
329
|
self.MODEL = self.STRUCT[0]
|
|
304
|
-
self.STRUCT.remove_alternative_conformations()
|
|
305
|
-
self.STRUCT.remove_hydrogens()
|
|
306
330
|
self.STRUCT.remove_empty_chains()
|
|
307
331
|
self._update_full_sequences()
|
|
308
332
|
|
|
@@ -757,17 +781,23 @@ class StructureParser(object):
|
|
|
757
781
|
new_struct.INFO["source"] = src
|
|
758
782
|
return new_struct
|
|
759
783
|
|
|
760
|
-
def clean_structure(self, remove_ligand=True):
|
|
784
|
+
def clean_structure(self, remove_ligand=False, remove_hydrogen=True):
|
|
761
785
|
"""
|
|
762
786
|
Remove water by default
|
|
763
787
|
|
|
764
|
-
:param remove_ligand:
|
|
788
|
+
:param remove_ligand: bool, default False
|
|
789
|
+
:param remove_hydrogen: bool, default True
|
|
765
790
|
:return:
|
|
766
791
|
"""
|
|
792
|
+
self.STRUCT.remove_alternative_conformations()
|
|
793
|
+
|
|
794
|
+
if remove_hydrogen:
|
|
795
|
+
self.STRUCT.remove_hydrogens()
|
|
796
|
+
|
|
767
797
|
if remove_ligand:
|
|
768
|
-
self.STRUCT.remove_waters()
|
|
769
|
-
else:
|
|
770
798
|
self.STRUCT.remove_ligands_and_waters()
|
|
799
|
+
else:
|
|
800
|
+
self.STRUCT.remove_waters()
|
|
771
801
|
|
|
772
802
|
self.STRUCT.remove_empty_chains()
|
|
773
803
|
self.update_entity()
|
|
@@ -782,11 +812,12 @@ class StructureParser(object):
|
|
|
782
812
|
atom.name = 'SE'
|
|
783
813
|
atom.element = gemmi.Element('Se')
|
|
784
814
|
|
|
785
|
-
def get_atoms(self, arg: str = "*"):
|
|
815
|
+
def get_atoms(self, arg: str = "*", exclude_hydrogen=False):
|
|
786
816
|
"""
|
|
787
817
|
|
|
788
818
|
:param arg: str, "*", "/1/*//N,CA,C,O", "/1/*"
|
|
789
819
|
see gemmi.Selection
|
|
820
|
+
:param exclude_hydrogen: bool, default False
|
|
790
821
|
:return:
|
|
791
822
|
np.ndarray
|
|
792
823
|
"""
|
|
@@ -797,6 +828,9 @@ class StructureParser(object):
|
|
|
797
828
|
for chain in sel.chains(model):
|
|
798
829
|
for residue in sel.residues(chain):
|
|
799
830
|
for atom in sel.atoms(residue):
|
|
831
|
+
if exclude_hydrogen and atom.is_hydrogen():
|
|
832
|
+
continue
|
|
833
|
+
|
|
800
834
|
val = (chain.name,
|
|
801
835
|
residue.seqid.num,
|
|
802
836
|
residue.seqid.icode,
|
|
@@ -810,23 +844,12 @@ class StructureParser(object):
|
|
|
810
844
|
)
|
|
811
845
|
res.append(val)
|
|
812
846
|
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
("charge", "i1"),
|
|
820
|
-
("b_factor", "f4"),
|
|
821
|
-
("occupancy", "f4"),
|
|
822
|
-
("coordinate", ("f4", (3,)))
|
|
823
|
-
]
|
|
824
|
-
return np.array(res, dtype=dtype)
|
|
825
|
-
|
|
826
|
-
def polymer_interface_residues(self,
|
|
827
|
-
chains_x: List[str],
|
|
828
|
-
chains_y: List[str],
|
|
829
|
-
threshold: float = 4.5):
|
|
847
|
+
return np.array(res, dtype=ATOM)
|
|
848
|
+
|
|
849
|
+
def compute_interface(self,
|
|
850
|
+
chains_x: List[str],
|
|
851
|
+
chains_y: List[str],
|
|
852
|
+
threshold: float = 5.0):
|
|
830
853
|
"""
|
|
831
854
|
Identify PPI among protein, DNA, RNA using heavy atom distances.
|
|
832
855
|
:param chains_x:
|
|
@@ -843,23 +866,8 @@ class StructureParser(object):
|
|
|
843
866
|
% (ch, " ".join(list(self.polymer_types.keys())))
|
|
844
867
|
)
|
|
845
868
|
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
|
|
849
|
-
'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
|
|
850
|
-
'SD', 'SG']
|
|
851
|
-
xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
|
|
852
|
-
'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
|
|
853
|
-
'OP1', 'OP2', 'OP3', 'P']
|
|
854
|
-
tag = "/1/%s//%s" % (",".join(chains), ",".join(protein_atoms + xna_atoms))
|
|
855
|
-
z = struct.get_atoms(tag)
|
|
856
|
-
return z
|
|
857
|
-
|
|
858
|
-
query_struct = deepcopy(self)
|
|
859
|
-
query_struct.clean_structure(remove_ligand=True)
|
|
860
|
-
|
|
861
|
-
atom_x = ppi_atoms(query_struct, chains_x)
|
|
862
|
-
atom_y = ppi_atoms(query_struct, chains_y)
|
|
869
|
+
atom_x = self.get_atoms("/1/%s" % ",".join(chains_x), exclude_hydrogen=True)
|
|
870
|
+
atom_y = self.get_atoms("/1/%s" % ",".join(chains_y), exclude_hydrogen=True)
|
|
863
871
|
|
|
864
872
|
kd_tree_x = cKDTree(atom_x["coordinate"])
|
|
865
873
|
kd_tree_y = cKDTree(atom_y["coordinate"])
|
|
@@ -869,38 +877,3 @@ class StructureParser(object):
|
|
|
869
877
|
y_res = np.unique(atom_y[pairs.col][["chain_name", "residue_num", "residue_icode", "residue_name"]])
|
|
870
878
|
|
|
871
879
|
return x_res, y_res
|
|
872
|
-
|
|
873
|
-
def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
|
|
874
|
-
"""
|
|
875
|
-
Identify PPI among protein, DNA, RNA using heavy atom distances between all chain pairs.
|
|
876
|
-
|
|
877
|
-
:param ppi_threshold:
|
|
878
|
-
:param n_cpus:
|
|
879
|
-
:return:
|
|
880
|
-
"""
|
|
881
|
-
chains = list(self.polymer_types.keys())
|
|
882
|
-
ch_pairs = list(itertools.combinations(chains, r=2))
|
|
883
|
-
ch_pairs.sort()
|
|
884
|
-
|
|
885
|
-
def _run(ch_1, ch_2):
|
|
886
|
-
key = "%s/%s" % (ch_1, ch_2)
|
|
887
|
-
res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
|
|
888
|
-
|
|
889
|
-
if len(res_x) > 0:
|
|
890
|
-
vx = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_x.tolist()]
|
|
891
|
-
vy = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_y.tolist()]
|
|
892
|
-
return {key: [vx, vy]}
|
|
893
|
-
else:
|
|
894
|
-
return dict()
|
|
895
|
-
|
|
896
|
-
cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
|
|
897
|
-
|
|
898
|
-
outputs = dict()
|
|
899
|
-
if cpu2use == 1 or len(ch_pairs) < 50:
|
|
900
|
-
for ch_1, ch_2 in ch_pairs:
|
|
901
|
-
outputs.update(_run(ch_1, ch_2))
|
|
902
|
-
else:
|
|
903
|
-
results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
|
|
904
|
-
for item in results:
|
|
905
|
-
outputs.update(item)
|
|
906
|
-
return outputs
|
|
@@ -10,6 +10,7 @@ import uuid
|
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
from importlib.resources import files
|
|
12
12
|
|
|
13
|
+
import numpy as np
|
|
13
14
|
from anarci import run_anarci
|
|
14
15
|
from anarci.germlines import all_germlines
|
|
15
16
|
|
|
@@ -37,10 +38,21 @@ def get_fv_region(in_sequence: str):
|
|
|
37
38
|
)
|
|
38
39
|
|
|
39
40
|
mapper = dict()
|
|
41
|
+
num_mapper = dict()
|
|
40
42
|
for k, v in imgt_scheme.items():
|
|
41
43
|
for i in range(v[0], v[1] + 1):
|
|
42
44
|
mapper[i] = k
|
|
43
45
|
|
|
46
|
+
if k == "cdr1":
|
|
47
|
+
ki = 1
|
|
48
|
+
elif k == "cdr2":
|
|
49
|
+
ki = 2
|
|
50
|
+
elif k == "cdr3":
|
|
51
|
+
ki = 3
|
|
52
|
+
else:
|
|
53
|
+
ki = 0
|
|
54
|
+
num_mapper[i] = ki
|
|
55
|
+
|
|
44
56
|
inputs = [("input", in_sequence)]
|
|
45
57
|
_, numbered, alignment_details, _ = run_anarci(inputs, scheme="imgt", assign_germline=True)
|
|
46
58
|
if numbered[0] is None:
|
|
@@ -49,6 +61,16 @@ def get_fv_region(in_sequence: str):
|
|
|
49
61
|
outputs = []
|
|
50
62
|
for cur_numbered, cur_details in zip(numbered[0], alignment_details[0]):
|
|
51
63
|
aligned_sites, start, end = cur_numbered
|
|
64
|
+
# add mask
|
|
65
|
+
# 9 for not Fv region
|
|
66
|
+
# 0 for non-CDR region, 1, 2, 3 for CDR region for the current Fv
|
|
67
|
+
mask = np.full(len(in_sequence), fill_value=9, dtype=np.int8)
|
|
68
|
+
mask[start: end + 1] = 0
|
|
69
|
+
i = 0
|
|
70
|
+
for (site_num, _), site_aa in aligned_sites:
|
|
71
|
+
if site_aa != "-":
|
|
72
|
+
mask[i + start] = num_mapper[site_num]
|
|
73
|
+
i += 1
|
|
52
74
|
|
|
53
75
|
# region_seq
|
|
54
76
|
regions = defaultdict(list)
|
|
@@ -119,6 +141,7 @@ def get_fv_region(in_sequence: str):
|
|
|
119
141
|
cdr1_aa=cdr1_seq,
|
|
120
142
|
cdr2_aa=cdr2_seq,
|
|
121
143
|
cdr3_aa=cdr3_seq,
|
|
144
|
+
mask="".join([str(i) for i in mask.tolist()])
|
|
122
145
|
)
|
|
123
146
|
)
|
|
124
147
|
return outputs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gemmi_protools
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: An Enhanced tool to process PDB structures based on Gemmi
|
|
5
5
|
Author: Luo Jiejian
|
|
6
6
|
Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
|
|
@@ -27,7 +27,7 @@ Dynamic: license-file
|
|
|
27
27
|
# Install
|
|
28
28
|
```commandline
|
|
29
29
|
|
|
30
|
-
conda install python=3.12.9 anarci hmmer dockq trimesh rtree -c bioconda -c conda-forge
|
|
30
|
+
conda install python=3.12.9 anarci hmmer dockq trimesh rtree pdbfixer -c bioconda -c conda-forge
|
|
31
31
|
pip install gemmi_protools
|
|
32
32
|
```
|
|
33
33
|
|
|
@@ -6,14 +6,14 @@ gemmi_protools/data/MHC/MHC_combined.hmm.h3m,sha256=CvNMCsobQiX-wL7iB4CreNcbpnEl
|
|
|
6
6
|
gemmi_protools/data/MHC/MHC_combined.hmm.h3p,sha256=-mK278pRedG3-KL-DtuVAQy7La9DgXg5FcP89D6X3Ck,78325
|
|
7
7
|
gemmi_protools/io/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
|
|
8
8
|
gemmi_protools/io/convert.py,sha256=A1i1vPgxG1LqMSUvWtegLl9LipgUQbfmKeGJ_f00UYo,3781
|
|
9
|
-
gemmi_protools/io/reader.py,sha256=
|
|
9
|
+
gemmi_protools/io/reader.py,sha256=drJ8WO_N8VR3KAmvEJBsHHAgZj-hzfnM3fyIV1uc0gg,32047
|
|
10
10
|
gemmi_protools/tools/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
|
|
11
11
|
gemmi_protools/tools/align.py,sha256=oKHvpeDa62zEjLkPmuyBM6avYDl3HFeJVHeRX62I2f4,7085
|
|
12
12
|
gemmi_protools/tools/dockq.py,sha256=baCuO5-GZCwrlS59T5UIXogpM44OIFIfXqksqRBAb0A,4428
|
|
13
13
|
gemmi_protools/tools/mesh.py,sha256=73MuJYwS_ACJI15OsrooAAhB1Ti4fM8CJSBqFOBR7LU,6537
|
|
14
|
-
gemmi_protools/tools/pdb_annot.py,sha256=
|
|
15
|
-
gemmi_protools-1.0.
|
|
16
|
-
gemmi_protools-1.0.
|
|
17
|
-
gemmi_protools-1.0.
|
|
18
|
-
gemmi_protools-1.0.
|
|
19
|
-
gemmi_protools-1.0.
|
|
14
|
+
gemmi_protools/tools/pdb_annot.py,sha256=fjEB7xmY-SmctW_lICtf2GZ63Odu9nADlyJuOen4W8o,8906
|
|
15
|
+
gemmi_protools-1.0.4.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
|
|
16
|
+
gemmi_protools-1.0.4.dist-info/METADATA,sha256=HU5R2A-uwFbMcqDTcVeNPr0U3L0XrmeYVA8DihfWfiI,1043
|
|
17
|
+
gemmi_protools-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
18
|
+
gemmi_protools-1.0.4.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
|
|
19
|
+
gemmi_protools-1.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|