gemmi-protools 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -1,19 +1,28 @@
1
1
  import gzip
2
2
  import io
3
- import itertools
4
3
  import pathlib
5
4
  import random
6
5
  import string
7
6
  from collections import defaultdict
8
- from copy import deepcopy
9
7
  from typing import Dict, Optional, List
10
8
 
11
9
  import gemmi
12
10
  import numpy as np
13
11
  import pandas as pd
14
- from joblib import Parallel, delayed
15
12
  from scipy.spatial import cKDTree
16
13
 
14
+ ATOM = [("chain_name", "U5"),
15
+ ("residue_num", "i4"),
16
+ ("residue_icode", "U3"),
17
+ ("residue_name", "U5"),
18
+ ("atom_name", "U5"),
19
+ ("element", "U3"),
20
+ ("charge", "i1"),
21
+ ("b_factor", "f4"),
22
+ ("occupancy", "f4"),
23
+ ("coordinate", ("f4", (3,)))
24
+ ]
25
+
17
26
 
18
27
  def is_pdb(path: str) -> bool:
19
28
  """
@@ -111,6 +120,23 @@ def parse_cif(path: str) -> dict:
111
120
 
112
121
  info_map = dict(st.info)
113
122
  pdb_code = info_map.get("_entry.id", "").lower()
123
+
124
+ v1 = block.find_value("_refine.ls_d_res_high")
125
+ v2 = block.find_value("_em_3d_reconstruction.resolution")
126
+
127
+ resolution = 0.0
128
+ if v1 not in [".", "?", None]:
129
+ resolution = v1
130
+ elif v2 not in [".", "?", None]:
131
+ resolution = v2
132
+
133
+ try:
134
+ resolution = float(resolution)
135
+ except:
136
+ resolution = 0.0
137
+
138
+ st.resolution = resolution
139
+
114
140
  info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
115
141
  source=entityid2src,
116
142
  resolution=st.resolution,
@@ -301,8 +327,6 @@ class StructureParser(object):
301
327
  del self.STRUCT[idx]
302
328
 
303
329
  self.MODEL = self.STRUCT[0]
304
- self.STRUCT.remove_alternative_conformations()
305
- self.STRUCT.remove_hydrogens()
306
330
  self.STRUCT.remove_empty_chains()
307
331
  self._update_full_sequences()
308
332
 
@@ -757,17 +781,23 @@ class StructureParser(object):
757
781
  new_struct.INFO["source"] = src
758
782
  return new_struct
759
783
 
760
- def clean_structure(self, remove_ligand=True):
784
+ def clean_structure(self, remove_ligand=False, remove_hydrogen=True):
761
785
  """
762
786
  Remove water by default
763
787
 
764
- :param remove_ligand:
788
+ :param remove_ligand: bool, default False
789
+ :param remove_hydrogen: bool, default True
765
790
  :return:
766
791
  """
792
+ self.STRUCT.remove_alternative_conformations()
793
+
794
+ if remove_hydrogen:
795
+ self.STRUCT.remove_hydrogens()
796
+
767
797
  if remove_ligand:
768
- self.STRUCT.remove_waters()
769
- else:
770
798
  self.STRUCT.remove_ligands_and_waters()
799
+ else:
800
+ self.STRUCT.remove_waters()
771
801
 
772
802
  self.STRUCT.remove_empty_chains()
773
803
  self.update_entity()
@@ -782,11 +812,12 @@ class StructureParser(object):
782
812
  atom.name = 'SE'
783
813
  atom.element = gemmi.Element('Se')
784
814
 
785
- def get_atoms(self, arg: str = "*"):
815
+ def get_atoms(self, arg: str = "*", exclude_hydrogen=False):
786
816
  """
787
817
 
788
818
  :param arg: str, "*", "/1/*//N,CA,C,O", "/1/*"
789
819
  see gemmi.Selection
820
+ :param exclude_hydrogen: bool, default False
790
821
  :return:
791
822
  np.ndarray
792
823
  """
@@ -797,6 +828,9 @@ class StructureParser(object):
797
828
  for chain in sel.chains(model):
798
829
  for residue in sel.residues(chain):
799
830
  for atom in sel.atoms(residue):
831
+ if exclude_hydrogen and atom.is_hydrogen():
832
+ continue
833
+
800
834
  val = (chain.name,
801
835
  residue.seqid.num,
802
836
  residue.seqid.icode,
@@ -810,23 +844,12 @@ class StructureParser(object):
810
844
  )
811
845
  res.append(val)
812
846
 
813
- dtype = [("chain_name", "U5"),
814
- ("residue_num", "i4"),
815
- ("residue_icode", "U3"),
816
- ("residue_name", "U5"),
817
- ("atom_name", "U5"),
818
- ("element", "U3"),
819
- ("charge", "i1"),
820
- ("b_factor", "f4"),
821
- ("occupancy", "f4"),
822
- ("coordinate", ("f4", (3,)))
823
- ]
824
- return np.array(res, dtype=dtype)
825
-
826
- def polymer_interface_residues(self,
827
- chains_x: List[str],
828
- chains_y: List[str],
829
- threshold: float = 4.5):
847
+ return np.array(res, dtype=ATOM)
848
+
849
+ def compute_interface(self,
850
+ chains_x: List[str],
851
+ chains_y: List[str],
852
+ threshold: float = 5.0):
830
853
  """
831
854
  Identify PPI among protein, DNA, RNA using heavy atom distances.
832
855
  :param chains_x:
@@ -843,23 +866,8 @@ class StructureParser(object):
843
866
  % (ch, " ".join(list(self.polymer_types.keys())))
844
867
  )
845
868
 
846
- def ppi_atoms(struct, chains):
847
- # atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
848
- protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
849
- 'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
850
- 'SD', 'SG']
851
- xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
852
- 'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
853
- 'OP1', 'OP2', 'OP3', 'P']
854
- tag = "/1/%s//%s" % (",".join(chains), ",".join(protein_atoms + xna_atoms))
855
- z = struct.get_atoms(tag)
856
- return z
857
-
858
- query_struct = deepcopy(self)
859
- query_struct.clean_structure(remove_ligand=True)
860
-
861
- atom_x = ppi_atoms(query_struct, chains_x)
862
- atom_y = ppi_atoms(query_struct, chains_y)
869
+ atom_x = self.get_atoms("/1/%s" % ",".join(chains_x), exclude_hydrogen=True)
870
+ atom_y = self.get_atoms("/1/%s" % ",".join(chains_y), exclude_hydrogen=True)
863
871
 
864
872
  kd_tree_x = cKDTree(atom_x["coordinate"])
865
873
  kd_tree_y = cKDTree(atom_y["coordinate"])
@@ -869,38 +877,3 @@ class StructureParser(object):
869
877
  y_res = np.unique(atom_y[pairs.col][["chain_name", "residue_num", "residue_icode", "residue_name"]])
870
878
 
871
879
  return x_res, y_res
872
-
873
- def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
874
- """
875
- Identify PPI among protein, DNA, RNA using heavy atom distances between all chain pairs.
876
-
877
- :param ppi_threshold:
878
- :param n_cpus:
879
- :return:
880
- """
881
- chains = list(self.polymer_types.keys())
882
- ch_pairs = list(itertools.combinations(chains, r=2))
883
- ch_pairs.sort()
884
-
885
- def _run(ch_1, ch_2):
886
- key = "%s/%s" % (ch_1, ch_2)
887
- res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
888
-
889
- if len(res_x) > 0:
890
- vx = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_x.tolist()]
891
- vy = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_y.tolist()]
892
- return {key: [vx, vy]}
893
- else:
894
- return dict()
895
-
896
- cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
897
-
898
- outputs = dict()
899
- if cpu2use == 1 or len(ch_pairs) < 50:
900
- for ch_1, ch_2 in ch_pairs:
901
- outputs.update(_run(ch_1, ch_2))
902
- else:
903
- results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
904
- for item in results:
905
- outputs.update(item)
906
- return outputs
@@ -10,6 +10,7 @@ import uuid
10
10
  from collections import defaultdict
11
11
  from importlib.resources import files
12
12
 
13
+ import numpy as np
13
14
  from anarci import run_anarci
14
15
  from anarci.germlines import all_germlines
15
16
 
@@ -37,10 +38,21 @@ def get_fv_region(in_sequence: str):
37
38
  )
38
39
 
39
40
  mapper = dict()
41
+ num_mapper = dict()
40
42
  for k, v in imgt_scheme.items():
41
43
  for i in range(v[0], v[1] + 1):
42
44
  mapper[i] = k
43
45
 
46
+ if k == "cdr1":
47
+ ki = 1
48
+ elif k == "cdr2":
49
+ ki = 2
50
+ elif k == "cdr3":
51
+ ki = 3
52
+ else:
53
+ ki = 0
54
+ num_mapper[i] = ki
55
+
44
56
  inputs = [("input", in_sequence)]
45
57
  _, numbered, alignment_details, _ = run_anarci(inputs, scheme="imgt", assign_germline=True)
46
58
  if numbered[0] is None:
@@ -49,6 +61,16 @@ def get_fv_region(in_sequence: str):
49
61
  outputs = []
50
62
  for cur_numbered, cur_details in zip(numbered[0], alignment_details[0]):
51
63
  aligned_sites, start, end = cur_numbered
64
+ # add mask
65
+ # 9 for not Fv region
66
+ # 0 for non-CDR region, 1, 2, 3 for CDR region for the current Fv
67
+ mask = np.full(len(in_sequence), fill_value=9, dtype=np.int8)
68
+ mask[start: end + 1] = 0
69
+ i = 0
70
+ for (site_num, _), site_aa in aligned_sites:
71
+ if site_aa != "-":
72
+ mask[i + start] = num_mapper[site_num]
73
+ i += 1
52
74
 
53
75
  # region_seq
54
76
  regions = defaultdict(list)
@@ -119,6 +141,7 @@ def get_fv_region(in_sequence: str):
119
141
  cdr1_aa=cdr1_seq,
120
142
  cdr2_aa=cdr2_seq,
121
143
  cdr3_aa=cdr3_seq,
144
+ mask="".join([str(i) for i in mask.tolist()])
122
145
  )
123
146
  )
124
147
  return outputs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gemmi_protools
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: An Enhanced tool to process PDB structures based on Gemmi
5
5
  Author: Luo Jiejian
6
6
  Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
@@ -27,7 +27,7 @@ Dynamic: license-file
27
27
  # Install
28
28
  ```commandline
29
29
 
30
- conda install python=3.12.9 anarci hmmer dockq trimesh rtree -c bioconda -c conda-forge
30
+ conda install python=3.12.9 anarci hmmer dockq trimesh rtree pdbfixer -c bioconda -c conda-forge
31
31
  pip install gemmi_protools
32
32
  ```
33
33
 
@@ -6,14 +6,14 @@ gemmi_protools/data/MHC/MHC_combined.hmm.h3m,sha256=CvNMCsobQiX-wL7iB4CreNcbpnEl
6
6
  gemmi_protools/data/MHC/MHC_combined.hmm.h3p,sha256=-mK278pRedG3-KL-DtuVAQy7La9DgXg5FcP89D6X3Ck,78325
7
7
  gemmi_protools/io/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
8
8
  gemmi_protools/io/convert.py,sha256=A1i1vPgxG1LqMSUvWtegLl9LipgUQbfmKeGJ_f00UYo,3781
9
- gemmi_protools/io/reader.py,sha256=X4onV0IVl0Q7JVH0yg2Zy-8iPIZvRPM-aaxDapawiro,33617
9
+ gemmi_protools/io/reader.py,sha256=drJ8WO_N8VR3KAmvEJBsHHAgZj-hzfnM3fyIV1uc0gg,32047
10
10
  gemmi_protools/tools/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
11
11
  gemmi_protools/tools/align.py,sha256=oKHvpeDa62zEjLkPmuyBM6avYDl3HFeJVHeRX62I2f4,7085
12
12
  gemmi_protools/tools/dockq.py,sha256=baCuO5-GZCwrlS59T5UIXogpM44OIFIfXqksqRBAb0A,4428
13
13
  gemmi_protools/tools/mesh.py,sha256=73MuJYwS_ACJI15OsrooAAhB1Ti4fM8CJSBqFOBR7LU,6537
14
- gemmi_protools/tools/pdb_annot.py,sha256=enATyAHq0dE8TMsKQhsSbYj-baGrI33iviJdW2R7Hv8,8157
15
- gemmi_protools-1.0.2.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
16
- gemmi_protools-1.0.2.dist-info/METADATA,sha256=29ea2GIobnQjR6N0VQaI6MRvcH6UCG7cTpzETUUIrCE,1034
17
- gemmi_protools-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
- gemmi_protools-1.0.2.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
19
- gemmi_protools-1.0.2.dist-info/RECORD,,
14
+ gemmi_protools/tools/pdb_annot.py,sha256=fjEB7xmY-SmctW_lICtf2GZ63Odu9nADlyJuOen4W8o,8906
15
+ gemmi_protools-1.0.4.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
16
+ gemmi_protools-1.0.4.dist-info/METADATA,sha256=HU5R2A-uwFbMcqDTcVeNPr0U3L0XrmeYVA8DihfWfiI,1043
17
+ gemmi_protools-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ gemmi_protools-1.0.4.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
19
+ gemmi_protools-1.0.4.dist-info/RECORD,,