gemmi-protools 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -1,19 +1,28 @@
1
1
  import gzip
2
2
  import io
3
- import itertools
4
3
  import pathlib
5
4
  import random
6
5
  import string
7
6
  from collections import defaultdict
8
- from copy import deepcopy
9
7
  from typing import Dict, Optional, List
10
8
 
11
9
  import gemmi
12
10
  import numpy as np
13
11
  import pandas as pd
14
- from joblib import Parallel, delayed
15
12
  from scipy.spatial import cKDTree
16
13
 
14
+ ATOM = [("chain_name", "U5"),
15
+ ("residue_num", "i4"),
16
+ ("residue_icode", "U3"),
17
+ ("residue_name", "U5"),
18
+ ("atom_name", "U5"),
19
+ ("element", "U3"),
20
+ ("charge", "i1"),
21
+ ("b_factor", "f4"),
22
+ ("occupancy", "f4"),
23
+ ("coordinate", ("f4", (3,)))
24
+ ]
25
+
17
26
 
18
27
  def is_pdb(path: str) -> bool:
19
28
  """
@@ -318,8 +327,6 @@ class StructureParser(object):
318
327
  del self.STRUCT[idx]
319
328
 
320
329
  self.MODEL = self.STRUCT[0]
321
- self.STRUCT.remove_alternative_conformations()
322
- self.STRUCT.remove_hydrogens()
323
330
  self.STRUCT.remove_empty_chains()
324
331
  self._update_full_sequences()
325
332
 
@@ -774,17 +781,23 @@ class StructureParser(object):
774
781
  new_struct.INFO["source"] = src
775
782
  return new_struct
776
783
 
777
- def clean_structure(self, remove_ligand=True):
784
+ def clean_structure(self, remove_ligand=False, remove_hydrogen=True):
778
785
  """
779
786
  Remove water by default
780
787
 
781
- :param remove_ligand:
788
+ :param remove_ligand: bool, default False
789
+ :param remove_hydrogen: bool, default True
782
790
  :return:
783
791
  """
792
+ self.STRUCT.remove_alternative_conformations()
793
+
794
+ if remove_hydrogen:
795
+ self.STRUCT.remove_hydrogens()
796
+
784
797
  if remove_ligand:
785
- self.STRUCT.remove_waters()
786
- else:
787
798
  self.STRUCT.remove_ligands_and_waters()
799
+ else:
800
+ self.STRUCT.remove_waters()
788
801
 
789
802
  self.STRUCT.remove_empty_chains()
790
803
  self.update_entity()
@@ -799,11 +812,12 @@ class StructureParser(object):
799
812
  atom.name = 'SE'
800
813
  atom.element = gemmi.Element('Se')
801
814
 
802
- def get_atoms(self, arg: str = "*"):
815
+ def get_atoms(self, arg: str = "*", exclude_hydrogen=False):
803
816
  """
804
817
 
805
818
  :param arg: str, "*", "/1/*//N,CA,C,O", "/1/*"
806
819
  see gemmi.Selection
820
+ :param exclude_hydrogen: bool, default False
807
821
  :return:
808
822
  np.ndarray
809
823
  """
@@ -814,6 +828,9 @@ class StructureParser(object):
814
828
  for chain in sel.chains(model):
815
829
  for residue in sel.residues(chain):
816
830
  for atom in sel.atoms(residue):
831
+ if exclude_hydrogen and atom.is_hydrogen():
832
+ continue
833
+
817
834
  val = (chain.name,
818
835
  residue.seqid.num,
819
836
  residue.seqid.icode,
@@ -827,23 +844,12 @@ class StructureParser(object):
827
844
  )
828
845
  res.append(val)
829
846
 
830
- dtype = [("chain_name", "U5"),
831
- ("residue_num", "i4"),
832
- ("residue_icode", "U3"),
833
- ("residue_name", "U5"),
834
- ("atom_name", "U5"),
835
- ("element", "U3"),
836
- ("charge", "i1"),
837
- ("b_factor", "f4"),
838
- ("occupancy", "f4"),
839
- ("coordinate", ("f4", (3,)))
840
- ]
841
- return np.array(res, dtype=dtype)
842
-
843
- def polymer_interface_residues(self,
844
- chains_x: List[str],
845
- chains_y: List[str],
846
- threshold: float = 4.5):
847
+ return np.array(res, dtype=ATOM)
848
+
849
+ def compute_interface(self,
850
+ chains_x: List[str],
851
+ chains_y: List[str],
852
+ threshold: float = 5.0):
847
853
  """
848
854
  Identify PPI among protein, DNA, RNA using heavy atom distances.
849
855
  :param chains_x:
@@ -860,23 +866,8 @@ class StructureParser(object):
860
866
  % (ch, " ".join(list(self.polymer_types.keys())))
861
867
  )
862
868
 
863
- def ppi_atoms(struct, chains):
864
- # atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
865
- protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
866
- 'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
867
- 'SD', 'SG']
868
- xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
869
- 'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
870
- 'OP1', 'OP2', 'OP3', 'P']
871
- tag = "/1/%s//%s" % (",".join(chains), ",".join(protein_atoms + xna_atoms))
872
- z = struct.get_atoms(tag)
873
- return z
874
-
875
- query_struct = deepcopy(self)
876
- query_struct.clean_structure(remove_ligand=True)
877
-
878
- atom_x = ppi_atoms(query_struct, chains_x)
879
- atom_y = ppi_atoms(query_struct, chains_y)
869
+ atom_x = self.get_atoms("/1/%s" % ",".join(chains_x), exclude_hydrogen=True)
870
+ atom_y = self.get_atoms("/1/%s" % ",".join(chains_y), exclude_hydrogen=True)
880
871
 
881
872
  kd_tree_x = cKDTree(atom_x["coordinate"])
882
873
  kd_tree_y = cKDTree(atom_y["coordinate"])
@@ -886,38 +877,3 @@ class StructureParser(object):
886
877
  y_res = np.unique(atom_y[pairs.col][["chain_name", "residue_num", "residue_icode", "residue_name"]])
887
878
 
888
879
  return x_res, y_res
889
-
890
- def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
891
- """
892
- Identify PPI among protein, DNA, RNA using heavy atom distances between all chain pairs.
893
-
894
- :param ppi_threshold:
895
- :param n_cpus:
896
- :return:
897
- """
898
- chains = list(self.polymer_types.keys())
899
- ch_pairs = list(itertools.combinations(chains, r=2))
900
- ch_pairs.sort()
901
-
902
- def _run(ch_1, ch_2):
903
- key = "%s/%s" % (ch_1, ch_2)
904
- res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
905
-
906
- if len(res_x) > 0:
907
- vx = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_x.tolist()]
908
- vy = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_y.tolist()]
909
- return {key: [vx, vy]}
910
- else:
911
- return dict()
912
-
913
- cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
914
-
915
- outputs = dict()
916
- if cpu2use == 1 or len(ch_pairs) < 50:
917
- for ch_1, ch_2 in ch_pairs:
918
- outputs.update(_run(ch_1, ch_2))
919
- else:
920
- results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
921
- for item in results:
922
- outputs.update(item)
923
- return outputs
@@ -10,6 +10,7 @@ import uuid
10
10
  from collections import defaultdict
11
11
  from importlib.resources import files
12
12
 
13
+ import numpy as np
13
14
  from anarci import run_anarci
14
15
  from anarci.germlines import all_germlines
15
16
 
@@ -37,10 +38,21 @@ def get_fv_region(in_sequence: str):
37
38
  )
38
39
 
39
40
  mapper = dict()
41
+ num_mapper = dict()
40
42
  for k, v in imgt_scheme.items():
41
43
  for i in range(v[0], v[1] + 1):
42
44
  mapper[i] = k
43
45
 
46
+ if k == "cdr1":
47
+ ki = 1
48
+ elif k == "cdr2":
49
+ ki = 2
50
+ elif k == "cdr3":
51
+ ki = 3
52
+ else:
53
+ ki = 0
54
+ num_mapper[i] = ki
55
+
44
56
  inputs = [("input", in_sequence)]
45
57
  _, numbered, alignment_details, _ = run_anarci(inputs, scheme="imgt", assign_germline=True)
46
58
  if numbered[0] is None:
@@ -49,6 +61,16 @@ def get_fv_region(in_sequence: str):
49
61
  outputs = []
50
62
  for cur_numbered, cur_details in zip(numbered[0], alignment_details[0]):
51
63
  aligned_sites, start, end = cur_numbered
64
+ # add mask
65
+ # 9 for not Fv region
66
+ # 0 for non-CDR region, 1, 2, 3 for CDR region for the current Fv
67
+ mask = np.full(len(in_sequence), fill_value=9, dtype=np.int8)
68
+ mask[start: end + 1] = 0
69
+ i = 0
70
+ for (site_num, _), site_aa in aligned_sites:
71
+ if site_aa != "-":
72
+ mask[i + start] = num_mapper[site_num]
73
+ i += 1
52
74
 
53
75
  # region_seq
54
76
  regions = defaultdict(list)
@@ -119,6 +141,7 @@ def get_fv_region(in_sequence: str):
119
141
  cdr1_aa=cdr1_seq,
120
142
  cdr2_aa=cdr2_seq,
121
143
  cdr3_aa=cdr3_seq,
144
+ mask="".join([str(i) for i in mask.tolist()])
122
145
  )
123
146
  )
124
147
  return outputs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gemmi_protools
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: An Enhanced tool to process PDB structures based on Gemmi
5
5
  Author: Luo Jiejian
6
6
  Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
@@ -27,7 +27,7 @@ Dynamic: license-file
27
27
  # Install
28
28
  ```commandline
29
29
 
30
- conda install python=3.12.9 anarci hmmer dockq trimesh rtree -c bioconda -c conda-forge
30
+ conda install python=3.12.9 anarci hmmer dockq trimesh rtree pdbfixer -c bioconda -c conda-forge
31
31
  pip install gemmi_protools
32
32
  ```
33
33
 
@@ -6,14 +6,14 @@ gemmi_protools/data/MHC/MHC_combined.hmm.h3m,sha256=CvNMCsobQiX-wL7iB4CreNcbpnEl
6
6
  gemmi_protools/data/MHC/MHC_combined.hmm.h3p,sha256=-mK278pRedG3-KL-DtuVAQy7La9DgXg5FcP89D6X3Ck,78325
7
7
  gemmi_protools/io/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
8
8
  gemmi_protools/io/convert.py,sha256=A1i1vPgxG1LqMSUvWtegLl9LipgUQbfmKeGJ_f00UYo,3781
9
- gemmi_protools/io/reader.py,sha256=-O9h5CIGPDnE1rDGMZqfApXk3_LRacNt_DaQ0zACkto,33992
9
+ gemmi_protools/io/reader.py,sha256=drJ8WO_N8VR3KAmvEJBsHHAgZj-hzfnM3fyIV1uc0gg,32047
10
10
  gemmi_protools/tools/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
11
11
  gemmi_protools/tools/align.py,sha256=oKHvpeDa62zEjLkPmuyBM6avYDl3HFeJVHeRX62I2f4,7085
12
12
  gemmi_protools/tools/dockq.py,sha256=baCuO5-GZCwrlS59T5UIXogpM44OIFIfXqksqRBAb0A,4428
13
13
  gemmi_protools/tools/mesh.py,sha256=73MuJYwS_ACJI15OsrooAAhB1Ti4fM8CJSBqFOBR7LU,6537
14
- gemmi_protools/tools/pdb_annot.py,sha256=enATyAHq0dE8TMsKQhsSbYj-baGrI33iviJdW2R7Hv8,8157
15
- gemmi_protools-1.0.3.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
16
- gemmi_protools-1.0.3.dist-info/METADATA,sha256=mfEiC_hb27lnZzrW5f3EMpuEi_ri81w0xNsl1PXqVLE,1034
17
- gemmi_protools-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
- gemmi_protools-1.0.3.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
19
- gemmi_protools-1.0.3.dist-info/RECORD,,
14
+ gemmi_protools/tools/pdb_annot.py,sha256=fjEB7xmY-SmctW_lICtf2GZ63Odu9nADlyJuOen4W8o,8906
15
+ gemmi_protools-1.0.4.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
16
+ gemmi_protools-1.0.4.dist-info/METADATA,sha256=HU5R2A-uwFbMcqDTcVeNPr0U3L0XrmeYVA8DihfWfiI,1043
17
+ gemmi_protools-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ gemmi_protools-1.0.4.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
19
+ gemmi_protools-1.0.4.dist-info/RECORD,,