gemmi-protools 0.1.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -2,33 +2,26 @@
 @Author: Luo Jiejian
 """
 import os
-import pathlib
 import re
 import shutil
 import subprocess
 import tempfile
-import uuid
-from copy import deepcopy
-from typing import Union, Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional
 
 import numpy as np
 from Bio.PDB import Superimposer
-from typeguard import typechecked
 
 from gemmi_protools.io.convert import gemmi2bio, bio2gemmi
 from gemmi_protools.io.reader import StructureParser
 
 
 class StructureAligner(object):
-    @typechecked
-    def __init__(self, query_path: Union[str, pathlib.Path], ref_path: Union[str, pathlib.Path]):
+    def __init__(self, query_path: str, ref_path: str):
         self._query_st = StructureParser()
         self._query_st.load_from_file(query_path)
-        self._query_st.set_default_model()
 
         self._ref_st = StructureParser()
         self._ref_st.load_from_file(ref_path)
-        self._ref_st.set_default_model()
 
         self.values = dict()
         self.rot_mat = None
@@ -49,8 +42,7 @@ class StructureAligner(object):
         return _path
 
     @staticmethod
-    @typechecked
-    def __parser_rotation_matrix(matrix_file: Union[str, pathlib.Path]):
+    def __parser_rotation_matrix(matrix_file: str):
         rotation_matrix = []
         translation_vector = []
 
@@ -66,7 +58,6 @@ class StructureAligner(object):
                     T=np.array(translation_vector).astype(np.float32))
 
     @staticmethod
-    @typechecked
     def __parse_terminal_outputs(output_string: str) -> Dict[str, Any]:
         lines = re.split(pattern=r"\n", string=output_string)
         # chain mapping
@@ -108,7 +99,6 @@ class StructureAligner(object):
                 del patterns[key]
         return values
 
-    @typechecked
     def make_alignment(self, query_chains: Optional[List[str]] = None,
                        ref_chains: Optional[List[str]] = None, timeout=300.0):
         """
@@ -122,56 +112,50 @@ class StructureAligner(object):
         program_path = self.__mmalign_path
 
         # clone
-        q_st = deepcopy(self._query_st)
-        r_st = deepcopy(self._ref_st)
-
-        tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
-        os.makedirs(tmp_dir)
-
         if isinstance(query_chains, list):
-            q_st.pick_chains(query_chains)
+            q_st = self._query_st.pick_chains(query_chains)
+        else:
+            q_st = self._query_st
 
         if isinstance(ref_chains, list):
-            r_st.pick_chains(ref_chains)
+            r_st = self._ref_st.pick_chains(ref_chains)
+        else:
+            r_st = self._ref_st
 
-        q_ch_mapper = q_st.make_chain_names_to_one_letter()
-        r_ch_mapper = r_st.make_chain_names_to_one_letter()
+        q_ch_mapper = q_st.make_one_letter_chain()
+        r_ch_mapper = r_st.make_one_letter_chain()
 
         q_ch_mapper_r = {v: k for k, v in q_ch_mapper.items()}
         r_ch_mapper_r = {v: k for k, v in r_ch_mapper.items()}
 
-        _tmp_a = os.path.join(tmp_dir, "a.pdb")
-        q_st.to_pdb(_tmp_a)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            _tmp_a = os.path.join(tmp_dir, "a.pdb")
+            q_st.to_pdb(_tmp_a)
+
+            _tmp_b = os.path.join(tmp_dir, "b.pdb")
+            r_st.to_pdb(_tmp_b)
+
+            matrix_file = os.path.join(tmp_dir, "m.txt")
+            _command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
+
+            try:
+                result = subprocess.run(_command, shell=True, check=True,
+                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                        timeout=timeout)
+            except Exception as e:
+                print("%s: between files %s and %s; between chains: %s and %s" % (
+                    str(e), self.query_path, self.ref_path,
+                    str(q_st.chain_ids), str(r_st.chain_ids))
+                )
+            else:
+                self.values = self.__parse_terminal_outputs(result.stdout.decode())
+                self.rot_mat = self.__parser_rotation_matrix(matrix_file)
+                self.is_aligned = True
+                self.by_query = q_st.chain_ids if query_chains is None else query_chains
+                self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
+                self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
+                self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
 
-        _tmp_b = os.path.join(tmp_dir, "b.pdb")
-        r_st.to_pdb(_tmp_b)
-
-        matrix_file = os.path.join(tmp_dir, "m.txt")
-        _command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
-
-        try:
-            result = subprocess.run(_command, shell=True, check=True,
-                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                                    timeout=timeout)
-        except Exception as e:
-            print("%s: between files %s and %s; between chains: %s and %s" % (
-                str(e), self.query_path, self.ref_path,
-                str(q_st.chain_ids), str(r_st.chain_ids))
-            )
-        else:
-            self.values = self.__parse_terminal_outputs(result.stdout.decode())
-            self.rot_mat = self.__parser_rotation_matrix(matrix_file)
-            self.is_aligned = True
-            self.by_query = q_st.chain_ids if query_chains is None else query_chains
-            self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
-            self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
-            self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
-
-        finally:
-            if os.path.isdir(tmp_dir):
-                shutil.rmtree(tmp_dir)
-
-    @typechecked
     def save_aligned_query(self, out_file: str):
         """
 
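For orientation, a minimal usage sketch of the reworked aligner (the import path and file names are assumptions, not taken from this diff; an MM-align binary must be available for make_alignment to run):

    # hypothetical module path; adjust to wherever StructureAligner lives
    from gemmi_protools.structure_aligner import StructureAligner

    aligner = StructureAligner("query.pdb", "reference.pdb")  # plain str paths in 1.0.1
    aligner.make_alignment(query_chains=["A"], ref_chains=["B"], timeout=300.0)

    if aligner.is_aligned:
        print(aligner.values)   # parsed MM-align terminal output, incl. mapped chain IDs
        print(aligner.rot_mat)  # rotation matrix R and translation vector T
        aligner.save_aligned_query("query_aligned.pdb")

Note that temporary files now live in a tempfile.TemporaryDirectory, so the old uuid-named directory and its manual shutil.rmtree cleanup are gone.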
@@ -0,0 +1,128 @@
+"""
+@Author: Luo Jiejian
+"""
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from copy import deepcopy
+from typing import List, Tuple
+
+import gemmi
+import pandas as pd
+
+from gemmi_protools.io.reader import StructureParser
+
+
+def dockq_score_interface(query_model: str,
+                          native_model: str,
+                          partner_1_mapping: List[Tuple[str, str]],
+                          partner_2_mapping: List[Tuple[str, str]],
+                          ):
+    """
+    Calculate DockQ score for an interface (partner 1 vs partner 2)
+
+    :param query_model: str
+        path of query model, support .pdb, .pdb.gz, .cif, .cif.gz
+    :param native_model: str, path of native model, same formats as query_model
+    :param partner_1_mapping: a list of chain ID mappings between query and native for partner 1 of the interface,
+        e.g. [(q chain1, n chain1), (q chain2, n chain2)]
+    :param partner_2_mapping: a list of chain ID mappings between query and native for partner 2 of the interface
+    :return: dict with keys "score" and "status"
+    """
+    dockq_program = shutil.which("DockQ")
+    if dockq_program is None:
+        raise RuntimeError("DockQ is needed")
+
+    assert len(partner_1_mapping) > 0, "partner_1_mapping must be a list of chain ID tuples, can't be empty"
+    assert len(partner_2_mapping) > 0, "partner_2_mapping must be a list of chain ID tuples, can't be empty"
+
+    def load_struct(path: str, partner_1: List[str], partner_2: List[str]):
+        st = StructureParser()
+        st.load_from_file(path)
+        st.clean_structure()
+
+        for ch in partner_1 + partner_2:
+            if ch not in st.chain_ids:
+                raise ValueError("Chain %s not found for %s (only [%s])" % (ch, path, " ".join(st.chain_ids)))
+
+        # merge the chains of each partner into one chain:
+        # partner_1 becomes chain ID A
+        # partner_2 becomes chain ID B
+
+        chain_a = gemmi.Chain("A")
+        idx_a = 1
+        for ch in partner_1:
+            for res in st.get_chain(ch):
+                nr = deepcopy(res)
+                nr.seqid.icode = " "
+                nr.seqid.num = idx_a
+                chain_a.add_residue(nr)
+                idx_a += 1
+
+        chain_b = gemmi.Chain("B")
+        idx_b = 1
+        for ch in partner_2:
+            for res in st.get_chain(ch):
+                nr = deepcopy(res)
+                nr.seqid.icode = " "
+                nr.seqid.num = idx_b
+                chain_b.add_residue(nr)
+                idx_b += 1
+
+        model = gemmi.Model(1)
+        model.add_chain(chain_a)
+        model.add_chain(chain_b)
+
+        struct = gemmi.Structure()
+        struct.add_model(model)
+
+        output = StructureParser(struct)
+        return output
+
+    partner_1_query, partner_1_native = list(zip(*partner_1_mapping))
+    partner_2_query, partner_2_native = list(zip(*partner_2_mapping))
+
+    q_st = load_struct(query_model, list(partner_1_query), list(partner_2_query))
+    n_st = load_struct(native_model, list(partner_1_native), list(partner_2_native))
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        result_file = os.path.join(tmp_dir, "result.json")
+        q_file = os.path.join(tmp_dir, "q.pdb")
+        n_file = os.path.join(tmp_dir, "n.pdb")
+        q_st.to_pdb(q_file, write_minimal_pdb=True)
+        n_st.to_pdb(n_file, write_minimal_pdb=True)
+
+        mapping = "AB:AB"
+
+        _command = "%s --mapping %s --json %s %s %s" % (dockq_program, mapping, result_file, q_file, n_file)
+        metrics = ['DockQ', 'F1', 'chain1', 'chain2']
+
+        try:
+            _ = subprocess.run(_command, shell=True, check=True,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                               timeout=300.0)
+        except subprocess.CalledProcessError as e:
+            # Handle errors in the called executable
+            msg = e.stderr.decode()
+            outputs = pd.DataFrame(columns=metrics)
+        except Exception as e:
+            # Handle other exceptions such as file not found or permission issues
+            msg = str(e)
+            outputs = pd.DataFrame(columns=metrics)
+        else:
+            with open(result_file, "r") as fin:
+                vals = json.load(fin)
+            msg = "Finished"
+            result = []
+            for v in vals["best_result"].values():
+                result.append(v)
+            outputs = pd.DataFrame(result)[metrics]
+
+    if len(outputs) > 0:
+        score = "%.4f" % outputs.iloc[0]["DockQ"]
+    else:
+        score = ""
+
+    return dict(score=score, status=msg)
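
A sketch of how the new scoring helper is meant to be called (chain IDs and file names are placeholders, the import path is an assumption, and the DockQ executable must be on PATH):

    from gemmi_protools.dockq import dockq_score_interface  # hypothetical module path

    out = dockq_score_interface(
        query_model="model.pdb",
        native_model="native.cif",
        partner_1_mapping=[("H", "A"), ("L", "B")],  # (query chain, native chain) pairs for partner 1
        partner_2_mapping=[("C", "G")],              # (query chain, native chain) pairs for partner 2
    )
    print(out["score"], out["status"])  # score is a "%.4f" string, empty on failure

Multi-chain partners are merged into single chains A and B before DockQ runs, which is why the mapping passed to DockQ is always "AB:AB".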
@@ -0,0 +1,197 @@
+"""
+@Author: Luo Jiejian
+"""
+import os
+import subprocess
+import tempfile
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import freesasa
+import numpy as np
+import trimesh
+from Bio.PDB import Selection
+from Bio.PDB.ResidueDepth import _get_atom_radius, _read_vertex_array
+
+from gemmi_protools import StructureParser
+from gemmi_protools import gemmi2bio
+
+
+def _read_face_array(filename: str):
+    with open(filename) as fp:
+        face_list = []
+        for line in fp:
+            sl = line.split()
+            if len(sl) != 5:
+                # skip header
+                continue
+            vl = [int(x) for x in sl[0:3]]
+            face_list.append(vl)
+    return np.array(face_list)
+
+
+def get_mesh(struct_file: str, chains: Optional[List[str]] = None, MSMS: str = "msms"):
+    """
+    Build a molecular surface mesh with MSMS.
+
+    :param struct_file: str
+        .pdb, .cif, .pdb.gz, .cif.gz
+    :param chains: a list of chain names
+        default None to include all chains
+    :param MSMS: str
+        path of msms executable (https://ccsb.scripps.edu/msms/downloads/)
+    :return:
+        a trimesh.Trimesh surface mesh, or None on failure
+    """
+    xyz_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    surface_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    msms_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    face_file = surface_tmp + ".face"
+    surface_file = surface_tmp + ".vert"
+
+    try:
+        st = StructureParser()
+        st.load_from_file(struct_file)
+        st.clean_structure(remove_ligand=True)
+
+        if chains is None:
+            st_p = st
+        else:
+            for ch in chains:
+                if ch not in st.chain_ids:
+                    raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(st.chain_ids)))
+            st_p = st.pick_chains(chains)
+
+        bio_st = gemmi2bio(st_p.STRUCT)
+        model = bio_st[0]
+
+        # Replace pdb_to_xyzr
+        # Make x,y,z,radius file
+        atom_list = Selection.unfold_entities(model, "A")
+
+        with open(xyz_tmp, "w") as pdb_to_xyzr:
+            for atom in atom_list:
+                x, y, z = atom.coord
+                radius = _get_atom_radius(atom, rtype="united")
+                pdb_to_xyzr.write(f"{x:6.3f}\t{y:6.3f}\t{z:6.3f}\t{radius:1.2f}\n")
+
+        # Make surface
+        MSMS = MSMS + " -no_header -probe_radius 1.5 -if %s -of %s > " + msms_tmp
+        make_surface = MSMS % (xyz_tmp, surface_tmp)
+        subprocess.call(make_surface, shell=True)
+        if not os.path.isfile(surface_file):
+            raise RuntimeError(
+                f"Failed to generate surface file using command:\n{make_surface}"
+            )
+
+    except Exception as e:
+        print(str(e))
+        mesh = None
+    else:
+        # Read surface vertices from vertex file
+        vertices = _read_vertex_array(surface_file)
+        faces = _read_face_array(face_file)
+        mesh = trimesh.Trimesh(vertices=vertices, faces=faces - 1)
+        mesh.merge_vertices()
+        mesh.update_faces(mesh.unique_faces())
+        mesh.update_faces(mesh.nondegenerate_faces())
+        mesh.remove_unreferenced_vertices()
+
+    # Remove temporary files
+    for fn in [xyz_tmp, surface_tmp, msms_tmp, face_file, surface_file]:
+        try:
+            os.remove(fn)
+        except OSError:
+            pass
+
+    return mesh
+
+
+def get_surface_residues(struct_file: str,
+                         chains: Optional[List[str]] = None,
+                         relative_sasa_cutoff: Union[int, float] = 0.15):
+    ####################
+    # check and pick
+    ####################
+    st = StructureParser()
+    st.load_from_file(struct_file)
+    st.clean_structure()
+
+    if chains is None:
+        chains = st.chain_ids
+
+    if isinstance(chains, list):
+        if len(chains) == 0:
+            raise ValueError("chains is not set")
+        else:
+            # check if chains are valid
+            for ch in chains:
+                if ch not in st.chain_ids:
+                    raise ValueError("Chain %s not found" % ch)
+
+    st_p = st.pick_chains(chains)
+    # sequences = {k: s.replace("-", "").upper() for k, s in st_p.polymer_sequences().items()}
+
+    # sequential residue numbers, starting from 1
+    seq_num_mapper = dict()
+    for chain in st_p.MODEL:
+        for i, res in enumerate(chain):
+            key = (chain.name, str(res.seqid.num) + res.seqid.icode.strip(), res.name)
+            seq_num_mapper[key] = i + 1
+
+    # map chain names to one uppercase letter
+    mapper = st_p.make_one_letter_chain(only_uppercase=True)
+    mapper_r = {v: k for k, v in mapper.items()}
+
+    ####################
+    # save to pdb
+    ####################
+    with tempfile.NamedTemporaryFile(delete=True, suffix=".pdb", mode='w') as tmp_file:
+        st_p.to_pdb(tmp_file.name)
+        structure = freesasa.Structure(tmp_file.name)
+
+    result = freesasa.calc(structure)
+
+    residue_areas = result.residueAreas()
+
+    surface_residues_relative_sasa = dict()
+    surface_atoms = defaultdict(list)
+    for atom_index in range(structure.nAtoms()):
+        ch = structure.chainLabel(atom_index)
+        ch = mapper_r.get(ch, ch)
+
+        res_num = structure.residueNumber(atom_index).strip()
+        res_name = structure.residueName(atom_index)
+        atom_sasa = result.atomArea(atom_index)
+
+        res_id = (ch, res_num, res_name)
+        res_relative_total = residue_areas[ch][res_num].relativeTotal
+        if res_relative_total > relative_sasa_cutoff:
+            if res_id not in surface_residues_relative_sasa:
+                surface_residues_relative_sasa[res_id] = res_relative_total
+            if atom_sasa > 0:
+                atom_name = structure.atomName(atom_index).strip()
+                pos = structure.coord(atom_index)
+                surface_atoms[res_id].append((atom_sasa, atom_name, pos))
+
+    results = []
+    for res_id, query_atoms in surface_atoms.items():
+        seq_loc = seq_num_mapper[res_id]
+
+        query_atoms.sort(reverse=True)
+        centroid = tuple(np.array([a[2] for a in query_atoms[0:3]]).mean(axis=0).tolist())
+        results.append((res_id[0],
+                        res_id[1],
+                        res_id[2],
+                        seq_loc,
+                        centroid,
+                        surface_residues_relative_sasa[res_id]
+                        )
+                       )
+    dtype = [("chain_name", "U5"),
+             ("residue_numi", "U8"),
+             ("residue_name", "U5"),
+             ("sequential_residue_num", "i4"),
+             ("centroid", ("f4", (3,))),
+             ("relative_sasa", "f4"),
+             ]
+    return np.array(results, dtype=dtype)
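
Both new helpers can be exercised roughly as follows (file and chain names are placeholders, the import path is assumed, and get_mesh additionally requires the msms executable):

    from gemmi_protools.surface import get_mesh, get_surface_residues  # hypothetical module path

    # Triangulated molecular surface as a trimesh.Trimesh, or None if MSMS failed
    mesh = get_mesh("model.pdb", chains=["A"], MSMS="msms")
    if mesh is not None:
        print(mesh.vertices.shape, mesh.faces.shape)

    # Structured array with fields: chain_name, residue_numi, residue_name,
    # sequential_residue_num, centroid, relative_sasa
    surf = get_surface_residues("model.pdb", chains=["A"], relative_sasa_cutoff=0.15)
    print(surf["chain_name"][:5], surf["relative_sasa"][:5])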
@@ -2,25 +2,18 @@
 @Author: Luo Jiejian
 """
 import hashlib
-import itertools
 import os
 import re
 import shutil
 import subprocess
 import uuid
 from collections import defaultdict
-from dataclasses import asdict
 from importlib.resources import files
-from typing import List
 
-import numpy as np
 from anarci import run_anarci
 from anarci.germlines import all_germlines
-from joblib import Parallel, delayed
-from scipy.spatial import cKDTree
 
 from gemmi_protools import StructureParser
-from gemmi_protools.utils.ppi import _ppi_atoms
 
 
 def hash_sequence(seq: str) -> str:
@@ -207,125 +200,48 @@ def annotate_mhc(seq_dict: dict):
     return out
 
 
-def _interface_residues(struct: StructureParser,
-                        chains_x: List[str],
-                        chains_y: List[str],
-                        threshold: float = 4.5):
-    """
-    identify PPI among protein, DNA, RNA
-    :param struct: StructureParser
-    :param chains_x:
-    :param chains_y:
-    :param threshold:
-    :return:
-        PPI residues of chains_x, PPI residues of chains_y
-    """
-
-    x_coord, x_id = _ppi_atoms(struct, chains_x)
-    y_coord, y_id = _ppi_atoms(struct, chains_y)
-
-    kd_tree_x = cKDTree(x_coord)
-    kd_tree_y = cKDTree(y_coord)
-
-    pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
-
-    x_res = np.unique(x_id[pairs.row][["ch_name", 'res_num', 'res_icode', 'res_name']])
-    y_res = np.unique(y_id[pairs.col][["ch_name", 'res_num', 'res_icode', 'res_name']])
-
-    x_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in x_res.tolist()]
-    y_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in y_res.tolist()]
-    return x_out, y_out
-
-
-def polymer_interface_residues(struct: StructureParser,
-                               ppi_threshold: float = 4.5,
-                               n_cpus: int = 1,
-                               ):
-    """
-
-    Args:
-        struct:
-        ppi_threshold:
-
-    Returns:
-
-    """
-    chains = [ch for ch, ct in struct.chain_types.items() if ct in ["protein", "dna", "rna"]]
-    ch_pairs = list(itertools.combinations(chains, r=2))
-    ch_pairs.sort()
-
-    def _run(ch_1, ch_2):
-        key = "%s/%s" % (ch_1, ch_2)
-        res_x, res_y = _interface_residues(struct, chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
-        if len(res_x) > 0:
-            return {key: [res_x, res_y]}
-        else:
-            return dict()
-
-    cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
-
-    outputs = dict()
-    if cpu2use == 1 or len(ch_pairs) < 100:
-        for ch_1, ch_2 in ch_pairs:
-            outputs.update(_run(ch_1, ch_2))
-    else:
-        results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
-        for item in results:
-            outputs.update(item)
-    return outputs
-
-
-def annotate_pdb(struct_file: str, ppi_threshold: float = 4.5,
-                 n_cpus: int = 1, max_seqs: int = 100):
+def annotate_pdb(struct_file: str):
     st = StructureParser()
     st.load_from_file(struct_file)
-    st.set_default_model()
-    st.STRUCT.remove_alternative_conformations()
-    st.STRUCT.remove_ligands_and_waters()
-    st.STRUCT.remove_hydrogens()
-    st.STRUCT.remove_empty_chains()
-    st.update_entity()
+    st.clean_structure()
 
-    if len(st.chain_ids) > max_seqs:
-        raise RuntimeError("Too many chains: %d > %d" % (len(st.chain_ids), max_seqs))
+    subchain_id2entity_id = dict()
+    for ent in st.STRUCT.entities:
+        for ch in ent.subchains:
+            subchain_id2entity_id[ch] = ent.name
 
     # Merge sequences
     polymers = dict()
-    for ch, seq in st.polymer_sequences.items():
-        hash_id = hash_sequence(seq)
+    for ch, seq in st.polymer_sequences().items():
+        subchain_id = st.get_chain(ch).get_polymer().subchain_id()
+        entity_id = subchain_id2entity_id[subchain_id]
+
+        hash_id = hash_sequence(seq.upper())
         if hash_id not in polymers:
             val = dict(chain_ids=[ch],
-                       sequence=seq,
-                       type=st.chain_types[ch],
-                       description=st.ENTITY.eid2desc.get(st.ENTITY.polymer2eid[ch], "Unknown"),
-                       specie=st.ENTITY.eid2specie.get(st.ENTITY.polymer2eid[ch], "Unknown"),
-                       taxid=st.ENTITY.eid2taxid.get(st.ENTITY.polymer2eid[ch], "Unknown"),
+                       sequence=seq.upper(),
+                       type=st.polymer_types[ch].name,
+                       description=st.INFO["description"].get(entity_id, "Unknown"),
                        )
             polymers[hash_id] = val
         else:
             polymers[hash_id]["chain_ids"].append(ch)
 
-    sdict = {k: v["sequence"] for k, v in polymers.items()}
-
+    proteins = dict()
     results = dict()
-    for hasd_id, val in polymers.items():
+    for hash_id, val in polymers.items():
         val["chain_ids"].sort()
-        if val["type"] == "protein":
+        if val["type"] == "PeptideL":
+            proteins[hash_id] = val["sequence"]
             anarci_info = get_fv_region(val["sequence"])
             fvt = fv_region_type(anarci_info)
             if fvt != "not-Fv":
-                results[hasd_id] = dict(fv_type=fvt, annotations=anarci_info)
-
-    struct_info = asdict(st.INFO)
-    struct_info.update(resolution=st.STRUCT.resolution)
-    struct_info["pdb_id"] = struct_info["pdb_id"].lower()
-    struct_info["exp_method"] = struct_info["exp_method"].lower()
+                results[hash_id] = dict(fv_type=fvt, annotations=anarci_info)
 
+    struct_info = {k: st.INFO[k] for k in ["resolution", "pdb_id", "deposition_date", "method", "title"]}
     return dict(path=os.path.abspath(os.path.expanduser(struct_file)),
                 info=struct_info,
                 polymers=polymers,
                 anarci=results,
-                mhc=annotate_mhc(sdict) if len(sdict) > 0 else dict(),
-                interfaces=polymer_interface_residues(st, ppi_threshold,
-                                                      n_cpus=n_cpus)
+                mhc=annotate_mhc(proteins) if len(proteins) > 0 else dict(),
                 )
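
A minimal sketch of the simplified annotation entry point (the import path and file name are assumptions; anarci must be installed):

    from gemmi_protools.annotate import annotate_pdb  # hypothetical module path

    report = annotate_pdb("7xyz.cif")
    print(report["info"]["pdb_id"], report["info"]["method"])
    for entry in report["polymers"].values():
        print(entry["chain_ids"], entry["type"], entry["description"])
    print(report["anarci"])  # Fv-type annotations for antibody-like chains
    print(report["mhc"])     # MHC annotations for protein sequences

Note that interface detection (the former polymer_interface_residues, together with its joblib/scipy dependencies) is no longer part of the returned report in 1.0.1.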