gemmi-protools 0.1.16__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -1,217 +1,594 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
1
+ import itertools
4
2
  import pathlib
3
+ import random
5
4
  import string
6
- import warnings
5
+ from collections import defaultdict
7
6
  from copy import deepcopy
8
- from typing import Union, Optional, List
7
+ from typing import Dict, Optional, List
9
8
 
10
9
  import gemmi
11
10
  import numpy as np
12
- from typeguard import typechecked
11
+ import pandas as pd
12
+ from joblib import Parallel, delayed
13
+ from scipy.spatial import cKDTree
13
14
 
14
- from gemmi_protools.io.cif_opts import _cif_block_for_output, _is_cif
15
- from gemmi_protools.io.parser import (_assign_digital_entity_names, _ent_from_structure,
16
- pdb_parser, cif_parser, _chain_type, _chain_names2one_letter,
17
- _assert_unique_chain_names_in_models, get_assembly)
18
- from gemmi_protools.io.pdb_opts import _compound_source_string, _is_pdb
19
- from gemmi_protools.io.peptide import nucleic_3to1_mapper, protein_3to1_mapper
20
- from gemmi_protools.io.struct_info import Info
15
+
16
def is_pdb(path: str) -> bool:
    """
    Tell whether a path names a PDB file (.pdb or .pdb.gz)

    Only the file-name suffixes are inspected; the file itself is never opened.
    :param path: file path
    :return:
        bool, True when the last suffix is ".pdb" or the last two are ".pdb.gz"
    """
    suffixes = pathlib.Path(path).suffixes
    if not suffixes:
        return False
    return suffixes[-1] == ".pdb" or "".join(suffixes[-2:]) == ".pdb.gz"
34
+
35
+
36
def is_cif(path: str) -> bool:
    """
    Tell whether a path names a CIF file (.cif or .cif.gz)

    Only the file-name suffixes are inspected; the file itself is never opened.
    :param path: file path
    :return:
        bool, True when the last suffix is ".cif" or the last two are ".cif.gz"
    """
    suffixes = pathlib.Path(path).suffixes
    if not suffixes:
        return False
    return suffixes[-1] == ".cif" or "".join(suffixes[-2:]) == ".cif.gz"
54
+
55
+
56
def parse_cif(path: str) -> dict:
    """
    Parse CIF structure and info
    :param path: str, path to a .cif or .cif.gz file
    :return:
        dict with keys
        structure: gemmi.Structure with entities set up and serial numbers assigned
        info: dict with description, source, resolution, pdb_id, method,
              deposition_date and title
    :raises TypeError: when path does not end with .cif or .cif.gz
    """
    if not is_cif(path):
        raise TypeError("Input file is not a cif file [.cif or .cif.gz]: %s" % path)

    doc = gemmi.cif.Document()
    st = gemmi.read_structure(path, save_doc=doc)
    st.setup_entities()
    st.assign_serial_numbers()
    block = doc.sole_block()

    def _text(value) -> str:
        # with raw=False gemmi returns None for "?" and False for ".";
        # both mean "no value", so neither may leak out as a non-string
        return value if isinstance(value, str) else ""

    def _read_rows(category, columns):
        # rows of the requested columns, or nothing when any column is absent
        table = block.get_mmcif_category(name=category, raw=False)
        if not all(col in table for col in columns):
            return []
        return [[_text(v) for v in row]
                for row in zip(*(table[col] for col in columns))]

    entityid2description = {eid: desc
                            for eid, desc in _read_rows("_entity", ("id", "pdbx_description"))}

    # categories are listed by priority: an entity already described by an
    # earlier category is not overwritten by a later one
    src_categories = (
        ("_entity_src_gen.", "pdbx_gene_src_scientific_name", "pdbx_gene_src_ncbi_taxonomy_id"),
        ("_pdbx_entity_src_syn.", "organism_scientific", "ncbi_taxonomy_id"),
        ("_entity_src_nat.", "pdbx_organism_scientific", "pdbx_ncbi_taxonomy_id"),
    )
    entityid2src = dict()
    for category, name_col, taxid_col in src_categories:
        # inside one category the last row of an entity wins
        per_category = {eid: [name, taxid]
                        for eid, name, taxid in _read_rows(category, ("entity_id", name_col, taxid_col))}
        for eid, val in per_category.items():
            entityid2src.setdefault(eid, val)

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
                source=entityid2src,
                resolution=st.resolution,
                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                method=info_map.get("_exptl.method", "").lower(),
                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                title=info_map.get("_struct.title", "")
                )
    return dict(structure=st, info=info)
121
+
122
+
123
def molecule_description(path: str):
    """
    Molecule description from PDB (.pdb or .pdb.gz)

    Reads the "KEY: value;" tokens of the COMPND and SOURCE records, grouped by MOL_ID.
    :param path: path to a .pdb or .pdb.gz file
    :return:
        dict with keys
        description: mol_id -> MOLECULE text
        source: mol_id -> [organism_scientific, organism_taxid]
        ch_id2mol_id: chain id -> mol_id
    :raises ValueError: when path does not end with .pdb or .pdb.gz
    """
    # function-scope import: gzip is not among the module-level imports
    import gzip

    if not is_pdb(path):
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)

    if pathlib.Path(path).suffixes[-1] == ".pdb":
        with open(path, "r") as text_io:
            lines = text_io.readlines()
    else:
        with gzip.open(path, "rt", encoding="utf-8") as text_io:
            lines = text_io.readlines()

    values = {"COMPND": defaultdict(dict),
              "SOURCE": defaultdict(dict),
              }

    comp_molid = ""
    last_comp_key = ""

    for hh in lines:
        h = hh.strip()
        key = h[:6].strip()
        if key not in values:
            continue

        tt = h[10:].strip().strip(";")
        # split on the first colon only, so a value that itself contains ":" stays whole
        name, colon, rest = tt.partition(":")
        if colon:
            ckey = name.lower().strip()
            cval = rest.strip()
            if ckey == "mol_id":
                comp_molid = cval
                values[key][comp_molid] = dict()
            else:
                values[key][comp_molid][ckey] = cval
                last_comp_key = ckey
        else:
            # continuation of the previous value; ignored when that key was never
            # opened for this record type / MOL_ID (e.g. free-text records)
            target = values[key].get(comp_molid)
            if target is not None and last_comp_key in target:
                target[last_comp_key] += " " + tt.strip()

    outputs = dict(description=dict(),
                   source=dict())

    ch_id2mol_id = dict()
    for mol_id, val in values["COMPND"].items():
        chain_str = val.get("chain", "").strip()
        if chain_str != "":
            for ch in chain_str.split(","):
                ch_id2mol_id[ch.strip()] = mol_id

    for mol_id, val in values["COMPND"].items():
        m = val.get("molecule", "").strip()
        if m != "":
            outputs["description"][mol_id] = m

    for mol_id, val in values["SOURCE"].items():
        name = val.get("organism_scientific", "").strip()
        taxid = val.get("organism_taxid", "").strip()
        # keep the entry when at least one of the two fields carries a real value
        if name not in ["", "?", "."] or taxid not in ["", "?", "."]:
            outputs["source"][mol_id] = [name, taxid]

    outputs["ch_id2mol_id"] = ch_id2mol_id
    return outputs
191
+
192
+
193
def parse_pdb(path: str) -> dict:
    """
    Parse PDB structure and info
    :param path: str, path to a .pdb or .pdb.gz file
    :return:
        dict with keys structure (gemmi.Structure) and info (dict)
    :raises TypeError: when path does not end with .pdb or .pdb.gz
    """
    if not is_pdb(path):
        raise TypeError("Input file is not a pdb file [.pdb or .pdb.gz]: %s" % path)

    st = gemmi.read_structure(path)
    st.setup_entities()
    st.assign_serial_numbers()

    header = molecule_description(path)
    chain2mol = header["ch_id2mol_id"]

    # MOL_ID -> entity name, for entities whose name is a chain id listed in COMPND
    mol2ent = {chain2mol[ent.name]: ent.name
               for ent in st.entities
               if ent.name in chain2mol}

    def _by_entity(per_mol):
        # re-key a MOL_ID keyed dict by entity name; MOL_IDs without an entity are dropped
        return {mol2ent[mol_id]: val for mol_id, val in per_mol.items() if mol_id in mol2ent}

    description = _by_entity(header["description"])

    # entities of unknown polymer type (ligands, water) with a name longer than
    # one character are described by their own name
    for ent in st.entities:
        ent_name = ent.name
        if ent_name in description or ent.polymer_type.name != "Unknown":
            continue
        if len(ent_name) > 1:
            description[ent_name] = ent_name

    source = _by_entity(header["source"])

    # entity names become digits; translate the keys collected so far
    mapper = assign_digital_entity_names(st)

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = {
        "description": {mapper.get(k, k): v for k, v in description.items()},
        "source": {mapper.get(k, k): v for k, v in source.items()},
        "resolution": st.resolution,
        "pdb_id": pdb_code if gemmi.is_pdb_code(pdb_code) else "",
        "method": info_map.get("_exptl.method", "").lower(),
        "deposition_date": info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
        "title": info_map.get("_struct.title", ""),
    }
    return {"structure": st, "info": info}
238
+
239
+
240
def assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
    """
    Rename the entities to "1", "2", ... unless every name is already made of digits
    :param structure: structure whose entities are renamed in place
    :return:
        dict, original entity name to new digital entity name (empty when nothing was renamed)
    """
    entities = structure.entities
    if all(ent.name.isdigit() for ent in entities):
        return dict()

    renamed = dict()
    for number, ent in enumerate(entities, start=1):
        digital = str(number)
        renamed[ent.name] = digital
        ent.name = digital
    return renamed
21
255
 
22
256
 
23
257
  class StructureParser(object):
24
258
  """
25
- Enhance Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
259
+ Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
260
+
261
+ Read the first model
26
262
  """
27
263
 
28
def __init__(self, structure: Optional[gemmi.Structure] = None):
    """
    :param structure: gemmi.Structure to wrap (it is cloned, the input is not modified),
        or None to start from a structure holding one empty model
    :raises ValueError: when structure is neither a gemmi.Structure nor None
    """
    if structure is None:
        self.STRUCT = gemmi.Structure()
    elif isinstance(structure, gemmi.Structure):
        self.STRUCT = structure.clone()
    else:
        raise ValueError("structure must be gemmi.Structure or None")

    if len(self.STRUCT) == 0:
        # init with an empty model so that self.STRUCT[0] below always exists
        self.STRUCT.add_model(gemmi.Model(1))

    self.STRUCT.setup_entities()
    self.STRUCT.assign_serial_numbers()

    # keep the first model only; delete from the back so the remaining
    # indexes stay valid while the model list shrinks
    self.STRUCT.renumber_models()
    for idx in reversed(range(1, len(self.STRUCT))):
        del self.STRUCT[idx]

    self.MODEL = self.STRUCT[0]
    self.STRUCT.remove_alternative_conformations()
    self.STRUCT.remove_hydrogens()
    self.STRUCT.remove_empty_chains()
    self._update_full_sequences()

    info_map = dict(self.STRUCT.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    self.INFO = dict(description=dict(),
                     source=dict(),
                     resolution=self.STRUCT.resolution,
                     pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                     method=info_map.get("_exptl.method", "").lower(),
                     deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                     title=info_map.get("_struct.title", ""),
                     )
    self.update_entity()
80
- self.update_full_sequences()
81
-
82
- @typechecked
83
- def to_pdb(self, outfile: str, write_minimal_pdb=False):
84
- compound_source = _compound_source_string(self.ENTITY)
85
- struct = self.STRUCT.clone()
86
-
87
- rs = "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % struct.resolution
88
- resolution_remarks = ["%-80s" % "REMARK 2",
89
- "%-80s" % rs]
90
302
 
91
- struct.raw_remarks = compound_source + resolution_remarks
92
- if write_minimal_pdb:
93
- struct.write_minimal_pdb(outfile)
94
- else:
95
- struct.write_pdb(outfile)
96
-
97
- @typechecked
98
- def to_cif(self, outfile: str):
99
- out_block = _cif_block_for_output(self.STRUCT, self.ENTITY)
100
- out_block.write_file(outfile)
101
-
102
- @property
103
- def chain_ids(self):
104
- vals = []
105
- for m in self.STRUCT:
106
- for c in m:
107
- vals.append(c.name)
108
- vals.sort()
109
- return vals
110
-
111
- @property
112
- def model_numbers(self):
113
- return [m.num for m in self.STRUCT]
114
-
115
- @typechecked
116
- def set_default_model(self, num: Optional[int] = None):
303
def load_from_file(self, path: str):
    """
    Load model from file, default use the first model.
    :param path: path to a .cif, .cif.gz, .pdb or .pdb.gz file
    :return:
    :raises ValueError: when the suffix of path is none of the supported ones
    """
    if is_pdb(path):
        val = parse_pdb(path)
    elif is_cif(path):
        val = parse_cif(path)
    else:
        raise ValueError("path must be files with suffixes [ .cif, .cif.gz, .pdb or .pdb.gz]")
    self.STRUCT, self.INFO = val["structure"], val["info"]

    # force the first model when multiple models exist; delete from the back
    # so the remaining indexes stay valid while the model list shrinks
    self.STRUCT.renumber_models()
    for idx in reversed(range(1, len(self.STRUCT))):
        del self.STRUCT[idx]

    self.MODEL = self.STRUCT[0]
    self.STRUCT.remove_alternative_conformations()
    self.STRUCT.remove_hydrogens()
    self.STRUCT.remove_empty_chains()
    self._update_full_sequences()
    self.update_entity()
134
330
 
135
- if keep_model is None:
136
- raise RuntimeError("Model %d not found in structure" % num)
331
+ def _update_full_sequences(self):
332
+ for idx, ent in enumerate(self.STRUCT.entities):
333
+ if ent.entity_type.name == "Polymer":
334
+ self.STRUCT.entities[idx].full_sequence = [gemmi.Entity.first_mon(item) for item in ent.full_sequence]
137
335
 
138
- # del, reversed order indexes
139
- indexes_to_del = [i for i, model in enumerate(self.STRUCT) if model.num != keep_model.num]
140
- indexes_to_del.sort(reverse=True)
336
+ if len(ent.full_sequence) == 0:
337
+ sc = self.get_subchain(ent.subchains[0])
338
+ self.STRUCT.entities[idx].full_sequence = sc.extract_sequence()
141
339
 
142
- for cur_index in indexes_to_del:
143
- del self.STRUCT[cur_index]
340
+ @property
341
+ def chain_ids(self):
342
+ return [ch.name for ch in self.MODEL]
144
343
 
145
344
  @property
146
- def chain_types(self):
147
- return {c: _chain_type(self.STRUCT, c) for c in self.chain_ids}
345
+ def subchain_ids(self):
346
+ return [ch.subchain_id() for ch in self.MODEL.subchains()]
148
347
 
149
348
  @property
150
349
  def assembly_names(self):
151
350
  return [assem.name for assem in self.STRUCT.assemblies]
152
351
 
153
352
  @property
154
- def polymer_sequences(self):
155
- cts = self.chain_types
353
+ def polymer_types(self):
354
+ subchain_id2polymer = dict()
355
+ for ent in self.STRUCT.entities:
356
+ if ent.entity_type.name == "Polymer":
357
+ for ch in ent.subchains:
358
+ subchain_id2polymer[ch] = ent.polymer_type
359
+
156
360
  out = dict()
157
- for model in self.STRUCT:
158
- for chain in model:
159
- ct = cts.get(chain.name, "other")
160
- if ct != "other":
161
- out[chain.name] = self.make_one_letter_sequence(chain.name)
361
+ for chain in self.MODEL:
362
+ polymer_ch = chain.get_polymer()
363
+ seq = polymer_ch.extract_sequence()
364
+ if seq:
365
+ subchain_id = polymer_ch.subchain_id()
366
+ if subchain_id in subchain_id2polymer:
367
+ out[chain.name] = subchain_id2polymer[subchain_id]
162
368
  return out
163
369
 
164
- @property
165
- def polymer_residue_numbers(self):
166
- cts = self.chain_types
370
+ def polymer_sequences(self, pdbx: bool = False):
167
371
  out = dict()
168
- id_type = np.dtype([
169
- ("ch_name", "U5"),
170
- ("res_num", "i4"),
171
- ("res_icode", "U3"),
172
- ("res_name", "U5"),
173
- ])
174
- for model in self.STRUCT:
175
- for chain in model:
176
- ct = cts.get(chain.name, "other")
177
- if ct != "other":
178
- out[chain.name] = np.array([(chain.name, r.seqid.num, r.seqid.icode, r.name)
179
- for r in chain.get_polymer()], dtype=id_type)
372
+ for ch, polymer_type in self.polymer_types.items():
373
+ polymer = self.get_chain(ch).get_polymer()
374
+ if pdbx:
375
+ s = gemmi.pdbx_one_letter_code(polymer.extract_sequence(), gemmi.sequence_kind(polymer_type))
376
+ else:
377
+ s = polymer.make_one_letter_sequence().replace("-", "")
378
+ out[ch] = s
180
379
  return out
181
380
 
182
- def chain_residues(self, polymer_only=True, with_water=False):
183
- """
184
- :param polymer_only, bool
185
- :param with_water:
186
- :return: dict of Three-letter codes of chain residues
187
- """
381
+ def get_subchain(self, subchain_id: str):
382
+ out = None
383
+ for ch in self.MODEL.subchains():
384
+ if ch.subchain_id() == subchain_id:
385
+ out = ch
386
+ break
387
+
388
+ if out is None:
389
+ raise ValueError("Sub-Chain %s not found (only [%s])" % (subchain_id, " ".join(self.subchain_ids)))
188
390
 
189
- out = dict()
190
- for model in self.STRUCT:
191
- for chain in model:
192
- res_codes = []
193
- for r in chain:
194
- if r.is_water():
195
- if with_water:
196
- res_codes.append(r.name)
197
- else:
198
- if polymer_only:
199
- if r.entity_type.name == "Polymer":
200
- res_codes.append(r.name)
201
- else:
202
- res_codes.append(r.name)
203
- out[chain.name] = res_codes
204
391
  return out
205
392
 
393
+ def get_chain(self, chain_id: str):
394
+ return self.MODEL[chain_id]
395
+
396
def pick_chains(self, chain_names: List[str]):
    """
    Build a new StructureParser holding only the requested chains
    :param chain_names: list of chain names, added to the new model in this order
    :return:
        StructureParser; resolution, method, deposition date and pdb id are carried
        over, the title is prefixed with the picked chain names, and description /
        source keep the entries whose entity name exists in the new structure
    """
    model = gemmi.Model(1)
    for name in chain_names:
        model.add_chain(self.get_chain(name))

    struct = gemmi.Structure()
    struct.name = self.STRUCT.name
    struct.add_model(model)

    # add basic information
    struct.resolution = self.STRUCT.resolution
    title = "(Chains %s): " % " ".join(chain_names) + self.INFO["title"]
    vals = {"_exptl.method": self.INFO["method"],
            "_struct.title": title,
            "_pdbx_database_status.recvd_initial_deposition_date": self.INFO["deposition_date"],
            }
    if self.INFO["pdb_id"] != "":
        vals["_entry.id"] = self.INFO["pdb_id"]
    struct.info = gemmi.InfoMap(vals)

    new_struct = StructureParser(struct)
    kept_names = [ent.name for ent in new_struct.STRUCT.entities]
    for field in ("description", "source"):
        known = self.INFO[field]
        new_struct.INFO[field] = {name: known[name] for name in kept_names if name in known}
    return new_struct
427
+
428
+ def _raw_marks(self):
429
+ subchain2chain = dict()
430
+ for chain in self.MODEL:
431
+ for sub_chain in chain.subchains():
432
+ subchain_id = sub_chain.subchain_id()
433
+ subchain2chain[subchain_id] = chain.name
434
+
435
+ entity2chains = dict()
436
+ for ent in self.STRUCT.entities:
437
+ val = [subchain2chain[sub_ch] for sub_ch in ent.subchains if sub_ch in subchain2chain]
438
+ if len(val) > 0:
439
+ entity2chains[ent.name] = val
440
+
441
+ mol_id = 1
442
+ n_line = 1
443
+ compound_mol = "COMPND {n_line:>3} MOL_ID: {mol_id};"
444
+ compound_molecule = "COMPND {n_line:>3} MOLECULE: {molecule};"
445
+ compound_chain = "COMPND {n_line:>3} CHAIN: {chain};"
446
+
447
+ outputs = []
448
+
449
+ for ent in self.STRUCT.entities:
450
+ if ent.entity_type.name == "Polymer":
451
+ chain = ", ".join(entity2chains[ent.name])
452
+
453
+ molecule = self.INFO["description"].get(ent.name, "")
454
+ if n_line == 1:
455
+ outputs.append("COMPND MOL_ID: {mol_id};".format(mol_id=mol_id))
456
+ else:
457
+ outputs.append(compound_mol.format(n_line=n_line, mol_id=mol_id))
458
+ n_line += 1
459
+
460
+ outputs.append(compound_molecule.format(n_line=n_line, molecule=molecule))
461
+ n_line += 1
462
+
463
+ outputs.append(compound_chain.format(n_line=n_line, chain=chain))
464
+ n_line += 1
465
+
466
+ mol_id += 1
467
+
468
+ mol_id = 1
469
+ n_line = 1
470
+ source_mol = "SOURCE {n_line:>3} MOL_ID: {mol_id};"
471
+ source_scientific = "SOURCE {n_line:>3} ORGANISM_SCIENTIFIC: {organism_scientific};"
472
+ source_taxid = "SOURCE {n_line:>3} ORGANISM_TAXID: {organism_taxid};"
473
+
474
+ for ent in self.STRUCT.entities:
475
+ if ent.entity_type.name == "Polymer":
476
+ src = self.INFO["source"].get(ent.name)
477
+ if src is None:
478
+ organism_scientific, organism_taxid = "", ""
479
+ else:
480
+ organism_scientific, organism_taxid = src
481
+
482
+ if n_line == 1:
483
+ outputs.append("SOURCE MOL_ID: {mol_id};".format(mol_id=mol_id))
484
+ else:
485
+ outputs.append(source_mol.format(n_line=n_line, mol_id=mol_id))
486
+ n_line += 1
487
+
488
+ outputs.append(source_scientific.format(n_line=n_line, organism_scientific=organism_scientific))
489
+ n_line += 1
490
+
491
+ outputs.append(source_taxid.format(n_line=n_line, organism_taxid=organism_taxid))
492
+ n_line += 1
493
+
494
+ mol_id += 1
495
+
496
+ resolution_remarks = ["REMARK 2",
497
+ "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % self.STRUCT.resolution
498
+ ]
499
+ outputs.extend(resolution_remarks)
500
+ return outputs
501
+
502
def to_pdb(self, outfile: str, write_minimal_pdb=False):
    """
    Write a copy of the structure to a PDB file
    :param outfile: str, output path
    :param write_minimal_pdb: bool, when True use gemmi's write_minimal_pdb and add
        no header lines; otherwise the lines from _raw_marks() are set as raw remarks
    :return:
    """
    struct = self.STRUCT.clone()
    if write_minimal_pdb:
        struct.write_minimal_pdb(outfile)
        return

    struct.raw_remarks = self._raw_marks()
    struct.write_pdb(outfile)
509
+
510
+ @staticmethod
511
+ def _item_index(block: gemmi.cif.Block, tag: str):
512
+ mapper = dict()
513
+ for idx, item in enumerate(block):
514
+ if item.loop is not None:
515
+ keys = item.loop.tags
516
+ for k in keys:
517
+ mapper[k] = idx
518
+ elif item.pair is not None:
519
+ key = item.pair[0]
520
+ mapper[key] = idx
521
+ return mapper.get(tag)
522
+
523
def to_cif(self, outfile: str):
    """
    Write the structure to an mmCIF file, enriched with values kept in INFO

    On top of the block produced by gemmi this sets _refine.ls_d_res_high from
    INFO["resolution"], fills _entity.pdbx_description from INFO["description"]
    and adds an _entity_src_gen loop built from INFO["source"].
    :param outfile: str, output path
    :return:
    """
    quote = gemmi.cif.quote
    block = self.STRUCT.make_mmcif_block()

    # resolution
    block.set_pair(tag="_refine.ls_d_res_high", value=quote(str(self.INFO["resolution"])))

    # entity description: read the _entity. table and set the description column
    entity_table = block.find_mmcif_category(category="_entity.")
    entity_df = pd.DataFrame(list(entity_table), columns=list(entity_table.tags))

    def _description(entity_id):
        return quote(self.INFO["description"].get(entity_id, "?"))

    entity_df["_entity.pdbx_description"] = entity_df["_entity.id"].apply(_description)
    entity_rows = entity_df.to_numpy().tolist()
    entity_tags = [col.replace("_entity.", "") for col in entity_df.columns.tolist()]

    # drop the loop written by gemmi, then write the extended one
    old_item = block.find_loop_item("_entity.id")
    if isinstance(old_item, gemmi.cif.Item):
        old_item.erase()

    entity_loop = block.init_loop(prefix="_entity.", tags=entity_tags)
    for row in entity_rows:
        entity_loop.add_row(row)

    # place _entity. before _entity_poly.
    entity_pos = self._item_index(block, tag="_entity.id")
    poly_pos = self._item_index(block, tag="_entity_poly.entity_id")
    if isinstance(entity_pos, int) and isinstance(poly_pos, int):
        block.move_item(entity_pos, poly_pos - 1)

    # source name and taxid; empty values are written as "?"
    src_loop = block.init_loop(prefix="_entity_src_gen.",
                               tags=["entity_id",
                                     "pdbx_gene_src_scientific_name",
                                     "pdbx_gene_src_ncbi_taxonomy_id"])
    for entity_id, (name, taxid) in self.INFO["source"].items():
        if name == "":
            name = "?"
        if taxid == "":
            taxid = "?"
        src_loop.add_row([quote(entity_id), quote(name), quote(taxid)])

    # place _entity_src_gen. after _entity_poly_seq.
    src_pos = self._item_index(block, tag="_entity_src_gen.entity_id")
    poly_seq_pos = self._item_index(block, tag="_entity_poly_seq.entity_id")
    if isinstance(src_pos, int) and isinstance(poly_seq_pos, int):
        block.move_item(src_pos, poly_seq_pos + 1)

    block.write_file(outfile)
585
+
206
586
  def update_entity(self):
207
587
  """
208
588
  Update ENTITY, .entities .assemblies according to subchains
209
589
  :return:
210
590
  """
211
- subchains = []
212
- for model in self.STRUCT:
213
- for chain in model:
214
- subchains.extend([sc.subchain_id() for sc in chain.subchains()])
591
+ subchains = self.subchain_ids
215
592
 
216
593
  # update .entities
217
594
  new_entities = gemmi.EntityList()
@@ -224,15 +601,9 @@ class StructureParser(object):
224
601
  ent_names.append(ent.name)
225
602
  self.STRUCT.entities = new_entities
226
603
 
227
- # update .ENTITY
228
- for super_key in ["eid2desc", "eid2specie", "eid2taxid"]:
229
- for eid in list(self.ENTITY[super_key].keys()):
230
- if eid not in ent_names:
231
- del self.ENTITY[super_key][eid]
232
-
233
- for cid, eid in list(self.ENTITY["polymer2eid"].items()):
234
- if eid not in ent_names or cid not in self.chain_ids:
235
- del self.ENTITY["polymer2eid"][cid]
604
+ # update INFO
605
+ self.INFO["description"] = {k: v for k, v in self.INFO["description"].items() if k in ent_names}
606
+ self.INFO["source"] = {k: v for k, v in self.INFO["source"].items() if k in ent_names}
236
607
 
237
608
  # update .assemblies
238
609
  all_cid = self.chain_ids
@@ -262,189 +633,257 @@ class StructureParser(object):
262
633
  for dai in del_assembly_indexes:
263
634
  del self.STRUCT.assemblies[dai]
264
635
 
265
- @typechecked
266
636
def rename_chain(self, origin_name: str, target_name: str):
    """
    Rename a chain and rewrite the chain lists of every assembly generator.

    :param origin_name: name of an existing chain
    :param target_name: new name, must not be used by any other chain
    :return:
        None
    :raises ValueError: if origin_name is not a chain of the structure,
        or target_name is already the name of a different chain
    """
    known_names = self.chain_ids
    if origin_name not in known_names:
        raise ValueError("Chain %s not found" % origin_name)

    # renaming a chain to its own name is allowed, clashing with another chain is not
    if target_name != origin_name and target_name in known_names:
        raise ValueError("Chain %s has existed, please set a different target_name." % target_name)

    self.STRUCT.rename_chain(origin_name, target_name)

    # assembly generators refer to chains by name, keep them in sync
    for assembly in self.STRUCT.assemblies:
        for generator in assembly.generators:
            generator.chains = [target_name if name == origin_name else name
                                for name in generator.chains]
287
651
 
288
- @typechecked
289
- def switch_chain_names(self, chain_name_1: str, chain_name_2: str):
652
def swap_chain_names(self, chain_name_1: str, chain_name_2: str):
    """
    Exchange the names of two chains.

    The swap goes through a temporary random 4-character name that is not
    used by any chain, because rename_chain refuses a name that is taken.

    :param chain_name_1: name of an existing chain
    :param chain_name_2: name of an existing chain
    :return:
        None
    :raises ValueError: if either chain is not in the structure
    """
    if chain_name_1 not in self.chain_ids:
        raise ValueError("Chain %s not found" % chain_name_1)
    if chain_name_2 not in self.chain_ids:
        raise ValueError("Chain %s not found" % chain_name_2)

    # Swapping a chain with itself is a no-op. Running the three renames
    # below would move the chain to the temporary name and then fail on the
    # second rename, leaving the structure with a random chain name.
    if chain_name_1 == chain_name_2:
        return

    characters = string.ascii_letters + string.digits
    while True:
        sw_name = ''.join(random.choices(characters, k=4))
        if sw_name not in self.chain_ids:
            break

    self.rename_chain(chain_name_1, sw_name)
    self.rename_chain(chain_name_2, chain_name_1)
    self.rename_chain(sw_name, chain_name_2)
304
668
 
305
- @typechecked
306
- def pick_chains(self, chain_names: List[str]):
307
- self.set_default_model()
669
def make_one_letter_chain(self, only_uppercase: bool = True):
    """
    Rename every chain whose name is not an allowed one-letter name.

    Chains that already carry an allowed one-letter name are left untouched.
    The others receive the free names in the order A..Z when only_uppercase
    is True, otherwise A..Z, then a..z, then 0..9.

    :param only_uppercase: if True, only uppercase letters are allowed names;
        otherwise digits and lowercase letters are allowed as well
    :return:
        dict, original chain name -> new one-letter name (renamed chains only)
    :raises RuntimeError: if there are more chains than allowed names
    """
    # Every pool is kept in descending order, so list.pop() below hands the
    # names out in ascending order (A first).
    uppercase_letters = sorted(string.ascii_uppercase, reverse=True)

    if only_uppercase:
        letters = uppercase_letters
        msg = "The number of chains exceed the number of uppercase letters: %d > %d"
    else:
        lowercase_letters = sorted(string.ascii_lowercase, reverse=True)
        digit_letters = sorted(string.digits, reverse=True)
        letters = digit_letters + lowercase_letters + uppercase_letters
        msg = "The number of chains exceed the number of one-letter characters: %d > %d"

    # read the chain names once, before anything is renamed
    chain_ids = list(self.chain_ids)
    if len(chain_ids) > len(letters):
        raise RuntimeError(msg % (len(chain_ids), len(letters)))

    used_names = set(chain_ids)
    allowed_names = set(letters)
    free_letters = [letter for letter in letters if letter not in used_names]
    mapper = {ch: free_letters.pop() for ch in chain_ids if ch not in allowed_names}

    for origin_name, target_name in mapper.items():
        self.rename_chain(origin_name, target_name)
    return mapper
329
700
 
330
- @typechecked
331
- def get_assembly(self, assembly_name: str):
701
def get_assembly(self, assembly_name: str,
                 how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
    """
    Build a new StructureParser holding the requested assembly.

    The structure is cloned and transformed to the assembly, its title is
    prefixed with "(Assembly <name>): ", and the description / source
    records of INFO are carried over for every entity of the new structure
    that matches an entity of this structure.

    :param assembly_name: name of one of self.assembly_names
    :param how: gemmi.HowToNameCopiedChain, passed to transform_to_assembly
    :return:
        StructureParser
    :raises ValueError: if assembly_name is not in self.assembly_names
    """
    if assembly_name not in self.assembly_names:
        raise ValueError("Assembly %s not found (only [%s])" % (assembly_name, ", ".join(self.assembly_names)))

    struct = self.STRUCT.clone()
    struct.transform_to_assembly(assembly_name, how)
    title_prefix = "(Assembly %s): " % assembly_name
    struct.info["_struct.title"] = title_prefix + struct.info["_struct.title"]

    new_struct = StructureParser(struct)

    def _is_same_entity(candidate, reference):
        # Same entity type is required; polymers are then compared by their
        # full sequence, anything else by the extracted sequence of the
        # first subchain of each entity.
        if candidate.entity_type == reference.entity_type:
            if reference.entity_type.name == "Polymer":
                return candidate.full_sequence == reference.full_sequence
            candidate_seq = new_struct.get_subchain(candidate.subchains[0]).extract_sequence()
            reference_seq = self.get_subchain(reference.subchains[0]).extract_sequence()
            return candidate_seq == reference_seq
        return False

    # new entity name -> name of the first matching entity of this structure
    entity_mapper = dict()
    for new_ent in new_struct.STRUCT.entities:
        for ent in self.STRUCT.entities:
            if _is_same_entity(new_ent, ent):
                entity_mapper[new_ent.name] = ent.name
                break

    # carry over the INFO records of the matched entities
    desc = dict()
    src = dict()
    for ent in new_struct.STRUCT.entities:
        if ent.name not in entity_mapper:
            continue
        origin_name = entity_mapper[ent.name]
        if origin_name in self.INFO["description"]:
            desc[ent.name] = self.INFO["description"][origin_name]
        if origin_name in self.INFO["source"]:
            src[ent.name] = self.INFO["source"][origin_name]

    new_struct.INFO["description"] = desc
    new_struct.INFO["source"] = src
    return new_struct
352
742
 
353
- [No fix the Entity and some other information of structure]
354
- :param chains:
743
def clean_structure(self, remove_ligand=True):
    """
    Remove waters (always) and ligands (optionally), then drop the chains
    left empty and refresh the entity records.

    :param remove_ligand: bool, if True ligands are removed together with
        the waters; if False only waters are removed and ligands are kept
    :return:
        None
    """
    # The two branches used to be swapped, so remove_ligand=True kept the
    # ligands and remove_ligand=False removed them.
    if remove_ligand:
        self.STRUCT.remove_ligands_and_waters()
    else:
        self.STRUCT.remove_waters()

    self.STRUCT.remove_empty_chains()
    self.update_entity()
427
757
 
428
- def clean_structure(self, keep_ligand=True):
758
def met_to_mse(self):
    """
    Rename every MET residue of self.MODEL to MSE, in place.

    Inside each renamed residue, atoms named SD are renamed to SE and their
    element is set to Se.

    :return:
        None
    """
    methionines = (residue
                   for chain in self.MODEL
                   for residue in chain
                   if residue.name == 'MET')
    for residue in methionines:
        residue.name = 'MSE'
        for atom in residue:
            if atom.name != 'SD':
                continue
            atom.name = 'SE'
            atom.element = gemmi.Element('Se')
767
+
768
def get_atoms(self, arg: str = "*"):
    """
    Collect the atoms of self.STRUCT matched by a gemmi selection.

    :param arg: str, selection string such as "*", "/1/*//N,CA,C,O", "/1/*"
        see gemmi.Selection
    :return:
        np.ndarray, one record per selected atom with the fields
        chain_name, residue_num, residue_icode, residue_name, atom_name,
        element, charge, b_factor, occupancy, coordinate (3 x float32)
    """
    record_type = [("chain_name", "U5"),
                   ("residue_num", "i4"),
                   ("residue_icode", "U3"),
                   ("residue_name", "U5"),
                   ("atom_name", "U5"),
                   ("element", "U3"),
                   ("charge", "i1"),
                   ("b_factor", "f4"),
                   ("occupancy", "f4"),
                   ("coordinate", ("f4", (3,)))
                   ]

    selection = gemmi.Selection(arg)
    # walk models -> chains -> residues -> atoms, restricted by the selection
    records = [
        (chain.name,
         residue.seqid.num,
         residue.seqid.icode,
         residue.name,
         atom.name,
         atom.element.name,
         atom.charge,
         atom.b_iso,
         atom.occ,
         tuple(atom.pos.tolist()),
         )
        for model in selection.models(self.STRUCT)
        for chain in selection.chains(model)
        for residue in selection.residues(chain)
        for atom in selection.atoms(residue)
    ]
    return np.array(records, dtype=record_type)
808
+
809
def polymer_interface_residues(self,
                               chains_x: List[str],
                               chains_y: List[str],
                               threshold: float = 4.5):
    """
    Identify the interface residues between two groups of polymer chains.

    Only a fixed list of N, O, S and P atom names (protein backbone and side
    chain atoms plus nucleic acid atoms) is considered. A residue is
    reported when one of these atoms lies within threshold of such an atom
    of the other group. The search runs on a cleaned deep copy, so the
    structure itself is not modified.

    :param chains_x: chain names of the first group
    :param chains_y: chain names of the second group
    :param threshold: float, distance cutoff passed to the KD-tree search
    :return:
        interface residues of chains_x, interface residues of chains_y;
        each a unique structured array with the fields chain_name,
        residue_num, residue_icode, residue_name
    :raises ValueError: if a chain is not in the structure or is not a polymer
    """
    for ch in chains_x + chains_y:
        if ch not in self.chain_ids:
            raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(self.chain_ids)))
        if ch not in self.polymer_types:
            raise ValueError("Chain %s is not a polymer (only [%s])"
                             % (ch, " ".join(list(self.polymer_types.keys())))
                             )

    # atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
    protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
                     'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
                     'SD', 'SG']
    xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
                 'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
                 'OP1', 'OP2', 'OP3', 'P']
    atom_field = ",".join(protein_atoms + xna_atoms)
    residue_fields = ["chain_name", "residue_num", "residue_icode", "residue_name"]

    # work on a copy, cleaning must not touch the structure itself
    query_struct = deepcopy(self)
    query_struct.clean_structure(remove_ligand=True)

    atom_x = query_struct.get_atoms("/1/%s//%s" % (",".join(chains_x), atom_field))
    atom_y = query_struct.get_atoms("/1/%s//%s" % (",".join(chains_y), atom_field))

    tree_x = cKDTree(atom_x["coordinate"])
    tree_y = cKDTree(atom_y["coordinate"])
    # rows index atoms of group x, columns index atoms of group y
    contacts = tree_x.sparse_distance_matrix(tree_y, threshold, output_type='coo_matrix')

    x_res = np.unique(atom_x[contacts.row][residue_fields])
    y_res = np.unique(atom_y[contacts.col][residue_fields])
    return x_res, y_res
855
+
856
def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
    """
    Run polymer_interface_residues on every pair of polymer chains.

    :param ppi_threshold: float, distance cutoff forwarded as threshold
    :param n_cpus: int, upper bound of joblib workers; the pairs are only
        processed in parallel when more than one worker is usable and there
        are at least 50 pairs
    :return:
        dict, "<chain 1>/<chain 2>" -> [residues of chain 1, residues of chain 2],
        each residue written as "<chain>/<number>/<insertion code>/<name>";
        pairs without any interface residue are omitted
    """
    ch_pairs = sorted(itertools.combinations(list(self.polymer_types.keys()), r=2))

    def _as_labels(residues):
        # one "<chain>/<number>/<icode>/<name>" label per residue record
        return ["%s/%d/%s/%s" % (ch, num, icode.strip(), name)
                for ch, num, icode, name in residues.tolist()]

    def _run(ch_1, ch_2):
        res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
        if len(res_x) > 0:
            return {"%s/%s" % (ch_1, ch_2): [_as_labels(res_x), _as_labels(res_y)]}
        return dict()

    cpu2use = max(min(n_cpus, len(ch_pairs)), 1)

    if cpu2use != 1 and len(ch_pairs) >= 50:
        results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
    else:
        # few pairs or a single worker: not worth spawning processes
        results = [_run(c1, c2) for c1, c2 in ch_pairs]

    outputs = dict()
    for item in results:
        outputs.update(item)
    return outputs