gemmi-protools 0.1.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -1,217 +1,598 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
1
+ import gzip
2
+ import io
3
+ import itertools
4
4
  import pathlib
5
+ import random
5
6
  import string
6
- import warnings
7
+ from collections import defaultdict
7
8
  from copy import deepcopy
8
- from typing import Union, Optional, List
9
+ from typing import Dict, Optional, List
9
10
 
10
11
  import gemmi
11
12
  import numpy as np
12
- from typeguard import typechecked
13
+ import pandas as pd
14
+ from joblib import Parallel, delayed
15
+ from scipy.spatial import cKDTree
13
16
 
14
- from gemmi_protools.io.cif_opts import _cif_block_for_output, _is_cif
15
- from gemmi_protools.io.parser import (_assign_digital_entity_names, _ent_from_structure,
16
- pdb_parser, cif_parser, _chain_type, _chain_names2one_letter,
17
- _assert_unique_chain_names_in_models, get_assembly)
18
- from gemmi_protools.io.pdb_opts import _compound_source_string, _is_pdb
19
- from gemmi_protools.io.peptide import nucleic_3to1_mapper, protein_3to1_mapper
20
- from gemmi_protools.io.struct_info import Info
17
+
18
def is_pdb(path: str) -> bool:
    """
    Tell whether a path names a PDB file, judging by the file name only.

    The file is never opened; the comparison is case-sensitive.

    :param path: file path to inspect
    :return:
        bool, True when the name ends with ".pdb" or ".pdb.gz"
    """
    suffixes = pathlib.Path(path).suffixes
    if not suffixes:
        # no extension at all
        return False
    return suffixes[-1] == ".pdb" or "".join(suffixes[-2:]) == ".pdb.gz"
36
+
37
+
38
def is_cif(path: str) -> bool:
    """
    Tell whether a path names an mmCIF file, judging by the file name only.

    The file is never opened; the comparison is case-sensitive.

    :param path: file path to inspect
    :return:
        bool, True when the name ends with ".cif" or ".cif.gz"
    """
    suffixes = pathlib.Path(path).suffixes
    if not suffixes:
        # no extension at all
        return False
    return suffixes[-1] == ".cif" or "".join(suffixes[-2:]) == ".cif.gz"
56
+
57
+
58
def parse_cif(path: str) -> dict:
    """
    Parse an mmCIF file into a gemmi structure plus a metadata dictionary.

    :param path: str, path to a .cif or .cif.gz file
    :return:
        dict with two keys
        structure: gemmi.Structure, entities set up and serial numbers assigned
        info: dict with keys description, source, resolution, pdb_id, method,
              deposition_date and title
    :raises TypeError: when path does not end with .cif or .cif.gz
    """
    if not is_cif(path):
        raise TypeError("Input file is not a cif file [.cif or .cif.gz]: %s" % path)

    doc = gemmi.cif.Document()
    st = gemmi.read_structure(path, save_doc=doc)
    st.setup_entities()
    st.assign_serial_numbers()
    block = doc.sole_block()

    def _text(value) -> str:
        # With raw=False gemmi reports the CIF placeholders "?" and "." as None
        # and False. Collapse every non-string cell to "" so that names and
        # taxids are always strings for the consumers of info.
        return value if isinstance(value, str) else ""

    def _read_src(query_block, category, name_col, taxid_col):
        # entity id -> [scientific name, taxonomy id] for one source category;
        # empty when the category or one of the three columns is missing.
        table = query_block.get_mmcif_category(name=category, raw=False)
        wanted = ("entity_id", name_col, taxid_col)
        if not all(col in table for col in wanted):
            return dict()
        return {_text(eid): [_text(name), _text(taxid)]
                for eid, name, taxid in zip(*(table[col] for col in wanted))}

    entity = block.get_mmcif_category(name="_entity.", raw=False)
    entityid2description = dict()
    if "id" in entity and "pdbx_description" in entity:
        entityid2description = {_text(eid): _text(text)
                                for eid, text in zip(entity["id"], entity["pdbx_description"])}

    src_1 = _read_src(block, "_entity_src_gen.",
                      "pdbx_gene_src_scientific_name",
                      "pdbx_gene_src_ncbi_taxonomy_id")
    src_2 = _read_src(block, "_pdbx_entity_src_syn.",
                      "organism_scientific",
                      "ncbi_taxonomy_id")
    src_3 = _read_src(block, "_entity_src_nat.",
                      "pdbx_organism_scientific",
                      "pdbx_ncbi_taxonomy_id")

    # Priority: _entity_src_gen, then _pdbx_entity_src_syn, then _entity_src_nat;
    # an entity already described by an earlier category is not overwritten.
    entityid2src = dict()
    for src in (src_1, src_2, src_3):
        for k, v in src.items():
            entityid2src.setdefault(k, v)

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
                source=entityid2src,
                resolution=st.resolution,
                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                method=info_map.get("_exptl.method", "").lower(),
                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                title=info_map.get("_struct.title", "")
                )
    return dict(structure=st, info=info)
123
+
124
+
125
def molecule_description(path: str):
    """
    Read the COMPND and SOURCE records of a PDB file (.pdb or .pdb.gz).

    Each record is parsed as "TOKEN: value" pairs grouped by MOL_ID; a line
    without a colon is appended to the value of the preceding token.

    :param path: path to a .pdb or .pdb.gz file
    :return:
        dict with three keys
        description: MOL_ID -> value of the MOLECULE token
        source: MOL_ID -> [ORGANISM_SCIENTIFIC, ORGANISM_TAXID], kept when at
                least one of the two is not "", "?" or "."
        ch_id2mol_id: chain ID (from the CHAIN token) -> MOL_ID
    :raises ValueError: when path does not end with .pdb or .pdb.gz
    """
    if not is_pdb(path):
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)

    if pathlib.Path(path).suffixes[-1] == ".pdb":
        with open(path, "r") as text_io:
            lines = text_io.readlines()
    else:
        with gzip.open(path, "rt", encoding="utf-8") as text_io:
            lines = text_io.readlines()

    values = {"COMPND": defaultdict(dict),
              "SOURCE": defaultdict(dict),
              }

    comp_molid = ""
    last_comp_key = ""
    last_record = ""

    for raw_line in lines:
        h = raw_line.strip()
        key = h[:6].strip()
        if key not in values:
            continue

        if key != last_record:
            # Entering the other record type: a continuation line must not be
            # appended to a token that only exists in the previous record type.
            last_comp_key = ""
            last_record = key

        tt = h[10:].strip().strip(";")
        # Split on the first colon only, so values that contain a colon
        # (e.g. "ESCHERICHIA COLI O157:H7") are kept whole.
        ckey, sep, cval = tt.partition(":")
        if sep:
            ckey = ckey.lower().strip()
            cval = cval.strip()
            if ckey == "mol_id":
                comp_molid = cval
                values[key][comp_molid] = dict()
                # the fresh MOL_ID has no token to continue yet
                last_comp_key = ""
            else:
                values[key][comp_molid][ckey] = cval
                last_comp_key = ckey
        elif last_comp_key != "":
            values[key][comp_molid][last_comp_key] += " " + tt

    outputs = dict(description=dict(),
                   source=dict())

    ch_id2mol_id = dict()
    for mol_id, val in values["COMPND"].items():
        for ch in val.get("chain", "").split(","):
            ch = ch.strip()
            if ch:  # ignore empty items, e.g. after a trailing comma
                ch_id2mol_id[ch] = mol_id

        m = val.get("molecule", "").strip()
        if m != "":
            outputs["description"][mol_id] = m

    placeholders = ("", "?", ".")
    for mol_id, val in values["SOURCE"].items():
        name = val.get("organism_scientific", "").strip()
        taxid = val.get("organism_taxid", "").strip()
        if name not in placeholders or taxid not in placeholders:
            outputs["source"][mol_id] = [name, taxid]

    outputs["ch_id2mol_id"] = ch_id2mol_id
    return outputs
193
+
194
+
195
def parse_pdb(path: str) -> dict:
    """
    Parse a PDB file into a gemmi structure plus a metadata dictionary.

    Descriptions and sources come from the COMPND/SOURCE records read by
    molecule_description; entities are then renamed by
    assign_digital_entity_names and the metadata keys follow that renaming.

    :param path: str, path to a .pdb or .pdb.gz file
    :return:
        dict with two keys
        structure: gemmi.Structure, entities set up and serial numbers assigned
        info: dict with keys description, source, resolution, pdb_id, method,
              deposition_date and title
    :raises TypeError: when path does not end with .pdb or .pdb.gz
    """
    if not is_pdb(path):
        raise TypeError("Input file is not a pdb file [.pdb or .pdb.gz]: %s" % path)

    st = gemmi.read_structure(path)
    st.setup_entities()
    st.assign_serial_numbers()

    values = molecule_description(path)
    ch_id2mol_id = values["ch_id2mol_id"]

    # Entity names are looked up as chain IDs of the COMPND records.
    # Map entity name -> MOL_ID (not the reverse), so that several entities
    # belonging to the same MOL_ID all keep their description and source.
    entity_name2mol_id = {ent.name: ch_id2mol_id[ent.name]
                          for ent in st.entities
                          if ent.name in ch_id2mol_id}

    description = {name: values["description"][mol_id]
                   for name, mol_id in entity_name2mol_id.items()
                   if mol_id in values["description"]}
    # add ligand and water entity description
    # gemmi use ligand name or water as entity name, take this as description
    for ent in st.entities:
        if (ent.name not in description
                and ent.polymer_type.name == "Unknown"
                and len(ent.name) > 1):
            description[ent.name] = ent.name

    source = {name: values["source"][mol_id]
              for name, mol_id in entity_name2mol_id.items()
              if mol_id in values["source"]}

    # assign digital entity names; mapper is original name -> new name
    mapper = assign_digital_entity_names(st)

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = dict(description={mapper.get(k, k): v for k, v in description.items()},
                source={mapper.get(k, k): v for k, v in source.items()},
                resolution=st.resolution,
                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                method=info_map.get("_exptl.method", "").lower(),
                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                title=info_map.get("_struct.title", ""),
                )
    return dict(structure=st, info=info)
240
+
241
+
242
def assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
    """
    Rename the entities to "1", "2", ... unless every name is already numeric.

    :param structure: gemmi.Structure, its entities are renamed in place
    :return:
        dict, original entity name to new digital entity name;
        empty when no renaming took place
    """
    mapper = dict()
    if all(ent.name.isdigit() for ent in structure.entities):
        # already digital (or no entities at all): leave the names alone
        return mapper

    for number, ent in enumerate(structure.entities, start=1):
        digital = str(number)
        mapper[ent.name] = digital
        ent.name = digital
    return mapper
21
257
 
22
258
 
23
259
  class StructureParser(object):
24
260
  """
25
- Enhance Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
261
+ Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
262
+
263
+ Read the first model
26
264
  """
27
265
 
28
- def __init__(self, structure: gemmi.Structure = None):
266
+ def __init__(self, structure: Optional[gemmi.Structure] = None):
29
267
  if not isinstance(structure, (type(None), gemmi.Structure)):
30
268
  raise ValueError("structure must be gemmi.Structure or None")
31
269
  if structure is None:
270
+ # init with an empty model
32
271
  self.STRUCT = gemmi.Structure()
272
+ self.MODEL = gemmi.Model(1)
273
+ self.STRUCT.add_model(self.MODEL)
33
274
  elif isinstance(structure, gemmi.Structure):
34
- _assert_unique_chain_names_in_models(structure)
35
275
  self.STRUCT = structure.clone()
36
276
  else:
37
277
  raise ValueError("structure must be gemmi.Structure or None")
38
- self.STRUCT.setup_entities()
39
- _assign_digital_entity_names(self.STRUCT)
40
278
 
41
- self.INFO = Info()
42
- self.INFO.from_gemmi_structure_infomap(self.STRUCT.info)
43
- self.ENTITY = _ent_from_structure(self.STRUCT)
279
+ self._init_struct()
280
+
281
+ info_map = dict(self.STRUCT.info)
282
+ pdb_code = info_map.get("_entry.id", "").lower()
283
+ self.INFO = dict(description=dict(),
284
+ source=dict(),
285
+ resolution=self.STRUCT.resolution,
286
+ pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
287
+ method=info_map.get("_exptl.method", "").lower(),
288
+ deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
289
+ title=info_map.get("_struct.title", ""),
290
+ )
44
291
  self.update_entity()
45
- self.update_full_sequences()
46
-
47
- def update_full_sequences(self):
48
- for ent_idx, ent in enumerate(self.STRUCT.entities):
49
- # get full sequence
50
- full_seq = ent.full_sequence
51
-
52
- # when missing, construct from Residues
53
- if not full_seq:
54
- sel_ch_id = None
55
- sel_ch_len = 0
56
- for ch_id, ent_id in self.ENTITY.polymer2eid.items():
57
- if ent_id == ent.name:
58
- cur_len = len(self.polymer_sequences[ch_id])
59
- if cur_len > sel_ch_len:
60
- sel_ch_id = ch_id
61
- sel_ch_len = cur_len
62
-
63
- if sel_ch_id is not None and sel_ch_len > 0:
64
- full_seq = [r.name for r in self.STRUCT[0][sel_ch_id].get_polymer() if not r.is_water()]
65
- self.STRUCT.entities[ent_idx].full_sequence = full_seq
66
-
67
- @typechecked
68
- def load_from_file(self, path: Union[str, pathlib.PosixPath]):
69
- if _is_pdb(path):
70
- struct, entity = pdb_parser(path)
71
- elif _is_cif(path):
72
- struct, entity = cif_parser(path)
73
- else:
74
- raise ValueError("Only support .cif, .cif.gz, .pdb or .pdb.gz file, but got %s" % path)
75
-
76
- _assert_unique_chain_names_in_models(struct)
77
- self.STRUCT, self.ENTITY = struct, entity
78
- self.INFO.from_gemmi_structure_infomap(self.STRUCT.info)
79
- self.update_entity()
80
- self.update_full_sequences()
81
-
82
- @typechecked
83
- def to_pdb(self, outfile: str, write_minimal_pdb=False):
84
- compound_source = _compound_source_string(self.ENTITY)
85
- struct = self.STRUCT.clone()
86
-
87
- rs = "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % struct.resolution
88
- resolution_remarks = ["%-80s" % "REMARK 2",
89
- "%-80s" % rs]
90
-
91
- struct.raw_remarks = compound_source + resolution_remarks
92
- if write_minimal_pdb:
93
- struct.write_minimal_pdb(outfile)
94
- else:
95
- struct.write_pdb(outfile)
96
292
 
97
- @typechecked
98
- def to_cif(self, outfile: str):
99
- out_block = _cif_block_for_output(self.STRUCT, self.ENTITY)
100
- out_block.write_file(outfile)
293
+ def _init_struct(self):
294
+ self.STRUCT.setup_entities()
295
+ self.STRUCT.assign_serial_numbers()
296
+ self.STRUCT.renumber_models()
101
297
 
102
- @property
103
- def chain_ids(self):
104
- vals = []
105
- for m in self.STRUCT:
106
- for c in m:
107
- vals.append(c.name)
108
- vals.sort()
109
- return vals
298
+ # keep the first model
299
+ if len(self.STRUCT) > 1:
300
+ for idx in reversed(list(range(1, len(self.STRUCT)))):
301
+ del self.STRUCT[idx]
110
302
 
111
- @property
112
- def model_numbers(self):
113
- return [m.num for m in self.STRUCT]
303
+ self.MODEL = self.STRUCT[0]
304
+ self.STRUCT.remove_alternative_conformations()
305
+ self.STRUCT.remove_hydrogens()
306
+ self.STRUCT.remove_empty_chains()
307
+ self._update_full_sequences()
114
308
 
115
- @typechecked
116
- def set_default_model(self, num: Optional[int] = None):
309
+ def load_from_file(self, path: str):
117
310
  """
118
- Set the first model as default
119
- :param num:
311
+ Load model from file, default use the first model.
312
+ :param path:
120
313
  :return:
121
314
  """
122
- if len(self.STRUCT) == 0:
123
- raise RuntimeError("There is no model in structure")
124
-
125
- keep_model = None
126
- if num is None:
127
- # default first model
128
- keep_model = self.STRUCT[0]
315
+ if is_pdb(path):
316
+ val = parse_pdb(path)
317
+ self.STRUCT, self.INFO = val["structure"], val["info"]
318
+ elif is_cif(path):
319
+ val = parse_cif(path)
320
+ self.STRUCT, self.INFO = val["structure"], val["info"]
129
321
  else:
130
- for model in self.STRUCT:
131
- if model.num == num:
132
- keep_model = model
133
- break
322
+ raise ValueError("path must be files with suffixes [ .cif, .cif.gz, .pdb or .pdb.gz]")
134
323
 
135
- if keep_model is None:
136
- raise RuntimeError("Model %d not found in structure" % num)
324
+ self._init_struct()
325
+ self.update_entity()
326
+
327
+ def _update_full_sequences(self):
328
+ for idx, ent in enumerate(self.STRUCT.entities):
329
+ if ent.entity_type.name == "Polymer":
330
+ self.STRUCT.entities[idx].full_sequence = [gemmi.Entity.first_mon(item) for item in ent.full_sequence]
137
331
 
138
- # del, reversed order indexes
139
- indexes_to_del = [i for i, model in enumerate(self.STRUCT) if model.num != keep_model.num]
140
- indexes_to_del.sort(reverse=True)
332
+ if len(ent.full_sequence) == 0:
333
+ sc = self.get_subchain(ent.subchains[0])
334
+ self.STRUCT.entities[idx].full_sequence = sc.extract_sequence()
141
335
 
142
- for cur_index in indexes_to_del:
143
- del self.STRUCT[cur_index]
336
+ @property
337
+ def chain_ids(self):
338
+ return [ch.name for ch in self.MODEL]
144
339
 
145
340
  @property
146
- def chain_types(self):
147
- return {c: _chain_type(self.STRUCT, c) for c in self.chain_ids}
341
+ def subchain_ids(self):
342
+ return [ch.subchain_id() for ch in self.MODEL.subchains()]
148
343
 
149
344
  @property
150
345
  def assembly_names(self):
151
346
  return [assem.name for assem in self.STRUCT.assemblies]
152
347
 
153
348
  @property
154
- def polymer_sequences(self):
155
- cts = self.chain_types
349
+ def polymer_types(self):
350
+ subchain_id2polymer = dict()
351
+ for ent in self.STRUCT.entities:
352
+ if ent.entity_type.name == "Polymer":
353
+ for ch in ent.subchains:
354
+ subchain_id2polymer[ch] = ent.polymer_type
355
+
156
356
  out = dict()
157
- for model in self.STRUCT:
158
- for chain in model:
159
- ct = cts.get(chain.name, "other")
160
- if ct != "other":
161
- out[chain.name] = self.make_one_letter_sequence(chain.name)
357
+ for chain in self.MODEL:
358
+ polymer_ch = chain.get_polymer()
359
+ seq = polymer_ch.extract_sequence()
360
+ if seq:
361
+ subchain_id = polymer_ch.subchain_id()
362
+ if subchain_id in subchain_id2polymer:
363
+ out[chain.name] = subchain_id2polymer[subchain_id]
162
364
  return out
163
365
 
164
- @property
165
- def polymer_residue_numbers(self):
166
- cts = self.chain_types
366
+ def polymer_sequences(self, pdbx: bool = False):
167
367
  out = dict()
168
- id_type = np.dtype([
169
- ("ch_name", "U5"),
170
- ("res_num", "i4"),
171
- ("res_icode", "U3"),
172
- ("res_name", "U5"),
173
- ])
174
- for model in self.STRUCT:
175
- for chain in model:
176
- ct = cts.get(chain.name, "other")
177
- if ct != "other":
178
- out[chain.name] = np.array([(chain.name, r.seqid.num, r.seqid.icode, r.name)
179
- for r in chain.get_polymer()], dtype=id_type)
368
+ for ch, polymer_type in self.polymer_types.items():
369
+ polymer = self.get_chain(ch).get_polymer()
370
+ if pdbx:
371
+ s = gemmi.pdbx_one_letter_code(polymer.extract_sequence(), gemmi.sequence_kind(polymer_type))
372
+ else:
373
+ s = polymer.make_one_letter_sequence().replace("-", "")
374
+ out[ch] = s
180
375
  return out
181
376
 
182
- def chain_residues(self, polymer_only=True, with_water=False):
183
- """
184
- :param polymer_only, bool
185
- :param with_water:
186
- :return: dict of Three-letter codes of chain residues
187
- """
377
+ def get_subchain(self, subchain_id: str):
378
+ out = None
379
+ for ch in self.MODEL.subchains():
380
+ if ch.subchain_id() == subchain_id:
381
+ out = ch
382
+ break
383
+
384
+ if out is None:
385
+ raise ValueError("Sub-Chain %s not found (only [%s])" % (subchain_id, " ".join(self.subchain_ids)))
188
386
 
189
- out = dict()
190
- for model in self.STRUCT:
191
- for chain in model:
192
- res_codes = []
193
- for r in chain:
194
- if r.is_water():
195
- if with_water:
196
- res_codes.append(r.name)
197
- else:
198
- if polymer_only:
199
- if r.entity_type.name == "Polymer":
200
- res_codes.append(r.name)
201
- else:
202
- res_codes.append(r.name)
203
- out[chain.name] = res_codes
204
387
  return out
205
388
 
389
+ @property
390
+ def subchain_id_to_entity_id(self):
391
+ return {ch: ent.name for ent in self.STRUCT.entities for ch in ent.subchains}
392
+
393
+ @property
394
+ def subchain_id_to_chain_id(self):
395
+ return {sch.subchain_id(): chain.name for chain in self.MODEL for sch in chain.subchains()}
396
+
397
+ def get_chain(self, chain_id: str):
398
+ return self.MODEL[chain_id]
399
+
400
+ def pick_chains(self, chain_names: List[str]):
401
+ struct = gemmi.Structure()
402
+ struct.name = self.STRUCT.name
403
+ model = gemmi.Model(1)
404
+ for ch_id in chain_names:
405
+ model.add_chain(self.get_chain(ch_id))
406
+
407
+ struct.add_model(model)
408
+
409
+ # add basic information
410
+ struct.resolution = self.STRUCT.resolution
411
+
412
+ vals = {"_exptl.method": self.INFO["method"],
413
+ "_struct.title": "(Chains %s): " % " ".join(chain_names) + self.INFO["title"],
414
+ "_pdbx_database_status.recvd_initial_deposition_date": self.INFO["deposition_date"],
415
+ }
416
+ if self.INFO["pdb_id"] != "":
417
+ vals["_entry.id"] = self.INFO["pdb_id"]
418
+
419
+ struct.info = gemmi.InfoMap(vals)
420
+ new_struct = StructureParser(struct)
421
+
422
+ new_struct.INFO["description"] = {ent.name: self.INFO["description"][ent.name]
423
+ for ent in new_struct.STRUCT.entities
424
+ if ent.name in self.INFO["description"]
425
+ }
426
+ new_struct.INFO["source"] = {ent.name: self.INFO["source"][ent.name]
427
+ for ent in new_struct.STRUCT.entities
428
+ if ent.name in self.INFO["source"]
429
+ }
430
+ return new_struct
431
+
432
+ def _raw_marks(self):
433
+ subchain2chain = dict()
434
+ for chain in self.MODEL:
435
+ for sub_chain in chain.subchains():
436
+ subchain_id = sub_chain.subchain_id()
437
+ subchain2chain[subchain_id] = chain.name
438
+
439
+ entity2chains = dict()
440
+ for ent in self.STRUCT.entities:
441
+ val = [subchain2chain[sub_ch] for sub_ch in ent.subchains if sub_ch in subchain2chain]
442
+ if len(val) > 0:
443
+ entity2chains[ent.name] = val
444
+
445
+ mol_id = 1
446
+ n_line = 1
447
+ compound_mol = "COMPND {n_line:>3} MOL_ID: {mol_id};"
448
+ compound_molecule = "COMPND {n_line:>3} MOLECULE: {molecule};"
449
+ compound_chain = "COMPND {n_line:>3} CHAIN: {chain};"
450
+
451
+ outputs = []
452
+
453
+ for ent in self.STRUCT.entities:
454
+ if ent.entity_type.name == "Polymer":
455
+ chain = ", ".join(entity2chains[ent.name])
456
+
457
+ molecule = self.INFO["description"].get(ent.name, "")
458
+ if n_line == 1:
459
+ outputs.append("COMPND MOL_ID: {mol_id};".format(mol_id=mol_id))
460
+ else:
461
+ outputs.append(compound_mol.format(n_line=n_line, mol_id=mol_id))
462
+ n_line += 1
463
+
464
+ outputs.append(compound_molecule.format(n_line=n_line, molecule=molecule))
465
+ n_line += 1
466
+
467
+ outputs.append(compound_chain.format(n_line=n_line, chain=chain))
468
+ n_line += 1
469
+
470
+ mol_id += 1
471
+
472
+ mol_id = 1
473
+ n_line = 1
474
+ source_mol = "SOURCE {n_line:>3} MOL_ID: {mol_id};"
475
+ source_scientific = "SOURCE {n_line:>3} ORGANISM_SCIENTIFIC: {organism_scientific};"
476
+ source_taxid = "SOURCE {n_line:>3} ORGANISM_TAXID: {organism_taxid};"
477
+
478
+ for ent in self.STRUCT.entities:
479
+ if ent.entity_type.name == "Polymer":
480
+ src = self.INFO["source"].get(ent.name)
481
+ if src is None:
482
+ organism_scientific, organism_taxid = "", ""
483
+ else:
484
+ organism_scientific, organism_taxid = src
485
+
486
+ if n_line == 1:
487
+ outputs.append("SOURCE MOL_ID: {mol_id};".format(mol_id=mol_id))
488
+ else:
489
+ outputs.append(source_mol.format(n_line=n_line, mol_id=mol_id))
490
+ n_line += 1
491
+
492
+ outputs.append(source_scientific.format(n_line=n_line, organism_scientific=organism_scientific))
493
+ n_line += 1
494
+
495
+ outputs.append(source_taxid.format(n_line=n_line, organism_taxid=organism_taxid))
496
+ n_line += 1
497
+
498
+ mol_id += 1
499
+
500
+ resolution_remarks = ["REMARK 2",
501
+ "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % self.STRUCT.resolution
502
+ ]
503
+ outputs.extend(resolution_remarks)
504
+ return outputs
505
+
506
+ def to_pdb(self, outfile: str, write_minimal_pdb=False):
507
+ struct = self.STRUCT.clone()
508
+ if write_minimal_pdb:
509
+ struct.write_minimal_pdb(outfile)
510
+ else:
511
+ struct.raw_remarks = self._raw_marks()
512
+ struct.write_pdb(outfile)
513
+
514
+ @staticmethod
515
+ def _item_index(block: gemmi.cif.Block, tag: str):
516
+ mapper = dict()
517
+ for idx, item in enumerate(block):
518
+ if item.loop is not None:
519
+ keys = item.loop.tags
520
+ for k in keys:
521
+ mapper[k] = idx
522
+ elif item.pair is not None:
523
+ key = item.pair[0]
524
+ mapper[key] = idx
525
+ return mapper.get(tag)
526
+
527
+ def to_cif(self, outfile: str):
528
+ block = self.STRUCT.make_mmcif_block()
529
+ #### add resolution
530
+ # block.set_pair(tag="_refine.entry_id", value=gemmi.cif.quote(self.INFO["pdb_id"].upper()))
531
+ # block.set_pair(tag="_refine.pdbx_refine_id", value=gemmi.cif.quote(self.INFO["method"].upper()))
532
+ block.set_pair(tag="_refine.ls_d_res_high", value=gemmi.cif.quote(str(self.INFO["resolution"])))
533
+
534
+ # tag_names = ["_exptl.entry_id",
535
+ # "_refine.entry_id", "_refine.pdbx_refine_id",
536
+ # "_refine.ls_d_res_high"]
537
+ # for i in range(1, len(tag_names)):
538
+ # idx_1a = self._item_index(block, tag=tag_names[i])
539
+ # idx_2a = self._item_index(block, tag=tag_names[i - 1])
540
+ # block.move_item(idx_1a, idx_2a + 1)
541
+
542
+ #### add entity description
543
+ ta = block.find_mmcif_category(category="_entity.")
544
+ da = pd.DataFrame(list(ta), columns=list(ta.tags))
545
+ da["_entity.pdbx_description"] = da["_entity.id"].apply(
546
+ lambda i: gemmi.cif.quote(self.INFO["description"].get(i, "?")))
547
+
548
+ rows_1 = da.to_numpy().tolist()
549
+ tags_1 = [s.replace("_entity.", "") for s in da.columns.tolist()]
550
+
551
+ # erase
552
+ qitem = block.find_loop_item("_entity.id")
553
+ if isinstance(qitem, gemmi.cif.Item):
554
+ qitem.erase()
555
+
556
+ # add
557
+ loop_1 = block.init_loop(prefix="_entity.", tags=tags_1)
558
+ for r in rows_1:
559
+ loop_1.add_row(r)
560
+
561
+ idx_1b = self._item_index(block, tag="_entity.id")
562
+ idx_2b = self._item_index(block, tag="_entity_poly.entity_id")
563
+
564
+ # place _entity. before _entity_poly.
565
+ if isinstance(idx_1b, int) and isinstance(idx_2b, int):
566
+ block.move_item(idx_1b, idx_2b - 1)
567
+
568
+ #### add source name and taxid
569
+ loop_2 = block.init_loop(prefix="_entity_src_gen.", tags=["entity_id",
570
+ "pdbx_gene_src_scientific_name",
571
+ "pdbx_gene_src_ncbi_taxonomy_id"])
572
+
573
+ for k, (name, taxid) in self.INFO["source"].items():
574
+ name = name if name != "" else "?"
575
+ taxid = taxid if taxid != "" else "?"
576
+
577
+ loop_2.add_row([gemmi.cif.quote(k),
578
+ gemmi.cif.quote(name),
579
+ gemmi.cif.quote(taxid)]
580
+ )
581
+
582
+ idx_1c = self._item_index(block, tag="_entity_src_gen.entity_id")
583
+ idx_2c = self._item_index(block, tag="_entity_poly_seq.entity_id")
584
+ # place _entity_src_gen. after _entity_poly_seq.
585
+ if isinstance(idx_1c, int) and isinstance(idx_2c, int):
586
+ block.move_item(idx_1c, idx_2c + 1)
587
+
588
+ block.write_file(outfile)
589
+
206
590
  def update_entity(self):
207
591
  """
208
592
  Update ENTITY, .entities .assemblies according to subchains
209
593
  :return:
210
594
  """
211
- subchains = []
212
- for model in self.STRUCT:
213
- for chain in model:
214
- subchains.extend([sc.subchain_id() for sc in chain.subchains()])
595
+ subchains = self.subchain_ids
215
596
 
216
597
  # update .entities
217
598
  new_entities = gemmi.EntityList()
@@ -224,15 +605,9 @@ class StructureParser(object):
224
605
  ent_names.append(ent.name)
225
606
  self.STRUCT.entities = new_entities
226
607
 
227
- # update .ENTITY
228
- for super_key in ["eid2desc", "eid2specie", "eid2taxid"]:
229
- for eid in list(self.ENTITY[super_key].keys()):
230
- if eid not in ent_names:
231
- del self.ENTITY[super_key][eid]
232
-
233
- for cid, eid in list(self.ENTITY["polymer2eid"].items()):
234
- if eid not in ent_names or cid not in self.chain_ids:
235
- del self.ENTITY["polymer2eid"][cid]
608
+ # update INFO
609
+ self.INFO["description"] = {k: v for k, v in self.INFO["description"].items() if k in ent_names}
610
+ self.INFO["source"] = {k: v for k, v in self.INFO["source"].items() if k in ent_names}
236
611
 
237
612
  # update .assemblies
238
613
  all_cid = self.chain_ids
@@ -262,189 +637,257 @@ class StructureParser(object):
262
637
  for dai in del_assembly_indexes:
263
638
  del self.STRUCT.assemblies[dai]
264
639
 
265
- @typechecked
266
640
def rename_chain(self, origin_name: str, target_name: str):
    """
    Rename one chain and apply the same renaming to the assembly generators.

    :param origin_name: current chain name, must be one of self.chain_ids
    :param target_name: new chain name, must not be used by any other chain
    :return:
        None
    :raises ValueError: if origin_name is unknown, or target_name is already
        used by a chain other than origin_name
    """
    current_names = self.chain_ids
    if origin_name not in current_names:
        raise ValueError("Chain %s not found" % origin_name)

    # Renaming a chain to its own name is accepted; only a clash with a
    # different chain is rejected.
    if target_name != origin_name and target_name in current_names:
        raise ValueError("Chain %s has existed, please set a different target_name." % target_name)

    self.STRUCT.rename_chain(origin_name, target_name)

    # Assembly generators refer to chains by name, so substitute the name there too.
    for assembly in self.STRUCT.assemblies:
        for generator in assembly.generators:
            renamed = []
            for name in generator.chains:
                renamed.append(target_name if name == origin_name else name)
            generator.chains = renamed
287
655
 
288
- @typechecked
289
- def switch_chain_names(self, chain_name_1: str, chain_name_2: str):
656
def swap_chain_names(self, chain_name_1: str, chain_name_2: str):
    """
    Exchange the names of two chains.

    The swap is done with three renames that go through a temporary chain
    name which is not used in the structure.

    :param chain_name_1: name of the first chain
    :param chain_name_2: name of the second chain
    :return:
        None
    :raises ValueError: if either name is not in self.chain_ids
    """
    existing = set(self.chain_ids)
    if chain_name_1 not in existing:
        raise ValueError("Chain %s not found" % chain_name_1)
    if chain_name_2 not in existing:
        raise ValueError("Chain %s not found" % chain_name_2)

    # Swapping a chain with itself is a no-op. Without this guard the first
    # rename moves the chain to the temporary name, the second rename then
    # fails, and the chain is left stuck under the temporary name.
    if chain_name_1 == chain_name_2:
        return

    # Draw random 4-character names until one does not clash with a chain.
    characters = string.ascii_letters + string.digits
    sw_name = "".join(random.choices(characters, k=4))
    while sw_name in existing:
        sw_name = "".join(random.choices(characters, k=4))

    self.rename_chain(chain_name_1, sw_name)
    self.rename_chain(chain_name_2, chain_name_1)
    self.rename_chain(sw_name, chain_name_2)
304
672
 
305
- @typechecked
306
- def pick_chains(self, chain_names: List[str]):
307
- self.set_default_model()
673
def make_one_letter_chain(self, only_uppercase: bool = True):
    """
    Rename every chain whose name is not an allowed single character.

    Allowed characters are A-Z when only_uppercase is True, otherwise A-Z,
    a-z and 0-9. Chains that already have an allowed name are kept. The
    others receive, in chain order, the first allowed characters not already
    used as a chain name (A-Z first, then a-z, then 0-9).

    :param only_uppercase: restrict target names to uppercase letters
    :return:
        dict, original chain name -> new one-letter name (renamed chains only)
    :raises RuntimeError: if there are more chains than allowed characters
    """
    if only_uppercase:
        candidates = list(string.ascii_uppercase)
        msg = "The number of chains exceed the number of uppercase letters: %d > %d"
    else:
        candidates = list(string.ascii_uppercase + string.ascii_lowercase + string.digits)
        msg = "The number of chains exceed the number of one-letter characters: %d > %d"

    chain_ids = list(self.chain_ids)
    if len(chain_ids) > len(candidates):
        raise RuntimeError(msg % (len(chain_ids), len(candidates)))

    # Sets are used for membership on purpose: testing against the alphabet
    # string would be a substring test and would accept names such as "AB".
    allowed = set(candidates)
    taken = set(chain_ids)
    free_letters = (letter for letter in candidates if letter not in taken)

    # The size check above guarantees enough free letters for every chain
    # that needs a new name, so next() cannot run out.
    mapper = {ch: next(free_letters) for ch in chain_ids if ch not in allowed}

    for origin_name, target_name in mapper.items():
        self.rename_chain(origin_name, target_name)
    return mapper
329
704
 
330
- @typechecked
331
- def get_assembly(self, assembly_name: str):
705
def get_assembly(self, assembly_name: str,
                 how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
    """
    Build a new StructureParser that holds the requested assembly.

    A clone of the structure is transformed to the assembly and its title is
    prefixed with the assembly name. Entity descriptions and sources are
    copied from this object for every new entity that exactly matches an
    original entity.

    :param assembly_name: name of the assembly, must be in self.assembly_names
    :param how: naming policy for copied chains, passed to transform_to_assembly
    :return:
        StructureParser
    :raises ValueError: if assembly_name is not in self.assembly_names
    """
    if assembly_name not in self.assembly_names:
        raise ValueError("Assembly %s not found (only [%s])" % (assembly_name, ", ".join(self.assembly_names)))

    struct = self.STRUCT.clone()
    struct.transform_to_assembly(assembly_name, how)
    title_key = "_struct.title"
    struct.info[title_key] = "(Assembly %s): " % assembly_name + struct.info[title_key]

    new_struct = StructureParser(struct)

    def _is_same_entity(candidate, reference):
        # Entities match when they share the entity type and the same sequence:
        # the full sequence for polymers, otherwise the sequence extracted
        # from the first subchain of each entity.
        if not candidate.entity_type == reference.entity_type:
            return False
        if reference.entity_type.name == "Polymer":
            return candidate.full_sequence == reference.full_sequence
        candidate_seq = new_struct.get_subchain(candidate.subchains[0]).extract_sequence()
        reference_seq = self.get_subchain(reference.subchains[0]).extract_sequence()
        return candidate_seq == reference_seq

    # new entity name -> name of the first matching original entity
    entity_mapper = dict()
    for new_ent in new_struct.STRUCT.entities:
        for ent in self.STRUCT.entities:
            if _is_same_entity(new_ent, ent):
                entity_mapper[new_ent.name] = ent.name
                break

    # carry over description and source of the matched entities
    desc = dict()
    src = dict()
    for ent in new_struct.STRUCT.entities:
        if ent.name not in entity_mapper:
            continue
        origin_name = entity_mapper[ent.name]
        if origin_name in self.INFO["description"]:
            desc[ent.name] = self.INFO["description"][origin_name]
        if origin_name in self.INFO["source"]:
            src[ent.name] = self.INFO["source"][origin_name]

    new_struct.INFO["description"] = desc
    new_struct.INFO["source"] = src
    return new_struct
746
+
747
def clean_structure(self, remove_ligand=True):
    """
    Remove waters (always) and ligands (optionally), then drop empty chains
    and refresh the entity information.

    :param remove_ligand: if True, remove ligands together with waters;
        if False, remove waters only and keep ligands
    :return:
        None
    """
    # The flag used to be named keep_ligand; the branches below follow the
    # current name, so remove_ligand=True really removes the ligands.
    if remove_ligand:
        self.STRUCT.remove_ligands_and_waters()
    else:
        self.STRUCT.remove_waters()

    self.STRUCT.remove_empty_chains()
    self.update_entity()
427
761
 
428
- def clean_structure(self, keep_ligand=True):
762
def met_to_mse(self):
    """
    Convert every MET residue of self.MODEL to MSE in place.

    The residue is renamed to MSE and each of its atoms named SD is renamed
    to SE with the element set to Se.

    :return:
        None
    """
    for chain in self.MODEL:
        for residue in chain:
            if residue.name != 'MET':
                continue
            residue.name = 'MSE'
            for atom in residue:
                if atom.name != 'SD':
                    continue
                atom.name = 'SE'
                atom.element = gemmi.Element('Se')
771
+
772
def get_atoms(self, arg: str = "*"):
    """
    Collect the atoms matched by a gemmi selection into a structured array.

    :param arg: str, selection string passed to gemmi.Selection,
        e.g. "*", "/1/*//N,CA,C,O", "/1/*"
    :return:
        np.ndarray, one record per selected atom with the fields chain_name,
        residue_num, residue_icode, residue_name, atom_name, element, charge,
        b_factor, occupancy and coordinate (3 values)
    """
    record_dtype = [("chain_name", "U5"),
                    ("residue_num", "i4"),
                    ("residue_icode", "U3"),
                    ("residue_name", "U5"),
                    ("atom_name", "U5"),
                    ("element", "U3"),
                    ("charge", "i1"),
                    ("b_factor", "f4"),
                    ("occupancy", "f4"),
                    ("coordinate", ("f4", (3,)))
                    ]

    selection = gemmi.Selection(arg)

    # Walk model -> chain -> residue -> atom, keeping only what the selection matches.
    records = [
        (chain.name,
         residue.seqid.num,
         residue.seqid.icode,
         residue.name,
         atom.name,
         atom.element.name,
         atom.charge,
         atom.b_iso,
         atom.occ,
         tuple(atom.pos.tolist()),
         )
        for model in selection.models(self.STRUCT)
        for chain in selection.chains(model)
        for residue in selection.residues(chain)
        for atom in selection.atoms(residue)
    ]
    return np.array(records, dtype=record_dtype)
812
+
813
def polymer_interface_residues(self,
                               chains_x: List[str],
                               chains_y: List[str],
                               threshold: float = 4.5):
    """
    Find the residues in contact between two groups of polymer chains.

    Only a fixed set of N, O, P and S atom names (protein and nucleic acid)
    is considered. A residue is reported when one of these atoms lies within
    threshold of such an atom of the other group. The search runs on a deep
    copy of this object after clean_structure(remove_ligand=True).

    :param chains_x: chain names of the first group
    :param chains_y: chain names of the second group
    :param threshold: maximum atom-atom distance counted as a contact
    :return:
        interface residues of chains_x, interface residues of chains_y,
        each a unique structured array with the fields chain_name,
        residue_num, residue_icode and residue_name
    :raises ValueError: if a chain is not found or is not a polymer
    """
    known_chains = self.chain_ids
    for ch in chains_x + chains_y:
        if ch not in known_chains:
            raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(self.chain_ids)))
        if ch not in self.polymer_types:
            raise ValueError("Chain %s is not a polymer (only [%s])"
                             % (ch, " ".join(list(self.polymer_types.keys())))
                             )

    # N and O of the backbone plus N, O, S of side chains (protein) and
    # N, O, P of nucleic acids; used only for the interface search.
    protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
                     'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
                     'SD', 'SG']
    xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
                 'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
                 'OP1', 'OP2', 'OP3', 'P']
    atom_names = ",".join(protein_atoms + xna_atoms)
    residue_fields = ["chain_name", "residue_num", "residue_icode", "residue_name"]

    # Work on a copy so that cleaning does not modify this object.
    cleaned = deepcopy(self)
    cleaned.clean_structure(remove_ligand=True)

    atoms_x = cleaned.get_atoms("/1/%s//%s" % (",".join(chains_x), atom_names))
    atoms_y = cleaned.get_atoms("/1/%s//%s" % (",".join(chains_y), atom_names))

    tree_x = cKDTree(atoms_x["coordinate"])
    tree_y = cKDTree(atoms_y["coordinate"])

    # Rows index atoms of group x, columns index atoms of group y.
    contacts = tree_x.sparse_distance_matrix(tree_y, threshold, output_type='coo_matrix')
    x_res = np.unique(atoms_x[contacts.row][residue_fields])
    y_res = np.unique(atoms_y[contacts.col][residue_fields])

    return x_res, y_res
859
+
860
def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
    """
    Run polymer_interface_residues on every pair of polymer chains.

    Pairs are processed sequentially when only one worker is usable or when
    there are fewer than 50 pairs; otherwise they are dispatched with joblib.

    :param ppi_threshold: distance threshold passed to polymer_interface_residues
    :param n_cpus: maximum number of parallel jobs
    :return:
        dict, "chain_1/chain_2" -> [residues of chain_1, residues of chain_2],
        each residue formatted as "chain/num/icode/name"; pairs without any
        contact are left out
    """
    polymer_chains = list(self.polymer_types.keys())
    ch_pairs = sorted(itertools.combinations(polymer_chains, r=2))

    def _as_labels(residues):
        # one "chain/num/icode/name" label per residue record
        return ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in residues.tolist()]

    def _run(ch_1, ch_2):
        res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
        if not len(res_x) > 0:
            return dict()
        return {"%s/%s" % (ch_1, ch_2): [_as_labels(res_x), _as_labels(res_y)]}

    # never more workers than pairs, and at least one
    cpu2use = max(min(n_cpus, len(ch_pairs)), 1)

    if cpu2use == 1 or len(ch_pairs) < 50:
        results = [_run(ch_1, ch_2) for ch_1, ch_2 in ch_pairs]
    else:
        results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)

    outputs = dict()
    for item in results:
        outputs.update(item)
    return outputs