gemmi-protools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of gemmi-protools has been flagged as potentially problematic.
@@ -0,0 +1,387 @@
+ #!/usr/bin/env python
+ # Copyright 2004 Kristian Rother.
+ # Revisions copyright 2004 Thomas Hamelryck.
+ # Revisions copyright 2024 James Krieger.
+ #
+ # This file is part of the Biopython distribution and governed by your
+ # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+ # Please see the LICENSE file that should have been included as part of this
+ # package.
+
+ """Parse header of PDB files into a python dictionary.
+
+ Emerged from the Columba database project www.columba-db.de, original author
+ Kristian Rother.
+
+ Modified _parse_pdb_header_list: do not perform lower() on chain IDs.
+ By Luo Jiejian
+ """
+
+ import re
+ from collections import defaultdict
+
+ from Bio import File
+
+
+ def _get_biomoltrans(inl):
+     # REMARK 350
+     # REMARK 350 COORDINATES FOR A COMPLETE MULTIMER REPRESENTING THE KNOWN
+     # REMARK 350 BIOLOGICALLY SIGNIFICANT OLIGOMERIZATION STATE OF THE
+     # REMARK 350 MOLECULE CAN BE GENERATED BY APPLYING BIOMT TRANSFORMATIONS
+     # REMARK 350 GIVEN BELOW. BOTH NON-CRYSTALLOGRAPHIC AND
+     # REMARK 350 CRYSTALLOGRAPHIC OPERATIONS ARE GIVEN.
+     # REMARK 350
+     # REMARK 350 BIOMOLECULE: 1
+     # REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: MONOMERIC
+     # REMARK 350 APPLY THE FOLLOWING TO CHAINS: A
+     # REMARK 350 BIOMT1 1 1.000000 0.000000 0.000000 0.00000
+     # REMARK 350 BIOMT2 1 0.000000 1.000000 0.000000 0.00000
+     # REMARK 350 BIOMT3 1 0.000000 0.000000 1.000000 0.00000
+     biomolecule = defaultdict(list)
+     for line in inl:
+         if line.startswith("REMARK 350"):
+             if line[11:23] == "BIOMOLECULE:":
+                 currentBiomolecule = line.split()[-1]
+                 applyToChains = []
+             elif (
+                 line[11:41] == "APPLY THE FOLLOWING TO CHAINS:"
+                 or line[30:41] == "AND CHAINS:"
+             ):
+                 applyToChains.extend(
+                     line[41:].replace(" ", "").strip().strip(",").split(",")
+                 )
+             elif line[13:18] == "BIOMT":
+                 biomt = biomolecule[currentBiomolecule]
+                 if line[13:19] == "BIOMT1":
+                     if applyToChains == []:
+                         applyToChains = biomt[0]
+                     biomt.append(applyToChains)
+                 elif line[13:19]:
+                     applyToChains = []
+                 biomt.append(line[23:])
+     return dict(biomolecule)
+
+
+ def _get_journal(inl):
+     # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7
+     journal = ""
+     for line in inl:
+         if re.search(r"\AJRNL", line):
+             journal += line[19:72].lower()
+     journal = re.sub(r"\s\s+", " ", journal)
+     return journal
+
+
+ def _get_references(inl):
+     # REMARK 1 REFERENCE 1 1CSE 11
+     # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12
+     references = []
+     actref = ""
+     for line in inl:
+         if re.search(r"\AREMARK   1", line):
+             if re.search(r"\AREMARK   1 REFERENCE", line):
+                 if actref != "":
+                     actref = re.sub(r"\s\s+", " ", actref)
+                     if actref != " ":
+                         references.append(actref)
+                     actref = ""
+             else:
+                 actref += line[19:72].lower()
+
+     if actref != "":
+         actref = re.sub(r"\s\s+", " ", actref)
+         if actref != " ":
+             references.append(actref)
+     return references
+
+
+ # bring dates to format: 1909-01-08
+ def _format_date(pdb_date):
+     """Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE)."""
+     date = ""
+     year = int(pdb_date[7:])
+     if year < 50:
+         century = 2000
+     else:
+         century = 1900
+     date = str(century + year) + "-"
+     all_months = [
+         "xxx",
+         "Jan",
+         "Feb",
+         "Mar",
+         "Apr",
+         "May",
+         "Jun",
+         "Jul",
+         "Aug",
+         "Sep",
+         "Oct",
+         "Nov",
+         "Dec",
+     ]
+     month = str(all_months.index(pdb_date[3:6]))
+     if len(month) == 1:
+         month = "0" + month
+     date = date + month + "-" + pdb_date[:2]
+     return date
+
+
+ def _chop_end_codes(line):
+     """Chops lines ending with ' 1CSA 14' and the like (PRIVATE)."""
+     return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)
+
+
+ def _chop_end_misc(line):
+     """Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE)."""
+     return re.sub(r"\s+\d\d-\w\w\w-\d\d\s+[1-9][0-9A-Z]{3}\s*\Z", "", line)
+
+
+ def _nice_case(line):
+     """Make A Lowercase String With Capitals (PRIVATE)."""
+     line_lower = line.lower()
+     s = ""
+     i = 0
+     nextCap = 1
+     while i < len(line_lower):
+         c = line_lower[i]
+         if c >= "a" and c <= "z" and nextCap:
+             c = c.upper()
+             nextCap = 0
+         elif c in " .,;:\t-_":
+             nextCap = 1
+         s += c
+         i += 1
+     return s
+
+
+ def parse_pdb_header(infile):
+     """Return the header lines of a pdb file as a dictionary.
+
+     Dictionary keys are: head, deposition_date, release_date, structure_method,
+     resolution, structure_reference, journal_reference, author and
+     compound.
+     """
+     header = []
+     with File.as_handle(infile) as f:
+         for line in f:
+             record_type = line[0:6]
+             if record_type in ("ATOM  ", "HETATM", "MODEL "):
+                 break
+             header.append(line)
+     return _parse_pdb_header_list(header)
+
+
+ def _parse_remark_465(line):
+     """Parse missing residue remarks.
+
+     Returns a dictionary describing the missing residue.
+     The specification for REMARK 465 at
+     http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465
+     only gives templates, but does not say they have to be followed.
+     So we assume that not all pdb-files with a REMARK 465 can be understood.
+
+     Returns a dictionary with the following keys:
+     "model", "res_name", "chain", "ssseq", "insertion"
+     """
+     if line:
+         # Note that line has been stripped.
+         assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped"
+     pattern = re.compile(
+         r"""
+         (\d+\s[\sA-Z][\sA-Z][A-Z] |  # Either model number + residue name
+             [A-Z]{1,3})              # Or only residue name with 1 (RNA) to 3 letters
+         \s ([A-Za-z0-9])             # A single character chain
+         \s+(-?\d+[A-Za-z]?)$         # Residue number: A digit followed by an optional
+                                      # insertion code (Hetero-flags make no sense in
+                                      # context with missing res)
+         """,
+         re.VERBOSE,
+     )
+     match = pattern.match(line)
+     if match is None:
+         return None
+     residue = {}
+     if " " in match.group(1):
+         model, residue["res_name"] = match.group(1).split()
+         residue["model"] = int(model)
+     else:
+         residue["model"] = None
+         residue["res_name"] = match.group(1)
+     residue["chain"] = match.group(2)
+     try:
+         residue["ssseq"] = int(match.group(3))
+     except ValueError:
+         residue["insertion"] = match.group(3)[-1]
+         residue["ssseq"] = int(match.group(3)[:-1])
+     else:
+         residue["insertion"] = None
+     return residue
+
+
+ def _parse_pdb_header_list(header):
+     # database fields
+     pdbh_dict = {
+         "name": "",
+         "head": "",
+         "idcode": "",
+         "deposition_date": "1909-01-08",
+         "release_date": "1909-01-08",
+         "structure_method": "unknown",
+         "resolution": None,
+         "structure_reference": "unknown",
+         "journal_reference": "unknown",
+         "author": "",
+         "compound": {"1": {"misc": ""}},
+         "source": {"1": {"misc": ""}},
+         "has_missing_residues": False,
+         "missing_residues": [],
+         "biomoltrans": [],
+     }
+
+     pdbh_dict["structure_reference"] = _get_references(header)
+     pdbh_dict["journal_reference"] = _get_journal(header)
+     pdbh_dict["biomoltrans"] = _get_biomoltrans(header)
+     comp_molid = "1"
+     last_comp_key = "misc"
+     last_src_key = "misc"
+
+     for hh in header:
+         h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
+         # key=re.sub("\s.+\s*","",h)
+         key = h[:6].strip()
+         # tail=re.sub("\A\w+\s+\d*\s*","",h)
+         tail = h[10:].strip()
+         # print("%s:%s" % (key, tail))
+
+         # From here, all the keys from the header are being parsed
+         if key == "TITLE":
+             name = _chop_end_codes(tail).lower()
+             pdbh_dict["name"] = " ".join([pdbh_dict["name"], name]).strip()
+         elif key == "HEADER":
+             rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
+             if rr is not None:
+                 pdbh_dict["deposition_date"] = _format_date(_nice_case(rr.group()))
+             rr = re.search(r"\s+([1-9][0-9A-Z]{3})\s*\Z", tail)
+             if rr is not None:
+                 pdbh_dict["idcode"] = rr.group(1)
+             head = _chop_end_misc(tail).lower()
+             pdbh_dict["head"] = head
+         elif key == "COMPND":
+             # LJJ
+             # tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
+             tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail))
+             # look for E.C. numbers in COMPND lines
+             rec = re.search(r"\d+\.\d+\.\d+\.\d+", tt)
+             if rec:
+                 pdbh_dict["compound"][comp_molid]["ec_number"] = rec.group()
+                 tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
+             tok = tt.split(":")
+             if len(tok) >= 2:
+                 # lower ckey LJJ
+                 ckey = tok[0].lower()
+                 # ckey = tok[0]
+                 cval = re.sub(r"\A\s*", "", tok[1])
+                 if ckey == "mol_id":
+                     # mol_id, keep original, usually digital string
+                     pdbh_dict["compound"][cval] = {"misc": ""}
+                     comp_molid = cval
+                     last_comp_key = "misc"
+                 else:
+                     # add two lines, lower all except chain value LJJ
+                     if ckey != "chain":
+                         cval = cval.lower()
+
+                     pdbh_dict["compound"][comp_molid][ckey] = cval
+                     last_comp_key = ckey
+             else:
+                 # pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0] + " "
+                 # concat and lower LJJ
+                 pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0].lower() + " "
+         elif key == "SOURCE":
+             tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
+             tok = tt.split(":")
+             # print(tok)
+             if len(tok) >= 2:
+                 ckey = tok[0]
+                 cval = re.sub(r"\A\s*", "", tok[1])
+                 if ckey == "mol_id":
+                     pdbh_dict["source"][cval] = {"misc": ""}
+                     comp_molid = cval
+                     last_src_key = "misc"
+                 else:
+                     pdbh_dict["source"][comp_molid][ckey] = cval
+                     last_src_key = ckey
+             else:
+                 pdbh_dict["source"][comp_molid][last_src_key] += tok[0] + " "
+         elif key == "KEYWDS":
+             kwd = _chop_end_codes(tail).lower()
+             if "keywords" in pdbh_dict:
+                 pdbh_dict["keywords"] += " " + kwd
+             else:
+                 pdbh_dict["keywords"] = kwd
+         elif key == "EXPDTA":
+             expd = _chop_end_codes(tail)
+             # chop junk at end of lines for some structures
+             expd = re.sub(r"\s\s\s\s\s\s\s.*\Z", "", expd)
+             # if re.search('\Anmr',expd,re.IGNORECASE): expd='nmr'
+             # if re.search('x-ray diffraction',expd,re.IGNORECASE): expd='x-ray diffraction'
+             pdbh_dict["structure_method"] = expd.lower()
+         elif key == "CAVEAT":
+             # make Annotation entries out of these!!!
+             pass
+         elif key == "REVDAT":
+             rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
+             if rr is not None:
+                 pdbh_dict["release_date"] = _format_date(_nice_case(rr.group()))
+         elif key == "JRNL":
+             # print("%s:%s" % (key, tail))
+             if "journal" in pdbh_dict:
+                 pdbh_dict["journal"] += tail
+             else:
+                 pdbh_dict["journal"] = tail
+         elif key == "AUTHOR":
+             auth = _nice_case(_chop_end_codes(tail))
+             if "author" in pdbh_dict:
+                 pdbh_dict["author"] += auth
+             else:
+                 pdbh_dict["author"] = auth
+         elif key == "REMARK":
+             if re.search("REMARK   2 RESOLUTION.", hh):
+                 r = _chop_end_codes(re.sub("REMARK   2 RESOLUTION.", "", hh))
+                 r = re.sub(r"\s+ANGSTROM.*", "", r)
+                 try:
+                     pdbh_dict["resolution"] = float(r)
+                 except ValueError:
+                     # print('nonstandard resolution %r' % r)
+                     pdbh_dict["resolution"] = None
+             elif hh.startswith("REMARK 465"):
+                 if tail:
+                     pdbh_dict["has_missing_residues"] = True
+                     missing_res_info = _parse_remark_465(tail)
+                     if missing_res_info:
+                         pdbh_dict["missing_residues"].append(missing_res_info)
+             elif hh.startswith("REMARK  99 ASTRAL"):
+                 if tail:
+                     remark_99_keyval = tail.replace("ASTRAL ", "").split(": ")
+                     if (
+                         isinstance(remark_99_keyval, list)
+                         and len(remark_99_keyval) == 2
+                     ):
+                         if "astral" not in pdbh_dict:
+                             pdbh_dict["astral"] = {
+                                 remark_99_keyval[0]: remark_99_keyval[1]
+                             }
+                         else:
+                             pdbh_dict["astral"][remark_99_keyval[0]] = remark_99_keyval[1]
+         else:
+             # print(key)
+             pass
+     if pdbh_dict["structure_method"] == "unknown":
+         res = pdbh_dict["resolution"]
+         if res is not None and res > 0.0:
+             pdbh_dict["structure_method"] = "x-ray diffraction"
+     return pdbh_dict
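
The file above is a vendored, lightly modified copy of Biopython's Bio.PDB.parse_pdb_header module. Its public entry point is parse_pdb_header(infile), which accepts a file name or an open handle and returns a plain dictionary of header fields. A minimal usage sketch follows; the import path gemmi_protools.pdb_header and the file name 1abc.pdb are assumptions, since the diff does not show where this module sits in the package:

    # Illustrative sketch only: the module path and the input file are assumed, not taken from the diff.
    from gemmi_protools.pdb_header import parse_pdb_header

    header = parse_pdb_header("1abc.pdb")            # a file name or an open handle both work
    print(header["idcode"], header["resolution"])    # 4-character PDB id and a float (or None)
    print(header["deposition_date"])                 # dates are normalized to YYYY-MM-DD
    for res in header["missing_residues"]:           # collected from REMARK 465 records
        print(res["chain"], res["ssseq"], res["res_name"])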
@@ -0,0 +1,279 @@
+ """
+ @Author: Luo Jiejian
+ """
+ import pathlib
+ from collections import defaultdict, Counter
+ from typing import Union, Optional, Dict, List
+
+ import gemmi
+ from typeguard import typechecked
+
+ from .cif_opts import _cif_entity_info, _is_cif, _get_cif_resolution
+ from .pdb_opts import _pdb_entity_info, _is_pdb, _get_pdb_resolution
+ from .struct_info import Entity
+
+
+ @typechecked
+ def _ent_from_structure(struct: gemmi.Structure) -> Entity:
+     """
+     Run .setup_entities() in advance
+     :param struct:
+     :return:
+     """
+     block = struct.make_mmcif_block()
+     ent_info = _cif_entity_info(block)
+     for ent in struct.entities:
+         if ent.name not in ent_info["eid2desc"]:
+             ent_info["eid2desc"][ent.name] = ent.name
+     return ent_info
+
+
+ @typechecked
+ def cif_parser(path: Union[str, pathlib.Path]):
+     """
+     Parse .cif or .cif.gz
+     :param path:
+     :return: (gemmi.Structure, entity)
+     """
+     if _is_cif(path):
+         doc = gemmi.cif.read(str(path))
+         block0 = doc.sole_block()
+         struct = gemmi.read_structure(str(path))
+         struct.setup_entities()
+         # a sheet_id like 1' can trigger strange errors and
+         # result in sheets with 0 strands;
+         # delete such zero-strand sheets here
+
+         zero_sheet_ind = []
+         for i, sheet in enumerate(struct.sheets):
+             if len(sheet.strands) == 0:
+                 zero_sheet_ind.append(i)
+
+         if zero_sheet_ind:
+             zero_sheet_ind.sort(reverse=True)
+             for i in zero_sheet_ind:
+                 del struct.sheets[i]
+
+         # gemmi may fail to parse the resolution correctly, so update it here
+         struct.resolution = _get_cif_resolution(block0)
+
+         # entity information from the cif block
+         ent_0 = _cif_entity_info(block0)
+
+         # entity information derived from the structure
+         ent_1 = _ent_from_structure(struct)
+
+         # update ent_0 with ent_1
+         for super_key in ["eid2desc", "polymer2eid"]:
+             for key, val in ent_1[super_key].items():
+                 if key not in ent_0[super_key]:
+                     ent_0[super_key][key] = val
+         return struct, ent_0
+     else:
+         raise ValueError("Only .cif or .cif.gz files are supported, but got %s" % path)
+
+
+ @typechecked
+ def _assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
+     """
+     Run .setup_entities() in advance
+     :param structure:
+     :return:
+     """
+     # rename entity names to numbers if they are not digits already
+     not_digit_name = False
+     for ent in structure.entities:
+         if not ent.name.isdigit():
+             not_digit_name = True
+             break
+
+     if not_digit_name:
+         mapper = dict()
+         for ix, ent in enumerate(structure.entities):
+             new_name = str(ix + 1)
+             mapper[ent.name] = new_name
+             ent.name = new_name
+         return mapper
+     else:
+         return None
+
+
+ @typechecked
+ def _update_entity_names(entity: Entity, mapper: Dict[str, str]):
+     """
+     Update entity names to new ones in eid2desc, eid2specie, eid2taxid in place.
+     :param entity:
+     :param mapper: {old_entity_name: new_entity_name}
+     :return:
+     """
+     for super_key in ['eid2desc', 'eid2specie', 'eid2taxid']:
+         tmp = dict()
+         for key in entity[super_key]:
+             tmp[mapper[key]] = entity[super_key][key]
+         entity.__setattr__(super_key, tmp)
+
+
+ @typechecked
+ def pdb_parser(path: Union[str, pathlib.Path]):
+     """
+     Parse .pdb or .pdb.gz
+     :param path:
+     :return: (gemmi.Structure, entity)
+     """
+     if _is_pdb(path):
+         struct = gemmi.read_structure(str(path))
+         struct.resolution = _get_pdb_resolution(struct.raw_remarks)
+         ent_0 = _pdb_entity_info(path)
+
+         struct.setup_entities()
+
+         # PDB entries may list chains as "A,B,C";
+         # after setup_entities(), such entities are merged
+         block = struct.make_mmcif_block()
+         ent_t = _cif_entity_info(block)
+         rec = defaultdict(list)
+         for cn, middle_eid in ent_t.polymer2eid.items():
+             rec[middle_eid].append(cn)
+
+         # guard against None, returned when all entity names are digits already
+         _mapper = _assign_digital_entity_names(struct) or {}
+         _mapper_n = dict()
+         for middle_eid, new_eid in _mapper.items():
+             old_eid = str(",".join(rec[middle_eid]))
+             _mapper_n[old_eid] = new_eid
+
+         if _mapper_n:
+             _update_entity_names(ent_0, _mapper_n)
+
+         ent_1 = _ent_from_structure(struct)
+
+         # update ent_0 with ent_1
+         for super_key in ["eid2desc", "polymer2eid"]:
+             for key, val in ent_1[super_key].items():
+                 if key not in ent_0[super_key]:
+                     ent_0[super_key][key] = val
+         return struct, ent_0
+     else:
+         raise ValueError("Only .pdb or .pdb.gz files are supported, but got %s" % path)
+
+
+ @typechecked
+ def _chain_type(structure: gemmi.Structure, chain_id: str) -> str:
+     out = None
+     values = {"PeptideL": "protein",
+               "Dna": "dna",
+               "Rna": "rna"}
+
+     for model in structure:
+         for cur_chain in model:
+             if cur_chain.name == chain_id:
+                 sc_types = set()
+                 for sc in cur_chain.subchains():
+                     t = sc.check_polymer_type().name
+                     if t != "Unknown":
+                         sc_types.update({t})
+
+                 if len(sc_types) == 1:
+                     out = sc_types.pop()
+                 else:
+                     out = "Unknown"
+     if out is None:
+         raise RuntimeError("chain_id %s not in structure" % chain_id)
+     else:
+         return values.get(out, "other")
+
+
+ @typechecked
+ def _get_model_chain_names(model: gemmi.Model) -> List[str]:
+     vals = []
+     for ch in model:
+         vals.append(ch.name)
+     return vals
+
+
+ @typechecked
+ def _assert_unique_chain_names_in_models(structure: gemmi.Structure):
+     for model in structure:
+         names = _get_model_chain_names(model)
+         nums = Counter(names)
+         dup_names = [k for k, v in nums.items() if v > 1]
+
+         if dup_names:
+             raise RuntimeError("Duplicate chain names in model %d: %s" % (model.num, ",".join(dup_names)))
+
+
+ @typechecked
+ def _chain_names2one_letter(structure: gemmi.Structure, only_uppercase: bool = True) -> Dict[str, str]:
+     """
+     Automatically generate a one-letter chain-name mapper when a chain name is longer
+     than one character or is not an uppercase letter.
+
+     (1) when only_uppercase is True, only supported when the one-model structure has <= 26 chains
+     (2) when only_uppercase is False, only supported when the one-model structure has <= 62 chains
+
+     If there are too many chains, make some splits or assemblies first,
+     or just keep the longer chain names in .cif format.
+     The PDB format only supports single-letter chain names.
+     """
+
+     if len(structure) > 1:
+         raise RuntimeError("> 1 models in structure, do nothing")
+
+     _assert_unique_chain_names_in_models(structure)
+
+     n_chains = len(structure[0])
+     if only_uppercase:
+         l1 = ['Z', 'Y', 'X', 'W', 'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'O', 'N', 'M',
+               'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
+         mode = "UPPERCASE"
+     else:
+         l1 = ['9', '8', '7', '6', '5', '4', '3', '2', '1', '0',
+               'z', 'y', 'x', 'w', 'v', 'u', 't', 's', 'r', 'q',
+               'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'g',
+               'f', 'e', 'd', 'c', 'b', 'a', 'Z', 'Y', 'X', 'W',
+               'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'O', 'N', 'M',
+               'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
+         mode = "UPPERCASE + LOWERCASE + DIGITAL"
+
+     if n_chains > len(l1):
+         raise RuntimeError("Support max %d chains under %s mode, but got %d chains in structure"
+                            % (len(l1), mode, n_chains))
+
+     existed_one_letter_ids = []
+     for model in structure:
+         for chain in model:
+             if chain.name in l1 and chain.name not in existed_one_letter_ids:
+                 existed_one_letter_ids.append(chain.name)
+
+     left_l1 = [i for i in l1 if i not in existed_one_letter_ids]
+
+     name_mapper = dict()
+     for model in structure:
+         for chain in model:
+             if chain.name not in l1:
+                 new_name = left_l1.pop()
+                 name_mapper[chain.name] = new_name
+     return name_mapper
+
+
+ @typechecked
+ def get_assembly(structure: gemmi.Structure, assembly_name: str,
+                  how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
+     struct = structure.clone()
+     struct.transform_to_assembly(assembly_name, how)
+
+     # update ENTITY.polymer2eid
+     scn2eid = dict()
+     for ent in struct.entities:
+         for scn in ent.subchains:
+             scn2eid[scn] = ent.name
+
+     polymer2eid = dict()
+     for model in struct:
+         for chain in model:
+             for sc in chain.subchains():
+                 sc_t = sc.check_polymer_type().name
+                 if sc_t in ["PeptideL", "Dna", "Rna"]:
+                     polymer2eid[chain.name] = scn2eid[sc.subchain_id()]
+                     break
+     return struct, polymer2eid
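
The second file builds the package's parsing layer on top of gemmi: cif_parser and pdb_parser each return a gemmi.Structure together with an entity record, and get_assembly expands a named biological assembly while rebuilding the chain-to-entity mapping. A minimal usage sketch follows; the import path gemmi_protools.parsers, the input file name, and the assembly name "1" are assumptions not shown in the diff:

    # Illustrative sketch only: the module path, file name, and assembly name are assumed.
    from gemmi_protools.parsers import cif_parser, get_assembly

    struct, entities = cif_parser("example.cif.gz")   # .cif and .cif.gz are both accepted
    print(struct.resolution)                          # resolution re-read from the cif block
    print(entities["eid2desc"])                       # entity id -> description mapping

    # Expand biological assembly "1"; copied chains are renamed by appending numbers (AddNumber).
    assembly, polymer2eid = get_assembly(struct, "1")
    print(len(assembly[0]), "chains in the assembly, mapped to entities:", polymer2eid)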