gemmi-protools 0.1.17__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -1,292 +0,0 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
4
- import pathlib
5
- from collections import Counter
6
- from typing import Union, Optional, Dict, List
7
-
8
- import gemmi
9
- from typeguard import typechecked
10
-
11
- from gemmi_protools.io.cif_opts import _cif_entity_info, _is_cif, _get_cif_resolution
12
- from gemmi_protools.io.pdb_opts import _pdb_entity_info, _is_pdb, _get_pdb_resolution
13
- from gemmi_protools.io.struct_info import Entity
14
-
15
-
16
- @typechecked
17
def _ent_from_structure(struct: gemmi.Structure) -> Entity:
    """
    Derive entity information from an in-memory structure.

    The structure is serialised to an mmCIF block, which is then parsed with
    _cif_entity_info. Any entity of the structure that has no description
    in the parsed result is described by its own name.

    Run .setup_entities() in advance

    :param struct: structure whose entities have already been set up
    :return: entity information parsed from the generated mmCIF block
    """
    ent_info = _cif_entity_info(struct.make_mmcif_block())
    descriptions = ent_info["eid2desc"]
    for ent in struct.entities:
        # Only fill the gaps; descriptions that were parsed are kept.
        descriptions.setdefault(ent.name, ent.name)
    return ent_info
29
-
30
-
31
- @typechecked
32
def cif_parser(path: Union[str, pathlib.Path]):
    """
    Parse .cif or .cif.gz

    The file is read twice: once as a raw CIF document (for the resolution
    and the entity tables) and once as a gemmi structure.

    :param path: path of a .cif or .cif.gz file
    :return: (gemmi.Structure, entity)
    :raises ValueError: if _is_cif rejects the path
    """
    if not _is_cif(path):
        raise ValueError("Only support .cif or .cif.gz file, but got %s" % path)

    path_str = str(path)
    doc = gemmi.cif.read(path_str)
    block0 = doc.sole_block()
    struct = gemmi.read_structure(path_str)
    struct.setup_entities()

    # A sheet_id such as 1' can leave sheets that contain no strands.
    # Drop them, highest index first so the remaining indices stay valid.
    empty_sheets = [idx for idx, sheet in enumerate(struct.sheets)
                    if len(sheet.strands) == 0]
    for idx in reversed(empty_sheets):
        del struct.sheets[idx]

    # gemmi fails to parse the right resolution, so take it from the block
    struct.resolution = _get_cif_resolution(block0)

    # Entity information from the document, completed with whatever is
    # only known from the structure itself.
    ent_0 = _cif_entity_info(block0)
    ent_1 = _ent_from_structure(struct)
    for super_key in ("eid2desc", "polymer2eid"):
        target = ent_0[super_key]
        for key, val in ent_1[super_key].items():
            target.setdefault(key, val)
    return struct, ent_0
76
-
77
-
78
- @typechecked
79
def _assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
    """
    Rename the entities to "1", "2", ... unless all names are digits already.

    Run .setup_entities() in advance

    :param structure: structure whose entities are renamed in place
    :return: {old_entity_name: new_entity_name}, or None when every entity
        name consists of digits only and nothing was renamed
    """
    if all(ent.name.isdigit() for ent in structure.entities):
        return None

    mapper = dict()
    for number, ent in enumerate(structure.entities, start=1):
        new_name = str(number)
        mapper[ent.name] = new_name
        ent.name = new_name
    return mapper
101
-
102
-
103
- @typechecked
104
def _update_entity_names(entity: Entity, mapper: Dict[str, str]):
    """
    Replace old entity names by new ones, in place.

    The keys of eid2desc, eid2specie and eid2taxid and the values of
    polymer2eid are translated through mapper. A name that is missing from
    mapper raises KeyError.

    :param entity: entity information to update
    :param mapper: {old_entity_name: new_entity_name}
    :return:
    """
    for super_key in ('eid2desc', 'eid2specie', 'eid2taxid'):
        renamed = {mapper[old_eid]: val for old_eid, val in entity[super_key].items()}
        setattr(entity, super_key, renamed)

    relinked = {chain: mapper[old_eid] for chain, old_eid in entity.polymer2eid.items()}
    setattr(entity, "polymer2eid", relinked)
121
-
122
-
123
- def _melt_dict(inputs: dict):
124
- outputs = dict()
125
- for keys, val in inputs.items():
126
- for k in keys.split(","):
127
- outputs[k] = val
128
- return outputs
129
-
130
-
131
- @typechecked
132
def pdb_parser(path: Union[str, pathlib.Path]):
    """
    Parse .pdb or .pdb.gz

    Entity descriptions, species and taxonomy ids are taken from the PDB
    header and attached to the entities that gemmi sets up; afterwards the
    entities are renamed to numbers when their names are not numeric yet.

    :param path: path of a .pdb or .pdb.gz file
    :return: (gemmi.Structure, entity)
    :raises ValueError: if _is_pdb rejects the path
    """
    if not _is_pdb(path):
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)

    struct = gemmi.read_structure(str(path))
    struct.resolution = _get_pdb_resolution(struct.raw_remarks)

    # Header records are keyed by comma-joined chain ids; expand them so
    # that every chain id can be looked up on its own.
    ent_0 = _pdb_entity_info(path)
    ch2desc = _melt_dict(ent_0.eid2desc)
    ch2specie = _melt_dict(ent_0.eid2specie)
    ch2taxid = _melt_dict(ent_0.eid2taxid)

    struct.setup_entities()
    block = struct.make_mmcif_block()
    ent_t = _cif_entity_info(block)

    # set non-polymer entity names
    non_polymer_entities = [e.name for e in struct.entities if e.polymer_type.name == "Unknown"]
    for k in non_polymer_entities:
        assert k in ent_t.eid2desc
        if ent_t.eid2desc[k] == "?":
            ent_t.eid2desc[k] = k

    # every other entity gets the description found in the header, if any
    for k in ent_t.eid2desc.keys():
        if k not in non_polymer_entities:
            ent_t.eid2desc[k] = ch2desc.get(k, "?")

    polymer_chs_used_as_eid = set(ch2specie.keys()).intersection(ent_t.eid2desc.keys())
    for k in polymer_chs_used_as_eid:
        ent_t.eid2specie[k] = ch2specie.get(k, "?")
        ent_t.eid2taxid[k] = ch2taxid.get(k, "?")

    # _assign_digital_entity_names returns None when all entity names are
    # numeric already; there is nothing to rename in that case, and passing
    # None on would fail inside _update_entity_names.
    m = _assign_digital_entity_names(struct)
    if m is not None:
        _update_entity_names(ent_t, m)

    return struct, ent_t
172
-
173
-
174
- @typechecked
175
def _chain_type(structure: gemmi.Structure, chain_id: str) -> str:
    """
    Classify a chain as "protein", "dna", "rna" or "other".

    The polymer types of all subchains of the chain are collected, ignoring
    "Unknown". Exactly one distinct type decides the result; zero or several
    types give "other". All models are scanned and the last chain with a
    matching name decides.

    :param structure: structure to search
    :param chain_id: name of the chain
    :return: "protein", "dna", "rna" or "other"
    :raises RuntimeError: if no chain is named chain_id
    """
    labels = {"PeptideL": "protein",
              "Dna": "dna",
              "Rna": "rna"}
    polymer_type = None

    for model in structure:
        for cur_chain in model:
            if cur_chain.name != chain_id:
                continue
            kinds = {sc.check_polymer_type().name for sc in cur_chain.subchains()}
            kinds.discard("Unknown")
            polymer_type = kinds.pop() if len(kinds) == 1 else "Unknown"

    if polymer_type is None:
        raise RuntimeError("chain_id %s not in structure" % chain_id)
    return labels.get(polymer_type, "other")
198
-
199
-
200
- @typechecked
201
def _get_model_chain_names(model: gemmi.Model) -> List[str]:
    """
    List the chain names of a model in iteration order.

    :param model: model to inspect
    :return: chain names, duplicates included
    """
    return [ch.name for ch in model]
206
-
207
-
208
- @typechecked
209
def _assert_unique_chain_names_in_models(structure: gemmi.Structure):
    """
    Check that no model contains two chains with the same name.

    :param structure: structure to check
    :raises RuntimeError: for the first model that has duplicated chain
        names; the message lists the duplicated names
    """
    for model in structure:
        counts = Counter(ch.name for ch in model)
        dup_names = [name for name, count in counts.items() if count > 1]

        if dup_names:
            raise RuntimeError("Duplicate chain names in model %d: %s" % (model.num, ",".join(dup_names)))
217
-
218
-
219
- @typechecked
220
def _chain_names2one_letter(structure: gemmi.Structure, only_uppercase: bool = True) -> Dict[str, str]:
    """
    Automatically generate one letter mapper when the length of chain name > 1 or chain name is not uppercase letters

    (1) when only_uppercase is True, only supported when the number of chains of the one-model structure <= 26
    (2) when only_uppercase is False, only supported when the number of chains of the one-model structure <= 62

    If there are too many chains, make some splits or assemblies first,
    or just keep the longer chain names in .cif format.
    PDB only support the single letter chain name.

    :param structure: structure with exactly one model
    :param only_uppercase: restrict the new names to A-Z
    :return: {old_chain_name: new_one_letter_name} for the chains whose name
        is not one of the allowed characters; other chains are not listed
    :raises RuntimeError: if the structure has more than one model, has
        duplicated chain names, or has more chains than allowed characters
    """
    if len(structure) > 1:
        raise RuntimeError("> 1 models in structure, do nothing")

    _assert_unique_chain_names_in_models(structure)

    n_chains = len(structure[0])

    # The pools are stored in reverse so that pop() hands out 'A' first.
    upper_reversed = "ZYXWVUTSRQPONMLKJIHGFEDCBA"
    if only_uppercase:
        l1 = list(upper_reversed)
        mode = "UPPERCASE"
    else:
        l1 = list("9876543210" + "zyxwvutsrqponmlkjihgfedcba" + upper_reversed)
        mode = "UPPERCASE + LOWERCASE + DIGITAL"

    if n_chains > len(l1):
        raise RuntimeError("Support max %d chains under %s mode, but got %d chains in structure"
                           % (len(l1), mode, n_chains))

    chain_names = [chain.name for model in structure for chain in model]

    # Characters already used as chain names must not be handed out again.
    taken = {name for name in chain_names if name in l1}
    left_l1 = [letter for letter in l1 if letter not in taken]

    name_mapper = dict()
    for name in chain_names:
        if name not in l1:
            name_mapper[name] = left_l1.pop()
    return name_mapper
270
-
271
-
272
- @typechecked
273
def get_assembly(structure: gemmi.Structure, assembly_name: str,
                 how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
    """
    Build an assembly from a copy of the structure.

    :param structure: input structure, left untouched (a clone is transformed)
    :param assembly_name: name of the assembly to generate
    :param how: naming scheme for copied chains
    :return: (assembly structure, polymer2eid) where polymer2eid maps each
        chain name to the entity name of its first protein/DNA/RNA subchain;
        chains without such a subchain are not listed
    """
    struct = structure.clone()
    struct.transform_to_assembly(assembly_name, how)

    # subchain name -> entity name, as recorded in the assembly's entities
    scn2eid = {scn: ent.name for ent in struct.entities for scn in ent.subchains}

    polymer_types = ("PeptideL", "Dna", "Rna")
    polymer2eid = dict()
    for model in struct:
        for chain in model:
            first_polymer = next(
                (sc for sc in chain.subchains()
                 if sc.check_polymer_type().name in polymer_types),
                None)
            if first_polymer is not None:
                polymer2eid[chain.name] = scn2eid[first_polymer.subchain_id()]
    return struct, polymer2eid
@@ -1,179 +0,0 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
4
- import gzip
5
- import io
6
- import pathlib
7
- import re
8
- from collections import defaultdict
9
- from typing import Dict, Union, List
10
-
11
- from typeguard import typechecked
12
-
13
- from gemmi_protools.io.parse_pdb_header import _parse_pdb_header_list
14
- from gemmi_protools.io.struct_info import Entity
15
-
16
-
17
- @typechecked
18
def _molecule_information(header_dict: Dict) -> Entity:
    """
    Collect description, species and taxonomy id per compound.

    Each compound with a non-empty "chain" value yields one entry keyed by
    its chain ids, whitespace removed, sorted and joined by ",". Source
    information is looked up under the same index as the compound; missing
    values become "".

    :param header_dict: parsed header with "compound" and "source" tables
    :return: Entity with eid2desc, eid2specie and eid2taxid filled in and
        an empty polymer2eid
    """
    descriptions = dict()
    species = dict()
    taxids = dict()

    for idx, compound in header_dict["compound"].items():
        if "chain" not in compound:
            continue
        chain = re.sub(pattern=r"\s+", repl="", string=compound["chain"])
        if chain == "":
            continue

        key = ",".join(sorted(chain.split(",")))
        source = header_dict["source"][idx] if idx in header_dict["source"] else {}

        descriptions[key] = compound.get("molecule", "")
        species[key] = source.get("organism_scientific", "")
        taxids[key] = source.get("organism_taxid", "")

    return Entity(eid2desc=descriptions,
                  eid2specie=species,
                  eid2taxid=taxids,
                  polymer2eid=dict())
52
-
53
-
54
- @typechecked
55
- def _is_pdb(path: Union[str, pathlib.Path]) -> bool:
56
- if isinstance(path, str):
57
- path = pathlib.Path(path)
58
- if path.suffixes:
59
- if path.suffixes[-1] == ".pdb":
60
- return True
61
- elif "".join(path.suffixes[-2:]) == ".pdb.gz":
62
- return True
63
- else:
64
- return False
65
- else:
66
- return False
67
-
68
-
69
- # add by Ljj
70
- @typechecked
71
def _pdb_entity_info(path: Union[str, pathlib.Path]) -> Entity:
    """
    Read the entity information stored in the header of a PDB file.

    The header is everything before the first ATOM, HETATM or MODEL record.
    When the file has no such record, the whole file is the header.

    :param path: path of a .pdb or .pdb.gz file
    :return: entity information parsed from the header
    :raises ValueError: if _is_pdb rejects the path
    """
    if not _is_pdb(path):
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)

    # Record names fill columns 1-6 of a PDB line, so "ATOM" is followed by
    # two blanks and "MODEL" by one.
    coordinate_records = ("ATOM  ", "HETATM", "MODEL ")

    if pathlib.Path(path).suffixes[-1] == ".pdb":
        # plain files are opened with the platform default encoding
        text_io = open(path, "r")
    else:
        text_io = gzip.open(path, "rt", encoding="utf-8")

    # Stream the file and stop at the first coordinate record, so the
    # coordinates are never loaded just to be thrown away.
    header = []
    with text_io:
        for line in text_io:
            if line.startswith(coordinate_records):
                break
            header.append(line)

    info = _parse_pdb_header_list(header)
    return _molecule_information(info)
94
-
95
-
96
- @typechecked
97
- def _get_pdb_resolution(remark_lines: List[str]) -> float:
98
- resolutions = []
99
- for line in remark_lines:
100
- tmp = re.search(r"REMARK.+RESOLUTION.+?([\d\.]+|NOT APPLICABLE)", line)
101
- if tmp:
102
- v = tmp.groups()[0]
103
- try:
104
- vf = float(v)
105
- except (TypeError, ValueError):
106
- continue
107
- else:
108
- resolutions.append(vf)
109
- if resolutions:
110
- return min(resolutions)
111
- else:
112
- return 0.0
113
-
114
-
115
- @typechecked
116
def _pdb_record_lines(record: str, values: List[dict], fields) -> List[str]:
    """
    Format one multi-line PDB record (COMPND or SOURCE).

    The first line carries no continuation number and starts its text in
    column 11; every following line is numbered 2, 3, ... right-aligned in
    columns 8-10.

    :param record: record name, e.g. "COMPND"
    :param values: one dict per molecule, providing "mol_id" and the keys
        named in fields
    :param fields: (token, key) pairs written after MOL_ID for each molecule
    :return: formatted lines, empty when values is empty
    """
    lines = []
    for val in values:
        for token, key in (("MOL_ID", "mol_id"),) + tuple(fields):
            n_line = len(lines) + 1
            if n_line == 1:
                lines.append("%-10s%s: %s;" % (record, token, val[key]))
            else:
                lines.append("%s %3d %s: %s;" % (record, n_line, token, val[key]))
    return lines


def _compound_source_string(entity: Entity) -> List[str]:
    """
    Build the COMPND and SOURCE header lines for an entity description.

    Entities are those referenced by polymer2eid; they are sorted by name
    and numbered MOL_ID 1, 2, ... in that order. Missing descriptions,
    species or taxonomy ids are written as "?".

    :param entity: entity information
    :return: COMPND lines followed by SOURCE lines
    """
    # entity name -> chain ids that belong to it
    entity2polymer = defaultdict(list)
    for chain_id, eid in entity["polymer2eid"].items():
        entity2polymer[eid].append(chain_id)

    values = []
    for mol_id, eid in enumerate(sorted(entity2polymer), start=1):
        values.append(dict(mol_id=str(mol_id),
                           chain=", ".join(sorted(entity2polymer[eid])),
                           molecule=entity["eid2desc"].get(eid, "?"),
                           organism_scientific=entity["eid2specie"].get(eid, "?"),
                           organism_taxid=entity["eid2taxid"].get(eid, "?")))

    outputs = _pdb_record_lines("COMPND", values,
                                (("MOLECULE", "molecule"), ("CHAIN", "chain")))
    # SOURCE follows the same numbering rule as COMPND: the line after the
    # first one is continuation 2.
    outputs.extend(_pdb_record_lines("SOURCE", values,
                                     (("ORGANISM_SCIENTIFIC", "organism_scientific"),
                                      ("ORGANISM_TAXID", "organism_taxid"))))
    return outputs
@@ -1,32 +0,0 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
4
- from copy import deepcopy
5
-
6
- from Bio.PDB.Polypeptide import nucleic_letters_3to1_extended, protein_letters_3to1_extended
7
-
8
-
9
def strip_key_val(inputs):
    """
    Return a copy of a str -> str mapping with keys and values stripped.

    When two keys become equal after stripping, the entry seen last wins.

    :param inputs: mapping of strings to strings
    :return: new dict with whitespace-stripped keys and values
    """
    return {key.strip(): val.strip() for key, val in inputs.items()}
14
-
15
-
16
def __nucleic_3to1_mapper():
    """
    Build the nucleotide residue-name to one-letter table.

    Starts from a copy of nucleic_letters_3to1_extended, adds "DN" and "N"
    (both mapped to "N") and strips whitespace from keys and values.

    :return: new dict, independent of the imported table
    """
    table = deepcopy(nucleic_letters_3to1_extended)
    table.update({"DN": "N", "N": "N"})
    return strip_key_val(table)
22
-
23
-
24
def __protein_3to1_mapper():
    """
    Build the amino-acid residue-name to one-letter table.

    Starts from a copy of protein_letters_3to1_extended, adds "UNK" mapped
    to "X" and strips whitespace from keys and values.

    :return: new dict, independent of the imported table
    """
    table = deepcopy(protein_letters_3to1_extended)
    table.update({"UNK": "X"})
    return strip_key_val(table)
29
-
30
-
31
# Both lookup tables are built once, when the module is imported.
nucleic_3to1_mapper = __nucleic_3to1_mapper()
protein_3to1_mapper = __protein_3to1_mapper()
@@ -1,91 +0,0 @@
1
- """
2
- @Author: Luo Jiejian
3
- """
4
- from dataclasses import dataclass, field
5
- from datetime import datetime
6
- from typing import Dict, Optional
7
-
8
- import gemmi
9
- from typeguard import typechecked
10
-
11
-
12
- @typechecked
13
- @dataclass
14
class Entity:
    """
    Entity annotations of a structure, stored as four str -> str dicts.

    Besides attribute access the object supports a small dict-like
    interface: entity["eid2desc"], entity.get(...), entity.keys() and
    entity.update(...).
    """
    # entity id -> description
    eid2desc: Dict[str, str] = field(default_factory=dict)
    # entity id -> species
    eid2specie: Dict[str, str] = field(default_factory=dict)
    # entity id -> taxonomy id
    eid2taxid: Dict[str, str] = field(default_factory=dict)
    # polymer (chain) name -> entity id
    polymer2eid: Dict[str, str] = field(default_factory=dict)

    @typechecked
    def __setattr__(self, name: str, value: Dict[str, str]):
        # Exists only so that every assignment goes through the type check.
        super().__setattr__(name, value)

    @typechecked
    def update(self, inputs: Dict[str, Dict[str, str]]):
        """
        Replace the attributes named in inputs; unknown names are ignored.

        :param inputs: {attribute_name: new_dict}
        """
        for key, value in inputs.items():
            if not hasattr(self, key):
                continue
            setattr(self, key, value)

    def get(self, name: str, default: Optional[str] = None):
        """
        Return the attribute called name, or default when there is none.
        """
        return self[name] if hasattr(self, name) else default

    def __getitem__(self, name: str):
        return getattr(self, name)

    def keys(self):
        """
        Return the names of the instance attributes as a list.
        """
        return list(vars(self))
41
-
42
-
43
- @typechecked
44
- @dataclass
45
class Info:
    """
    General metadata of a structure, every value stored as a string.

    Each attribute corresponds to one mmCIF tag (see __attributes_mapper)
    and can be converted to and from a gemmi.InfoMap.
    """
    cell_Z: str = ""
    pdb_id: str = ""
    exp_method: str = ""
    # must always parse as %Y-%m-%d, see __setattr__
    deposition_date: str = "1909-01-08"
    title: str = ""
    keywords: str = ""
    keywords_text: str = ""

    @property
    def __attributes_mapper(self):
        # attribute name -> mmCIF tag
        return {'cell_Z': '_cell.Z_PDB',
                'pdb_id': '_entry.id',
                'exp_method': '_exptl.method',
                'deposition_date': '_pdbx_database_status.recvd_initial_deposition_date',
                'title': '_struct.title',
                'keywords': '_struct_keywords.pdbx_keywords',
                'keywords_text': '_struct_keywords.text'}

    def to_gemmi_structure_infomap(self) -> gemmi.InfoMap:
        """
        Export the attributes that carry a value as a gemmi.InfoMap.

        Empty strings and the mmCIF null markers "?" and "." are left out.
        Any other value is exported, including one-character values such as
        a cell_Z of "4".

        :return: InfoMap keyed by mmCIF tag
        """
        outputs = dict()
        for name, target_name in self.__attributes_mapper.items():
            value = getattr(self, name)
            if isinstance(value, str) and value and value not in ("?", "."):
                outputs[target_name] = value
        return gemmi.InfoMap(outputs)

    @typechecked
    def from_gemmi_structure_infomap(self, infomap: gemmi.InfoMap):
        """
        Copy the known tags of infomap into the matching attributes.

        Tags that are not listed in __attributes_mapper are ignored.

        :param infomap: InfoMap keyed by mmCIF tag
        :raises ValueError: if the deposition date is not formatted %Y-%m-%d
        """
        mapper_iv = {v: k for k, v in self.__attributes_mapper.items()}
        for key, val in infomap.items():
            if key in mapper_iv:
                setattr(self, mapper_iv[key], val)

    @typechecked
    def __setattr__(self, name: str, value: str):
        """
        Assign a known attribute, validating the deposition date first.

        Names that are not attributes of the class are silently ignored.

        :raises ValueError: if name is "deposition_date" and value is not
            formatted %Y-%m-%d
        """
        if name == "deposition_date":
            try:
                datetime.strptime(value, "%Y-%m-%d")
            except ValueError as e:
                raise ValueError(f"{e}") from e

        if hasattr(self, name):
            super().__setattr__(name, value)