gemmi-protools 0.1.17__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gemmi-protools might be problematic. Click here for more details.
- gemmi_protools/__init__.py +1 -4
- gemmi_protools/io/convert.py +0 -3
- gemmi_protools/io/reader.py +749 -310
- gemmi_protools/{utils → tools}/align.py +38 -55
- gemmi_protools/tools/dockq.py +127 -0
- gemmi_protools/tools/mesh.py +95 -0
- gemmi_protools/{utils → tools}/pdb_annot.py +21 -106
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.0.dist-info}/METADATA +14 -11
- gemmi_protools-1.0.0.dist-info/RECORD +19 -0
- gemmi_protools/io/cif_opts.py +0 -173
- gemmi_protools/io/parse_pdb_header.py +0 -387
- gemmi_protools/io/parser.py +0 -292
- gemmi_protools/io/pdb_opts.py +0 -179
- gemmi_protools/io/peptide.py +0 -32
- gemmi_protools/io/struct_info.py +0 -91
- gemmi_protools/utils/dockq.py +0 -139
- gemmi_protools/utils/fixer.py +0 -274
- gemmi_protools/utils/immune_complex.py +0 -787
- gemmi_protools/utils/ppi.py +0 -74
- gemmi_protools-0.1.17.dist-info/RECORD +0 -27
- /gemmi_protools/{utils → tools}/__init__.py +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.0.dist-info}/WHEEL +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.0.dist-info}/top_level.txt +0 -0
gemmi_protools/io/cif_opts.py
DELETED
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
@Author: Luo Jiejian
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import pathlib
|
|
6
|
-
from typing import Union, Dict, Any
|
|
7
|
-
|
|
8
|
-
import gemmi
|
|
9
|
-
import pandas as pd
|
|
10
|
-
from typeguard import typechecked
|
|
11
|
-
|
|
12
|
-
from gemmi_protools.io.struct_info import Entity
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@typechecked
|
|
16
|
-
def _is_cif(path: Union[str, pathlib.Path]) -> bool:
|
|
17
|
-
if isinstance(path, str):
|
|
18
|
-
path = pathlib.Path(path)
|
|
19
|
-
if path.suffixes:
|
|
20
|
-
if path.suffixes[-1] == ".cif":
|
|
21
|
-
return True
|
|
22
|
-
elif "".join(path.suffixes[-2:]) == ".cif.gz":
|
|
23
|
-
return True
|
|
24
|
-
else:
|
|
25
|
-
return False
|
|
26
|
-
else:
|
|
27
|
-
return False
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@typechecked
|
|
31
|
-
def _value_mapper_from_block(block: gemmi.cif.Block, category: str, column1: str, column2: str,
                             expand_column1: bool = False) -> Dict[str, Any]:
    """Map values of *column1* to values of *column2* within one mmCIF category.

    :param block: CIF block to read from
    :param category: mmCIF category name, e.g. ``"_entity."``
    :param column1: fully qualified tag whose values become the keys
    :param column2: fully qualified tag whose values become the values;
        every value is ``"?"`` when the tag is absent from the category
    :param expand_column1: when True, comma-joined keys are split and each
        stripped, non-empty part maps to the same value
    :return: the mapping; empty dict unless *column1* is present
    """
    loop = block.find_mmcif_category(category)
    tags = list(loop.tags)

    if column1 not in tags:
        return {}

    key_col = loop.column(tags.index(column1))
    keys = [key_col.str(i) for i in range(len(key_col))]

    if column2 in tags:
        val_col = loop.column(tags.index(column2))
        values = [val_col.str(i) for i in range(len(val_col))]
    else:
        values = ["?"] * len(keys)

    mapping = dict(zip(keys, values))
    if not expand_column1:
        return mapping

    # Split comma-joined keys (e.g. "A, B" in _entity_poly.pdbx_strand_id)
    # so each individual id maps to the shared value.
    expanded = {}
    for joint_key, value in mapping.items():
        for part in joint_key.split(","):
            part = part.strip()
            if part:
                expanded[part] = value
    return expanded
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@typechecked
|
|
74
|
-
def _get_cif_resolution(block: gemmi.cif.Block) -> float:
    """Return the first resolution value found in the usual mmCIF tags.

    Tags are probed in a fixed priority order; the first value that parses
    as a float wins.  Returns 0.0 when nothing parseable is found.
    """
    for tag in ("_reflns.d_resolution_high",
                "_refine.ls_d_res_high",
                "_refine_hist.d_res_high",
                "_em_3d_reconstruction.resolution"):
        raw = block.find_value(tag)  # may be None when the tag is absent
        try:
            return float(raw)
        except (TypeError, ValueError):
            continue
    return 0.0
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
@typechecked
|
|
93
|
-
def _cif_entity_info(block: gemmi.cif.Block) -> Entity:
    """Assemble an Entity record from the entity-related CIF categories.

    Collects entity descriptions, the polymer-chain -> entity-id map, and
    species/taxonomy mappings gathered from the three source categories.
    """
    eid2desc = _value_mapper_from_block(block, category="_entity.",
                                        column1="_entity.id",
                                        column2="_entity.pdbx_description")

    polymer2eid = _value_mapper_from_block(block, category="_entity_poly.",
                                           column1="_entity_poly.pdbx_strand_id",
                                           column2="_entity_poly.entity_id",
                                           expand_column1=True)

    # Species and taxonomy ids can come from three categories; entries from
    # later categories overwrite earlier ones (same precedence as the
    # original sequential dict.update calls).
    src_categories = (
        ("_entity_src_gen", "pdbx_gene_src_scientific_name", "pdbx_gene_src_ncbi_taxonomy_id"),
        ("_pdbx_entity_src_syn", "organism_scientific", "ncbi_taxonomy_id"),
        ("_entity_src_nat", "pdbx_organism_scientific", "pdbx_ncbi_taxonomy_id"),
    )
    eid2specie = {}
    eid2taxid = {}
    for cat, species_col, taxid_col in src_categories:
        eid2specie.update(_value_mapper_from_block(block, category=cat + ".",
                                                   column1=cat + ".entity_id",
                                                   column2="%s.%s" % (cat, species_col)))
        eid2taxid.update(_value_mapper_from_block(block, category=cat + ".",
                                                  column1=cat + ".entity_id",
                                                  column2="%s.%s" % (cat, taxid_col)))

    return Entity(eid2desc=eid2desc,
                  eid2specie=eid2specie,
                  eid2taxid=eid2taxid,
                  polymer2eid=polymer2eid)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
@typechecked
def _cif_block_for_output(structure: gemmi.Structure, entity: Entity) -> gemmi.cif.Block:
    """Build an mmCIF block from *structure* with entity metadata re-injected.

    Rewrites the _reflns loop from structure.resolution, refreshes the
    _entity.pdbx_description column from entity["eid2desc"], and writes a
    new _entity_src_gen loop from entity["eid2specie"]/entity["eid2taxid"].
    """
    block = structure.make_mmcif_block()

    # Replace any existing _reflns data with a single-row loop carrying the
    # structure's resolution (2 decimals) as both high and low resolution.
    reflns = block.find_mmcif_category(category="_reflns.")
    resolution = "%.2f" % structure.resolution
    reflns.erase()
    reflns_loop = block.init_loop(prefix="_reflns.", tags=["d_resolution_high", "d_resolution_low"])
    reflns_loop.add_row([resolution, resolution])

    # Load the _entity table into a DataFrame and overwrite the description
    # column from the entity mapping; blank/missing descriptions become "?".
    ta = block.find_mmcif_category(category="_entity.")
    da = pd.DataFrame(list(ta), columns=list(ta.tags))
    if "_entity.id" in da.columns:
        da["_entity.pdbx_description"] = da["_entity.id"].apply(
            lambda i: entity["eid2desc"].get(i, "?").strip() or "?")

    # Quote every cell for CIF output before rewriting the table rows.
    rows = []
    for ar in da.to_numpy().tolist():
        rows.append([gemmi.cif.quote(i) for i in ar])

    # Make sure the description column exists in the underlying loop first,
    # otherwise the widened rows below would not fit.
    if "_entity.pdbx_description" not in list(ta.tags):
        ta.loop.add_columns(["_entity.pdbx_description"], "?")

    # Re-fetch the (possibly widened) table, then replace all of its rows.
    ta = block.find_mmcif_category(category="_entity.")
    for _ in range(len(ta)):
        ta.remove_row(0)
    for row in rows:
        ta.append_row(row)

    # Emit one species/taxonomy row per entity id known to the Entity record.
    loop = block.init_loop("_entity_src_gen.", ["entity_id",
                                                "pdbx_gene_src_scientific_name",
                                                "pdbx_gene_src_ncbi_taxonomy_id"])

    for k in entity["eid2specie"].keys():
        loop.add_row([gemmi.cif.quote(k),
                      gemmi.cif.quote(entity["eid2specie"].get(k, "?")),
                      gemmi.cif.quote(entity["eid2taxid"].get(k, "?"))]
                     )
    # Relocate the freshly appended loop (last item) to position 16.
    # NOTE(review): the index 16 appears tied to gemmi's default item order
    # in make_mmcif_block() -- confirm against the gemmi version in use.
    block.move_item(-1, 16)
    return block
|
|
@@ -1,387 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# Copyright 2004 Kristian Rother.
|
|
3
|
-
# Revisions copyright 2004 Thomas Hamelryck.
|
|
4
|
-
# Revisions copyright 2024 James Krieger.
|
|
5
|
-
#
|
|
6
|
-
# This file is part of the Biopython distribution and governed by your
|
|
7
|
-
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
8
|
-
# Please see the LICENSE file that should have been included as part of this
|
|
9
|
-
# package.
|
|
10
|
-
|
|
11
|
-
"""Parse header of PDB files into a python dictionary.
|
|
12
|
-
|
|
13
|
-
Emerged from the Columba database project www.columba-db.de, original author
|
|
14
|
-
Kristian Rother.
|
|
15
|
-
|
|
16
|
-
Modify _parse_pdb_header_list
|
|
17
|
-
Don't perform lower() to chain id
|
|
18
|
-
By Luo Jiejian
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
import re
|
|
22
|
-
from collections import defaultdict
|
|
23
|
-
|
|
24
|
-
from Bio import File
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def _get_biomoltrans(inl):
    """Parse REMARK 350 records into {biomolecule id: [chains, rows...]} (PRIVATE).

    Each value list holds, per transformation, the list of chain ids it
    applies to followed by the raw text (columns 24+) of the BIOMT rows.
    """
    # REMARK 350
    # REMARK 350 COORDINATES FOR A COMPLETE MULTIMER REPRESENTING THE KNOWN
    # REMARK 350 BIOLOGICALLY SIGNIFICANT OLIGOMERIZATION STATE OF THE
    # REMARK 350 MOLECULE CAN BE GENERATED BY APPLYING BIOMT TRANSFORMATIONS
    # REMARK 350 GIVEN BELOW.  BOTH NON-CRYSTALLOGRAPHIC AND
    # REMARK 350 CRYSTALLOGRAPHIC OPERATIONS ARE GIVEN.
    # REMARK 350
    # REMARK 350 BIOMOLECULE: 1
    # REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: MONOMERIC
    # REMARK 350 APPLY THE FOLLOWING TO CHAINS: A
    # REMARK 350   BIOMT1   1  1.000000  0.000000  0.000000  0.00000
    # REMARK 350   BIOMT2   1  0.000000  1.000000  0.000000  0.00000
    # REMARK 350   BIOMT3   1  0.000000  0.000000  1.000000  0.00000
    biomolecule = defaultdict(list)
    for line in inl:
        if line.startswith("REMARK 350"):
            if line[11:23] == "BIOMOLECULE:":
                # New biomolecule section: remember its id, reset chain list.
                currentBiomolecule = line.split()[-1]
                applyToChains = []
            elif (
                line[11:41] == "APPLY THE FOLLOWING TO CHAINS:"
                or line[30:41] == "AND CHAINS:"
            ):
                # Accumulate the comma-separated chain ids (may continue
                # over several "AND CHAINS:" lines).
                applyToChains.extend(
                    line[41:].replace(" ", "").strip().strip(",").split(",")
                )
            elif line[13:18] == "BIOMT":
                biomt = biomolecule[currentBiomolecule]
                if line[13:19] == "BIOMT1":
                    # Start of a new transformation: record its chain list
                    # first.  With no explicit list since the last BIOMT,
                    # fall back to the first list stored for this molecule.
                    if applyToChains == []:
                        applyToChains = biomt[0]
                    biomt.append(applyToChains)
                elif line[13:19]:
                    # BIOMT2/BIOMT3 row: clear the chain list so the next
                    # BIOMT1 reuses the stored one unless re-stated.
                    applyToChains = []
                biomt.append(line[23:])
    return dict(biomolecule)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def _get_journal(inl):
|
|
67
|
-
# JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7
|
|
68
|
-
journal = ""
|
|
69
|
-
for line in inl:
|
|
70
|
-
if re.search(r"\AJRNL", line):
|
|
71
|
-
journal += line[19:72].lower()
|
|
72
|
-
journal = re.sub(r"\s\s+", " ", journal)
|
|
73
|
-
return journal
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def _get_references(inl):
|
|
77
|
-
# REMARK 1 REFERENCE 1 1CSE 11
|
|
78
|
-
# REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12
|
|
79
|
-
references = []
|
|
80
|
-
actref = ""
|
|
81
|
-
for line in inl:
|
|
82
|
-
if re.search(r"\AREMARK 1", line):
|
|
83
|
-
if re.search(r"\AREMARK 1 REFERENCE", line):
|
|
84
|
-
if actref != "":
|
|
85
|
-
actref = re.sub(r"\s\s+", " ", actref)
|
|
86
|
-
if actref != " ":
|
|
87
|
-
references.append(actref)
|
|
88
|
-
actref = ""
|
|
89
|
-
else:
|
|
90
|
-
actref += line[19:72].lower()
|
|
91
|
-
|
|
92
|
-
if actref != "":
|
|
93
|
-
actref = re.sub(r"\s\s+", " ", actref)
|
|
94
|
-
if actref != " ":
|
|
95
|
-
references.append(actref)
|
|
96
|
-
return references
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
# bring dates to format: 1909-01-08
|
|
100
|
-
def _format_date(pdb_date):
|
|
101
|
-
"""Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE)."""
|
|
102
|
-
date = ""
|
|
103
|
-
year = int(pdb_date[7:])
|
|
104
|
-
if year < 50:
|
|
105
|
-
century = 2000
|
|
106
|
-
else:
|
|
107
|
-
century = 1900
|
|
108
|
-
date = str(century + year) + "-"
|
|
109
|
-
all_months = [
|
|
110
|
-
"xxx",
|
|
111
|
-
"Jan",
|
|
112
|
-
"Feb",
|
|
113
|
-
"Mar",
|
|
114
|
-
"Apr",
|
|
115
|
-
"May",
|
|
116
|
-
"Jun",
|
|
117
|
-
"Jul",
|
|
118
|
-
"Aug",
|
|
119
|
-
"Sep",
|
|
120
|
-
"Oct",
|
|
121
|
-
"Nov",
|
|
122
|
-
"Dec",
|
|
123
|
-
]
|
|
124
|
-
month = str(all_months.index(pdb_date[3:6]))
|
|
125
|
-
if len(month) == 1:
|
|
126
|
-
month = "0" + month
|
|
127
|
-
date = date + month + "-" + pdb_date[:2]
|
|
128
|
-
return date
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def _chop_end_codes(line):
|
|
132
|
-
"""Chops lines ending with ' 1CSA 14' and the like (PRIVATE)."""
|
|
133
|
-
return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def _chop_end_misc(line):
|
|
137
|
-
"""Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE)."""
|
|
138
|
-
return re.sub(r"\s+\d\d-\w\w\w-\d\d\s+[1-9][0-9A-Z]{3}\s*\Z", "", line)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def _nice_case(line):
|
|
142
|
-
"""Make A Lowercase String With Capitals (PRIVATE)."""
|
|
143
|
-
line_lower = line.lower()
|
|
144
|
-
s = ""
|
|
145
|
-
i = 0
|
|
146
|
-
nextCap = 1
|
|
147
|
-
while i < len(line_lower):
|
|
148
|
-
c = line_lower[i]
|
|
149
|
-
if c >= "a" and c <= "z" and nextCap:
|
|
150
|
-
c = c.upper()
|
|
151
|
-
nextCap = 0
|
|
152
|
-
elif c in " .,;:\t-_":
|
|
153
|
-
nextCap = 1
|
|
154
|
-
s += c
|
|
155
|
-
i += 1
|
|
156
|
-
return s
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def parse_pdb_header(infile):
    """Return the header lines of a pdb file as a dictionary.

    Dictionary keys are: head, deposition_date, release_date, structure_method,
    resolution, structure_reference, journal_reference, author and
    compound.

    :param infile: a file name or open handle accepted by Bio.File.as_handle
    :return: dict produced by _parse_pdb_header_list from the lines that
        precede the first coordinate record
    """
    header = []
    with File.as_handle(infile) as f:
        for line in f:
            record_type = line[0:6]
            # PDB record names occupy columns 1-6, so "ATOM" must be padded
            # to six characters ("ATOM  ") -- a 5-character literal can never
            # equal the 6-character slice and the scan would never stop.
            if record_type in ("ATOM  ", "HETATM", "MODEL "):
                break
            header.append(line)
    return _parse_pdb_header_list(header)
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def _parse_remark_465(line):
|
|
177
|
-
"""Parse missing residue remarks.
|
|
178
|
-
|
|
179
|
-
Returns a dictionary describing the missing residue.
|
|
180
|
-
The specification for REMARK 465 at
|
|
181
|
-
http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465
|
|
182
|
-
only gives templates, but does not say they have to be followed.
|
|
183
|
-
So we assume that not all pdb-files with a REMARK 465 can be understood.
|
|
184
|
-
|
|
185
|
-
Returns a dictionary with the following keys:
|
|
186
|
-
"model", "res_name", "chain", "ssseq", "insertion"
|
|
187
|
-
"""
|
|
188
|
-
if line:
|
|
189
|
-
# Note that line has been stripped.
|
|
190
|
-
assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped"
|
|
191
|
-
pattern = re.compile(
|
|
192
|
-
r"""
|
|
193
|
-
(\d+\s[\sA-Z][\sA-Z][A-Z] | # Either model number + residue name
|
|
194
|
-
[A-Z]{1,3}) # Or only residue name with 1 (RNA) to 3 letters
|
|
195
|
-
\s ([A-Za-z0-9]) # A single character chain
|
|
196
|
-
\s+(-?\d+[A-Za-z]?)$ # Residue number: A digit followed by an optional
|
|
197
|
-
# insertion code (Hetero-flags make no sense in
|
|
198
|
-
# context with missing res)
|
|
199
|
-
""",
|
|
200
|
-
re.VERBOSE,
|
|
201
|
-
)
|
|
202
|
-
match = pattern.match(line)
|
|
203
|
-
if match is None:
|
|
204
|
-
return None
|
|
205
|
-
residue = {}
|
|
206
|
-
if " " in match.group(1):
|
|
207
|
-
model, residue["res_name"] = match.group(1).split()
|
|
208
|
-
residue["model"] = int(model)
|
|
209
|
-
else:
|
|
210
|
-
residue["model"] = None
|
|
211
|
-
residue["res_name"] = match.group(1)
|
|
212
|
-
residue["chain"] = match.group(2)
|
|
213
|
-
try:
|
|
214
|
-
residue["ssseq"] = int(match.group(3))
|
|
215
|
-
except ValueError:
|
|
216
|
-
residue["insertion"] = match.group(3)[-1]
|
|
217
|
-
residue["ssseq"] = int(match.group(3)[:-1])
|
|
218
|
-
else:
|
|
219
|
-
residue["insertion"] = None
|
|
220
|
-
return residue
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
def _parse_pdb_header_list(header):
    """Build the header dictionary from raw PDB header lines (PRIVATE).

    *header* is the list of lines preceding the first coordinate record.
    Fields that never appear keep the defaults initialized below.
    Unlike upstream Biopython, chain ids in COMPND are NOT lowercased.
    """
    # database fields
    pdbh_dict = {
        "name": "",
        "head": "",
        "idcode": "",
        "deposition_date": "1909-01-08",
        "release_date": "1909-01-08",
        "structure_method": "unknown",
        "resolution": None,
        "structure_reference": "unknown",
        "journal_reference": "unknown",
        "author": "",
        "compound": {"1": {"misc": ""}},
        "source": {"1": {"misc": ""}},
        "has_missing_residues": False,
        "missing_residues": [],
        "biomoltrans": [],
    }

    # Multi-line record families are parsed by dedicated helpers first.
    pdbh_dict["structure_reference"] = _get_references(header)
    pdbh_dict["journal_reference"] = _get_journal(header)
    pdbh_dict["biomoltrans"] = _get_biomoltrans(header)
    comp_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
        # key=re.sub("\s.+\s*","",h)
        key = h[:6].strip()  # record name, columns 1-6
        # tail=re.sub("\A\w+\s+\d*\s*","",h)
        tail = h[10:].strip()  # payload after the continuation field
        # print("%s:%s" % (key, tail)

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            # TITLE may span several lines; accumulate with single spaces.
            name = _chop_end_codes(tail).lower()
            pdbh_dict["name"] = " ".join([pdbh_dict["name"], name]).strip()
        elif key == "HEADER":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["deposition_date"] = _format_date(_nice_case(rr.group()))
            rr = re.search(r"\s+([1-9][0-9A-Z]{3})\s*\Z", tail)
            if rr is not None:
                pdbh_dict["idcode"] = rr.group(1)
            head = _chop_end_misc(tail).lower()
            pdbh_dict["head"] = head
        elif key == "COMPND":
            # LJJ: keep original case here; values are lowercased selectively below.
            # tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail))
            # look for E.C. numbers in COMPND lines
            rec = re.search(r"\d+\.\d+\.\d+\.\d+", tt)
            if rec:
                pdbh_dict["compound"][comp_molid]["ec_number"] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            tok = tt.split(":")
            if len(tok) >= 2:
                # lower ckey LJJ
                ckey = tok[0].lower()
                # ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    # mol_id, keep original, usually digital string
                    pdbh_dict["compound"][cval] = {"misc": ""}
                    comp_molid = cval
                    last_comp_key = "misc"
                else:
                    # Lowercase every value EXCEPT chain ids (case matters). LJJ
                    if ckey != "chain":
                        cval = cval.lower()

                    pdbh_dict["compound"][comp_molid][ckey] = cval
                    last_comp_key = ckey
            else:
                # Continuation line: append to the last key seen. LJJ
                # pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0] + " "
                pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0].lower() + " "
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            tok = tt.split(":")
            # print(tok)
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    pdbh_dict["source"][cval] = {"misc": ""}
                    comp_molid = cval
                    last_src_key = "misc"
                else:
                    pdbh_dict["source"][comp_molid][ckey] = cval
                    last_src_key = ckey
            else:
                # Continuation line for the previous SOURCE key.
                pdbh_dict["source"][comp_molid][last_src_key] += tok[0] + " "
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if "keywords" in pdbh_dict:
                pdbh_dict["keywords"] += " " + kwd
            else:
                pdbh_dict["keywords"] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r"\s\s\s\s\s\s\s.*\Z", "", expd)
            # if re.search('\Anmr',expd,re.IGNORECASE): expd='nmr'
            # if re.search('x-ray diffraction',expd,re.IGNORECASE): expd='x-ray diffraction'
            pdbh_dict["structure_method"] = expd.lower()
        elif key == "CAVEAT":
            # make Annotation entries out of these!!!
            pass
        elif key == "REVDAT":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["release_date"] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            # print("%s:%s" % (key, tail))
            if "journal" in pdbh_dict:
                pdbh_dict["journal"] += tail
            else:
                pdbh_dict["journal"] = tail
        elif key == "AUTHOR":
            auth = _nice_case(_chop_end_codes(tail))
            if "author" in pdbh_dict:
                pdbh_dict["author"] += auth
            else:
                pdbh_dict["author"] = auth
        elif key == "REMARK":
            if re.search("REMARK   2 RESOLUTION.", hh):
                r = _chop_end_codes(re.sub("REMARK   2 RESOLUTION.", "", hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh_dict["resolution"] = float(r)
                except ValueError:
                    # print('nonstandard resolution %r' % r)
                    pdbh_dict["resolution"] = None
            elif hh.startswith("REMARK 465"):
                if tail:
                    pdbh_dict["has_missing_residues"] = True
                    missing_res_info = _parse_remark_465(tail)
                    if missing_res_info:
                        pdbh_dict["missing_residues"].append(missing_res_info)
            elif hh.startswith("REMARK  99 ASTRAL"):
                if tail:
                    remark_99_keyval = tail.replace("ASTRAL ", "").split(": ")
                    if (
                        isinstance(remark_99_keyval, list)
                        and len(remark_99_keyval) == 2
                    ):
                        if "astral" not in pdbh_dict:
                            pdbh_dict["astral"] = {
                                remark_99_keyval[0]: remark_99_keyval[1]
                            }
                        else:
                            pdbh_dict["astral"][remark_99_keyval[0]] = remark_99_keyval[
                                1
                            ]
        else:
            # print(key)
            pass
    # Heuristic: a resolution without a stated method implies X-ray.
    if pdbh_dict["structure_method"] == "unknown":
        res = pdbh_dict["resolution"]
        if res is not None and res > 0.0:
            pdbh_dict["structure_method"] = "x-ray diffraction"
    return pdbh_dict
|