biotite 0.39.0__cp311-cp311-win_amd64.whl → 0.40.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biotite/__init__.py +3 -3
- biotite/application/dssp/app.py +18 -18
- biotite/database/rcsb/download.py +19 -14
- biotite/sequence/align/banded.c +258 -237
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.c +243 -222
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.c +215 -196
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cpp +233 -205
- biotite/sequence/align/localgapped.c +258 -237
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.c +235 -214
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.c +255 -234
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.c +274 -253
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.c +215 -196
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.c +217 -197
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.c +215 -195
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codec.c +235 -214
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.c +215 -196
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.c +227 -202
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.c +215 -196
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/structure/basepairs.py +7 -12
- biotite/structure/bonds.c +1175 -1226
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/celllist.c +217 -197
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/charges.c +1052 -1101
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/filter.py +30 -37
- biotite/structure/info/__init__.py +5 -8
- biotite/structure/info/atoms.py +25 -67
- biotite/structure/info/bonds.py +46 -100
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1646 -0
- biotite/structure/info/ccd/carbohydrates.txt +1133 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +797 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +21 -20
- biotite/structure/info/misc.py +11 -22
- biotite/structure/info/standardize.py +17 -12
- biotite/structure/io/__init__.py +2 -4
- biotite/structure/io/ctab.py +1 -1
- biotite/structure/io/general.py +37 -43
- biotite/structure/io/mmtf/__init__.py +3 -0
- biotite/structure/io/mmtf/convertarray.c +219 -198
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.c +217 -197
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.c +225 -204
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.c +215 -196
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/file.py +34 -26
- biotite/structure/io/npz/__init__.py +3 -0
- biotite/structure/io/npz/file.py +21 -18
- biotite/structure/io/pdb/__init__.py +3 -3
- biotite/structure/io/pdb/file.py +5 -3
- biotite/structure/io/pdb/hybrid36.c +63 -43
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +13 -6
- biotite/structure/io/pdbx/bcif.py +649 -0
- biotite/structure/io/pdbx/cif.py +1028 -0
- biotite/structure/io/pdbx/component.py +243 -0
- biotite/structure/io/pdbx/convert.py +707 -359
- biotite/structure/io/pdbx/encoding.c +112813 -0
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/error.py +14 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/molecules.py +151 -151
- biotite/structure/sasa.c +215 -196
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/superimpose.py +158 -115
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/RECORD +92 -90
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
- biotite/structure/info/amino_acids.json +0 -1556
- biotite/structure/info/amino_acids.py +0 -42
- biotite/structure/info/carbohydrates.json +0 -1122
- biotite/structure/info/carbohydrates.py +0 -39
- biotite/structure/info/intra_bonds.msgpack +0 -0
- biotite/structure/info/link_types.msgpack +0 -1
- biotite/structure/info/nucleotides.json +0 -772
- biotite/structure/info/nucleotides.py +0 -39
- biotite/structure/info/residue_masses.msgpack +0 -0
- biotite/structure/info/residue_names.msgpack +0 -3
- biotite/structure/info/residues.msgpack +0 -0
- biotite/structure/io/pdbx/file.py +0 -652
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
biotite/structure/io/pdbx/convert.py

@@ -17,21 +17,50 @@ __all__ = [
 
 import itertools
 import warnings
-from collections import OrderedDict
 import numpy as np
 from ....file import InvalidFileError
 from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
 from ...atoms import AtomArray, AtomArrayStack, repeat
-from ...bonds import BondList, BondType
+from ...bonds import BondList, BondType, connect_via_residue_names
 from ...box import unitcell_from_vectors, vectors_from_unitcell
 from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
-from ...residues import get_residue_count
+from ...residues import get_residue_count, get_residue_starts_for
 from ...error import BadStructureError
 from ...util import matrix_rotate
+from .legacy import PDBxFile
+from .component import MaskValue
+from .cif import CIFFile, CIFBlock
+from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
+from .encoding import StringArrayEncoding
 
 
-#
-
+# Cond types in `struct_conn` category that refer to covalent bonds
+PDBX_COVALENT_TYPES = [
+    "covale", "covale_base", "covale_phosphate", "covale_sugar",
+    "disulf", "modres", "modres_link", "metalc"
+]
+# Map 'struct_conn' bond orders to 'BondType'...
+PDBX_BOND_ORDER_TO_TYPE = {
+    "": BondType.ANY,
+    "sing": BondType.SINGLE,
+    "doub": BondType.DOUBLE,
+    "trip": BondType.TRIPLE,
+    "quad": BondType.QUADRUPLE,
+}
+# ...and vice versa
+PDBX_BOND_TYPE_TO_ORDER = {
+    # 'ANY' is masked later, it is merely added here to avoid a KeyError
+    BondType.ANY: "",
+    BondType.SINGLE: "sing",
+    BondType.DOUBLE: "doub",
+    BondType.TRIPLE: "trip",
+    BondType.QUADRUPLE: "quad",
+    BondType.AROMATIC_SINGLE: "sing",
+    BondType.AROMATIC_DOUBLE: "doub",
+    BondType.AROMATIC_TRIPLE: "trip",
+}
+# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
+COMP_BOND_ORDER_TO_TYPE = {
     ("SING", "N") : BondType.SINGLE,
     ("DOUB", "N") : BondType.DOUBLE,
     ("TRIP", "N") : BondType.TRIPLE,
@@ -41,11 +70,10 @@ BOND_ORDER_TO_BOND_TYPE = {
     ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
 }
 # ...and vice versa
-
-    bond_type: order for order, bond_type in
+COMP_BOND_TYPE_TO_ORDER = {
+    bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
 }
 
-
 _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
 _nucleotideseq_type_list = [
     "polydeoxyribonucleotide",
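The two `PDBX_*` lookup tables introduced above drive the new covalent-bond handling: order strings from `struct_conn.pdbx_value_order` become `BondType` values on reading, and bond types collapse back to plain order strings (aromaticity is dropped, `ANY` is masked) on writing. A minimal illustration, re-declaring the mappings from the diff; nothing here beyond the dictionaries shown above is part of the package API:

```python
# Illustration only - the dictionaries are copied from the diff above.
from biotite.structure import BondType

PDBX_BOND_ORDER_TO_TYPE = {
    "": BondType.ANY, "sing": BondType.SINGLE, "doub": BondType.DOUBLE,
    "trip": BondType.TRIPLE, "quad": BondType.QUADRUPLE,
}
PDBX_BOND_TYPE_TO_ORDER = {
    BondType.ANY: "", BondType.SINGLE: "sing", BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip", BondType.QUADRUPLE: "quad",
    BondType.AROMATIC_SINGLE: "sing", BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
}

# Reading: 'pdbx_value_order' strings -> BondType values
print([PDBX_BOND_ORDER_TO_TYPE[order] for order in ["sing", "", "doub"]])
# Writing: aromatic orders lose their flag, e.g. AROMATIC_DOUBLE -> "doub"
print(PDBX_BOND_TYPE_TO_ORDER[BondType.AROMATIC_DOUBLE])
```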
@@ -61,6 +89,27 @@ _other_type_list = [
 ]
 
 
+def _filter(category, index):
+    """
+    Reduce the ``atom_site`` category to the values for the given
+    model.
+    """
+    Category = type(category)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
+    return Category({
+        key: Column(
+            Data(column.data.array[index]),
+            (
+                Data(column.mask.array[index])
+                if column.mask is not None else None
+            )
+        )
+        for key, column in category.items()
+    })
+
+
 def get_sequence(pdbx_file, data_block=None):
     """
     Get the protein and nucleotide sequences from the
@@ -74,11 +123,14 @@ def get_sequence(pdbx_file, data_block=None):
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
-    data_block :
-        The name of the data block.
-        (and most times only) data block of the
+    data_block : str, optional
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
 
     Returns
     -------
@@ -86,50 +138,55 @@ def get_sequence(pdbx_file, data_block=None):
         The protein and nucleotide sequences for each entity
         (equivalent to chains in most cases).
     """
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    poly_category= block["entity_poly"]
+    seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
+    seq_type = poly_category["type"].as_array(str)
     sequences = []
-
-
-
-
-            sequences.append(sequence)
-        else:
-            sequences.append(_convert_string_to_sequence(seq_string, seq_type))
+    for string, stype in zip(seq_string, seq_type):
+        sequence = _convert_string_to_sequence(string, stype)
+        if sequence is not None:
+            sequences.append(sequence)
     return sequences
 
 
-def get_model_count(
+def get_model_count(pdbx_file, data_block=None):
     """
     Get the number of models contained in a :class:`PDBxFile`.
 
     Parameters
     ----------
-
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
 
     Returns
     -------
     model_count : int
         The number of models.
     """
-
-    return len(_get_model_starts(
+    block = _get_block(pdbx_file, data_block)
+    return len(_get_model_starts(
+        block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
+    ))
 
 
 def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
-                  extra_fields=None, use_author_fields=True
+                  extra_fields=None, use_author_fields=True,
+                  include_bonds=False):
     """
     Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
     ``atom_site`` category in a :class:`PDBxFile`.
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     model : int, optional
         If this parameter is given, the function will return an
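A hedged usage sketch of the rewritten `get_sequence()` and `get_model_count()` with the new `CIFFile` class; the file name `1l2y.cif` is a placeholder, not taken from the diff:

```python
# Illustration only: reading through the new CIF classes.
from biotite.structure.io.pdbx import CIFFile, get_sequence, get_model_count

cif = CIFFile.read("1l2y.cif")   # placeholder path to any mmCIF file
print(get_sequence(cif))         # sequences parsed from 'entity_poly'
print(get_model_count(cif))      # counted via 'atom_site.pdbx_PDB_model_num'
```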
@@ -141,8 +198,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         containing all models will be returned, even if the structure
         contains only one model.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     altloc : {'first', 'occupancy', 'all'}
         This parameter defines how *altloc* IDs are handled:
         - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -176,6 +236,15 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         otherwise from the the ``label_xxx`` fields.
         If the requested field is not available, the respective other
         field is taken as fallback.
+    include_bonds : bool, optional
+        If set to true, a :class:`BondList` will be created for the
+        resulting :class:`AtomArray` containing the bond information
+        from the file.
+        Bonds, whose order could not be determined from the
+        *Chemical Component Dictionary*
+        (e.g. especially inter-residue bonds),
+        have :attr:`BondType.ANY`, since the PDB format itself does
+        not support bond orders.
 
     Returns
     -------
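A usage sketch of the new `include_bonds` option described above; `1l2y.bcif` is a placeholder file name and the call is an assumption based on the signature shown in this diff:

```python
# Illustration only: requesting bonds while parsing the structure.
from biotite.structure.io.pdbx import BinaryCIFFile, get_structure

bcif = BinaryCIFFile.read("1l2y.bcif")    # placeholder path
atoms = get_structure(bcif, model=1, include_bonds=True)
# Intra-residue bonds come from the Chemical Component Dictionary,
# inter-residue bonds from the file's 'struct_conn' category.
print(atoms.bonds.get_bond_count())
```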
@@ -186,31 +255,35 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
     --------
 
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
     >>> arr = get_structure(file, model=1)
     >>> print(len(arr))
     304
 
     """
-
+    block = _get_block(pdbx_file, data_block)
+
+    extra_fields = set() if extra_fields is None else set(extra_fields)
 
-
-    if
+    atom_site = block.get("atom_site")
+    if atom_site is None:
         raise InvalidFileError("Missing 'atom_site' category in file")
-
-    models =
+
+    models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
     model_starts = _get_model_starts(models)
     model_count = len(model_starts)
     atom_count = len(models)
 
     if model is None:
         # For a stack, the annotations are derived from the first model
-
+        model_atom_site = _filter_model(atom_site, model_starts, 1)
         # Any field of the category would work here to get the length
-        model_length =
+        model_length = model_atom_site.row_count
         stack = AtomArrayStack(model_count, model_length)
 
-        _fill_annotations(
+        _fill_annotations(
+            stack, model_atom_site, extra_fields, use_author_fields
+        )
 
         # Check if each model has the same amount of atoms
         # If not, raise exception
@@ -221,22 +294,24 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
                 "instead"
             )
 
-        stack.coord = np.
-
-        )
-
-
-
-
-
-
-
-
-
+        stack.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+        stack.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+        stack.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+
+        if include_bonds:
+            bonds = connect_via_residue_names(stack)
+            if "struct_conn" in block:
+                bonds = bonds.merge(_parse_inter_residue_bonds(
+                    model_atom_site, block["struct_conn"]
+                ))
+            stack.bonds = bonds
 
-        stack = _filter_altloc(stack,
+        stack = _filter_altloc(stack, model_atom_site, altloc)
 
-        box = _get_box(
+        box = _get_box(block)
         if box is not None:
             # Duplicate same box for each model
             stack.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
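The coordinate assignment above relies on `atom_site` storing all models back to back, so each Cartesian column can simply be reshaped; a tiny NumPy sketch of that layout (values are made up):

```python
# Illustration only: flat per-file column -> (model_count, model_length) view.
import numpy as np

model_count, model_length = 2, 3
cartn_x = np.array([1.0, 2.0, 3.0, 11.0, 12.0, 13.0])  # all models concatenated
print(cartn_x.reshape(model_count, model_length))
# [[ 1.  2.  3.]
#  [11. 12. 13.]]
```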
@@ -254,169 +329,284 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
             f"the given model {model} does not exist"
         )
 
-
+        model_atom_site = _filter_model(atom_site, model_starts, model)
         # Any field of the category would work here to get the length
-        model_length =
+        model_length = model_atom_site.row_count
         array = AtomArray(model_length)
 
-        _fill_annotations(
-
-        # Append exclusive stop
-        model_starts = np.append(
-            model_starts, [len(atom_site_dict["group_PDB"])]
-        )
-        # Indexing starts at 0, but model number starts at 1
-        model_index = model - 1
-        start, stop = model_starts[model_index], model_starts[model_index + 1]
-        array.coord = np.zeros((model_length, 3), dtype=np.float32)
-        array.coord[:, 0] = atom_site_dict["Cartn_x"][start:stop].astype(
-            np.float32
-        )
-        array.coord[:, 1] = atom_site_dict["Cartn_y"][start:stop].astype(
-            np.float32
-        )
-        array.coord[:, 2] = atom_site_dict["Cartn_z"][start:stop].astype(
-            np.float32
+        _fill_annotations(
+            array, model_atom_site, extra_fields, use_author_fields
         )
 
-        array =
+        array.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
+        array.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
+        array.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
+
+        if include_bonds:
+            bonds = connect_via_residue_names(array)
+            if "struct_conn" in block:
+                bonds = bonds.merge(_parse_inter_residue_bonds(
+                    model_atom_site, block["struct_conn"]
+                ))
+            array.bonds = bonds
+
+        array = _filter_altloc(array, model_atom_site, altloc)
 
-        array.box = _get_box(
+        array.box = _get_box(block)
 
     return array
 
 
-def
-
+def _get_block(pdbx_component, block_name):
+    if isinstance(pdbx_component, PDBxFile):
+        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
+        pdbx_component = pdbx_component.cif_file
+
+    if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
+        # Determine block
+        if block_name is None:
+            return pdbx_component.block
+        else:
+            return pdbx_component[block_name]
+    else:
+        return pdbx_component
 
-    Parameters
-    ----------
-    array : AtomArray or AtomArrayStack
-        Atom array or stack which will be annotated.
-    model_dict : dict(str, ndarray)
-        ``atom_site`` dictionary with values for one model.
-    extra_fields : list of str
-        Entry names, that are additionally added as annotation arrays.
-    use_author_fields : bool
-        Define if alternate fields prefixed with ``auth_`` should be used
-        instead of ``label_``.
-    """
 
-
-                     dict_name="input"):
+def _get_or_fallback(category, key, fallback_key, cat_name="input"):
     """
-    Return
-    otherwise try to get the
-
+    Return column related to key in category if it exists,
+    otherwise try to get the column related to fallback key.
+    """
+    if key not in category:
        warnings.warn(
-            f"Attribute '{key}' not found within '{
+            f"Attribute '{key}' not found within '{cat_name}' category. "
            f"The fallback attribute '{fallback_key}' will be used instead",
            UserWarning
        )
        try:
-            return
+            return category[fallback_key]
        except KeyError as key_exc:
            raise InvalidFileError(
                f"Fallback attribute '{fallback_key}' not found in "
                "'{dict_name}' category"
            ) from key_exc
-    return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if as_type is not None:
-        array = array.astype(as_type)
-    return formatter(array) if formatter is not None else array
+    return category[key]
+
+
+def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
+    """Fill atom_site annotations in atom array or atom array stack.
+
+    Parameters
+    ----------
+    array : AtomArray or AtomArrayStack
+        Atom array or stack which will be annotated.
+    atom_site : CIFCategory or BinaryCIFCategory
+        ``atom_site`` category with values for one model.
+    extra_fields : list of str
+        Entry names, that are additionally added as annotation arrays.
+    use_author_fields : bool
+        Define if alternate fields prefixed with ``auth_`` should be used
+        instead of ``label_``.
+    """
 
     prefix, alt_prefix = (
         ("auth", "label") if use_author_fields else ("label", "auth")
     )
 
-
-        "chain_id": (f"{prefix}_asym_id", f"{alt_prefix}_asym_id", "U4", None),
-        "res_id": (
-            f"{prefix}_seq_id",
-            f"{alt_prefix}_seq_id",
-            None,
-            lambda annot: np.array(
-                [-1 if elt in [".", "?"] else int(elt) for elt in annot]
-            ),
-        ),
-        "ins_code": (
-            "pdbx_PDB_ins_code",
-            None,
-            "U1",
-            lambda annot: np.array(
-                ["" if elt in [".", "?"] else elt for elt in annot]
-            ),
-        ),
-        "res_name": (f"{prefix}_comp_id", f"{alt_prefix}_comp_id", "U5", None),
-        "hetero": ("group_PDB", None, None, lambda annot: annot == "HETATM"),
-        "atom_name": (
-            f"{prefix}_atom_id",
-            f"{alt_prefix}_atom_id",
-            "U6",
-            None,
-        ),
-        "element": ("type_symbol", None, "U2", None),
-        "atom_id": ("id", None, int, None),
-        "b_factor": ("B_iso_or_equiv", None, float, None),
-        "occupancy": ("occupancy", None, float, None),
-        "charge": (
-            "pdbx_formal_charge",
-            None,
-            None,
-            lambda annot: np.array(
-                [
-                    0 if charge in ["?", "."] else int(charge)
-                    for charge in annot
-                ],
-                dtype=int,
-            ),
-        ),
-    }
-
-    mandatory_annotations = [
+    array.set_annotation(
         "chain_id",
+        _get_or_fallback(
+            atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
+        ).as_array("U4")
+    )
+    array.set_annotation(
         "res_id",
+        _get_or_fallback(
+            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
+        ).as_array(int, -1)
+    )
+    array.set_annotation(
         "ins_code",
+        atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
+    )
+    array.set_annotation(
         "res_name",
+        _get_or_fallback(
+            atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
+        ).as_array("U5")
+    )
+    array.set_annotation(
         "hetero",
+        atom_site["group_PDB"].as_array(str) == "HETATM"
+    )
+    array.set_annotation(
         "atom_name",
+        _get_or_fallback(
+            atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
+        ).as_array("U6")
+    )
+    array.set_annotation(
         "element",
-
+        atom_site["type_symbol"].as_array("U2")
+    )
 
-
-    for annotation_name in mandatory_annotations + extra_fields:
+    if "atom_id" in extra_fields:
         array.set_annotation(
-
-
-
-
-
-
-
-        )
+            "atom_id",
+            atom_site["id"].as_array(int)
+        )
+        extra_fields.remove("atom_id")
+    if "b_factor" in extra_fields:
+        array.set_annotation(
+            "b_factor",
+            atom_site["B_iso_or_equiv"].as_array(float)
+        )
+        extra_fields.remove("b_factor")
+    if "occupancy" in extra_fields:
+        array.set_annotation(
+            "occupancy",
+            atom_site["occupancy"].as_array(float)
+        )
+        extra_fields.remove("occupancy")
+    if "charge" in extra_fields:
+        array.set_annotation(
+            "charge",
+            atom_site["pdbx_formal_charge"].as_array(int, 0)
+        )
+        extra_fields.remove("charge")
+
+    # Handle all remaining custom fields
+    for field in extra_fields:
+        array.set_annotation(
+            field,
+            atom_site[field].as_array(str)
+        )
+
+
+def _parse_inter_residue_bonds(atom_site, struct_conn):
+    """
+    Create inter-residue bonds by parsing the ``struct_conn`` category.
+    The atom indices of each bond are found by matching the bond labels
+    to the ``atom_site`` category.
+    """
+    # Identity symmetry operation
+    IDENTITY = "1_555"
+    # Columns in 'atom_site' that should be matched by 'struct_conn'
+    COLUMNS = [
+        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
+        "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
+        "pdbx_PDB_ins_code"
+    ]
+
+    covale_mask = np.isin(
+        struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
+    )
+    if "ptnr1_symmetry" in struct_conn:
+        covale_mask &= (
+            struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
+        )
+    if "ptnr2_symmetry" in struct_conn:
+        covale_mask &= (
+            struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
+        )
+
+    atom_indices = [None] * 2
+    for i in range(2):
+        reference_arrays = []
+        query_arrays = []
+        for col_name in COLUMNS:
+            struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
+            if (
+                col_name not in atom_site
+                or struct_conn_col_name not in struct_conn
+            ):
+                continue
+            # Ensure both arrays have the same dtype to allow comparison
+            reference = atom_site[col_name].as_array()
+            dtype = reference.dtype
+            query = struct_conn[struct_conn_col_name].as_array(dtype)
+            if np.issubdtype(reference.dtype, str):
+                # The mask value is not necessarily consistent
+                # between query and reference
+                # -> make it consistent
+                reference[reference == "?"] = "."
+                query[query == "?"] = "."
+            reference_arrays.append(reference)
+            query_arrays.append(query[covale_mask])
+        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
+        # in 'atom_site' and 'struct_conn'
+        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
+    atoms_indices_1 = atom_indices[0]
+    atoms_indices_2 = atom_indices[1]
+
+    # Some bonds in 'struct_conn' may not be found in 'atom_site'
+    # This is okay,
+    # as 'atom_site' might already be reduced to a single model
+    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
+    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
+    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
+
+    # Interpret missing values as ANY bonds
+    bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
+    # Consecutively apply the same masks as applied to the atom indices
+    # Logical combination does not work here,
+    # as the second mask was created based on already filtered data
+    bond_order = bond_order[covale_mask][mapping_exists_mask]
+    bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
+
+    return BondList(
+        atom_site.row_count,
+        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
+    )
+
+
+def _find_matches(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the indices in the
+    `reference_arrays` where all query values the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+    """
+    match_masks_for_all_columns = np.stack([
+        query[:, np.newaxis] == reference[np.newaxis, :]
+        for query, reference in zip(query_arrays, reference_arrays)
+    ], axis=-1)
+    match_masks = np.all(match_masks_for_all_columns, axis=-1)
+    query_matches, reference_matches = np.where(match_masks)
+
+    # Duplicate matches indicate that an atom from the query cannot
+    # be uniquely matched to an atom in the reference
+    unique_query_matches, counts = np.unique(query_matches, return_counts=True)
+    if np.any(counts > 1):
+        ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
+        raise InvalidFileError(
+            f"The covalent bond in the 'struct_conn' category at index "
+            f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
+            f"the 'atom_site' category"
         )
 
+    # -1 indicates that no match was found in the reference
+    match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
+    match_indices[query_matches] = reference_matches
+    return match_indices
 
-
-
-
+
+def _get_struct_conn_col_name(col_name, partner):
+    """
+    For a column name in ``atom_site`` get the corresponding column name
+    in ``struct_conn``.
+    """
+    if col_name == "label_alt_id":
+        return f"pdbx_ptnr{partner}_label_alt_id"
+    elif col_name.startswith("pdbx_"):
+        # Move 'pdbx_' to front
+        return f"pdbx_ptnr{partner}_{col_name[5:]}"
+    else:
+        return f"ptnr{partner}_{col_name}"
+
+
+def _filter_altloc(array, atom_site, altloc):
+    altloc_ids = atom_site.get("label_alt_id")
+    occupancy = atom_site.get("occupancy")
 
     # Filter altloc IDs and return
     if altloc_ids is None:
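The helpers added above match each `struct_conn` bond partner against `atom_site` by comparing several label columns at once. A self-contained sketch of the two ideas, re-implemented here only for illustration (the helper name `struct_conn_col_name` and the toy arrays are not part of the package):

```python
# Illustration only: column renaming and broadcast matching as used above.
import numpy as np

def struct_conn_col_name(col_name, partner):
    # Same renaming scheme as '_get_struct_conn_col_name' (partner is 1 or 2)
    if col_name == "label_alt_id":
        return f"pdbx_ptnr{partner}_label_alt_id"
    elif col_name.startswith("pdbx_"):
        return f"pdbx_ptnr{partner}_{col_name[5:]}"
    return f"ptnr{partner}_{col_name}"

print(struct_conn_col_name("label_asym_id", 1))      # ptnr1_label_asym_id
print(struct_conn_col_name("pdbx_PDB_ins_code", 2))  # pdbx_ptnr2_PDB_ins_code

# For each query row, find the reference row where *all* columns agree;
# -1 marks a bond partner that has no matching atom.
reference = [np.array(["A", "A", "B"]), np.array([1, 2, 1])]  # toy asym_id, seq_id
query     = [np.array(["B", "A"]),      np.array([1, 5])]
masks = np.stack(
    [q[:, np.newaxis] == r[np.newaxis, :] for q, r in zip(query, reference)],
    axis=-1
)
q_idx, r_idx = np.where(np.all(masks, axis=-1))
indices = np.full(len(query[0]), -1)
indices[q_idx] = r_idx
print(indices)  # [ 2 -1]: first partner is atom 2, second partner is unmatched
```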
@@ -425,14 +615,14 @@ def _filter_altloc(array, model_dict, altloc):
         return array[
             ...,
             filter_highest_occupancy_altloc(
-                array, altloc_ids, occupancy.
+                array, altloc_ids.as_array(str), occupancy.as_array(float)
             ),
         ]
     # 'first' is also fallback if file has no occupancy information
     elif altloc == "first":
-        return array[..., filter_first_altloc(array, altloc_ids)]
+        return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
     elif altloc == "all":
-        array.set_annotation("altloc_id", altloc_ids)
+        array.set_annotation("altloc_id", altloc_ids.as_array(str))
         return array
     else:
         raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
@@ -443,49 +633,46 @@ def _get_model_starts(model_array):
     Get the start index for each model in the arrays of the
     ``atom_site`` category.
     """
-
+    _, indices = np.unique(model_array, return_index=True)
     indices.sort()
     return indices
 
 
-def
+def _filter_model(atom_site, model_starts, model):
     """
-    Reduce the ``atom_site``
+    Reduce the ``atom_site`` category to the values for the given
     model.
     """
+    Category = type(atom_site)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
     # Append exclusive stop
     model_starts = np.append(
-        model_starts, [
+        model_starts, [atom_site.row_count]
     )
-    model_dict = {}
     # Indexing starts at 0, but model number starts at 1
     model_index = model - 1
-
-
-        model_starts[model_index] : model_starts[model_index + 1]
-    ]
-    return model_dict
+    index = slice(model_starts[model_index], model_starts[model_index + 1])
+    return _filter(atom_site, index)
 
 
-def _get_box(
-
-
-    else:
-        cell_dict = pdbx_file.get((data_block, "cell"))
-        if cell_dict is None:
+def _get_box(block):
+    cell = block.get("cell")
+    if cell is None:
         return None
     try:
         len_a, len_b, len_c = [
-            float(
+            float(cell[length].as_item())
             for length in ["length_a", "length_b", "length_c"]
         ]
+        alpha, beta, gamma = [
+            np.deg2rad(float(cell[angle].as_item()))
+            for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
+        ]
     except ValueError:
         # 'cell_dict' has no proper unit cell values, e.g. '?'
         return None
-    alpha, beta, gamma = [
-        np.deg2rad(float(cell_dict[angle]))
-        for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
-    ]
     return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
 
 
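`_get_box()` above converts the `cell` lengths and angles (degrees in the file) into box vectors via biotite's existing helper; a small worked example with made-up cell parameters:

```python
# Illustration only: unit cell -> 3x3 box vectors.
import numpy as np
from biotite.structure import vectors_from_unitcell

len_a, len_b, len_c = 25.0, 30.0, 40.0                 # Angstrom
alpha, beta, gamma = np.deg2rad([90.0, 90.0, 120.0])   # file stores degrees
box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
print(box.shape)  # (3, 3) - one row per box vector
```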
@@ -496,69 +683,90 @@ def set_structure(pdbx_file, array, data_block=None):
 
     This will save the coordinates, the mandatory annotation categories
     and the optional annotation categories
-    ``
+    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
     If the atom array (stack) contains the annotation ``'atom_id'``,
     these values will be used for atom numbering instead of continuous
     numbering.
+    Furthermore, inter-residue bonds will be written into the
+    ``struct_conn`` category.
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     array : AtomArray or AtomArrayStack
         The structure to be written. If a stack is given, each array in
         the stack will be in a separate model.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
+        If the file is empty, a new data will be created.
+
+    Notes
+    -----
+    In some cases, the written inter-residue bonds cannot be read again
+    due to ambiguity to which atoms the bond refers.
+    This is the case, when two equal residues in the same chain have
+    the same (or a masked) `res_id`.
 
     Examples
     --------
 
     >>> import os.path
-    >>> file =
-    >>> set_structure(file, atom_array
+    >>> file = CIFFile()
+    >>> set_structure(file, atom_array)
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
 
     """
+    block = _get_or_create_block(pdbx_file, data_block)
+    Category = block.subcomponent_class()
+    Column = Category.subcomponent_class()
+
     # Fill PDBx columns from information
     # in structures' attribute arrays as good as possible
-
-
-
-
-
-
-
+    atom_site = Category()
+    atom_site["group_PDB"] = np.where(
+        array.hetero, "HETATM", "ATOM"
+    )
+    atom_site["type_symbol"] = np.copy(array.element)
+    atom_site["label_atom_id"] = np.copy(array.atom_name)
+    atom_site["label_alt_id"] = Column(
+        # AtomArrays do not store altloc atoms
+        np.full(array.array_length(), "."),
+        np.full(array.array_length(), MaskValue.INAPPLICABLE),
+    )
+    atom_site["label_comp_id"] = np.copy(array.res_name)
+    atom_site["label_asym_id"] = np.copy(array.chain_id)
+    atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
+    atom_site["label_seq_id"] = np.copy(array.res_id)
+    atom_site["pdbx_PDB_ins_code"] = Column(
+        np.copy(array.ins_code),
+        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
     )
-
-
-
-
-    atom_site_dict["label_asym_id"] = np.copy(array.chain_id)
-    atom_site_dict["label_entity_id"] = _determine_entity_id(array.chain_id)
-    atom_site_dict["label_seq_id"] = np.array([str(e) for e in array.res_id])
-    atom_site_dict["pdbx_PDB_ins_code"] = array.ins_code
-    atom_site_dict["auth_seq_id"] = atom_site_dict["label_seq_id"]
-    atom_site_dict["auth_comp_id"] = atom_site_dict["label_comp_id"]
-    atom_site_dict["auth_asym_id"] = atom_site_dict["label_asym_id"]
-    atom_site_dict["auth_atom_id"] = atom_site_dict["label_atom_id"]
+    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
+    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
+    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
+    atom_site["auth_atom_id"] = atom_site["label_atom_id"]
 
+    annot_categories = array.get_annotation_categories()
     if "atom_id" in annot_categories:
-
+        atom_site["id"] = np.copy(array.atom_id)
     if "b_factor" in annot_categories:
-
-            [f"{b:.2f}" for b in array.b_factor]
-        )
+        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
     if "occupancy" in annot_categories:
-
-            [f"{occ:.2f}" for occ in array.occupancy]
-        )
+        atom_site["occupancy"] = np.copy(array.occupancy)
     if "charge" in annot_categories:
-
-            [f"{c:+d}" if c != 0 else "?" for c in array.charge]
+        atom_site["pdbx_formal_charge"] = Column(
+            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
+            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
         )
 
+    if array.bonds is not None:
+        block["struct_conn"] = _set_inter_residue_bonds(array, atom_site)
+
     # In case of a single model handle each coordinate
     # simply like a flattened array
     if type(array) == AtomArray or (
@@ -566,42 +774,34 @@ def set_structure(pdbx_file, array, data_block=None):
     ):
         # 'ravel' flattens coord without copy
         # in case of stack with stack_depth = 1
-
-
-        )
-
-
-        )
-        atom_site_dict["Cartn_z"] = np.array(
-            [f"{c:.3f}" for c in np.ravel(array.coord[..., 2])]
-        )
-        atom_site_dict["pdbx_PDB_model_num"] = np.full(
-            array.array_length(), "1"
+        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
+        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
+        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
+        atom_site["pdbx_PDB_model_num"] = np.ones(
+            array.array_length(), dtype=np.int32
         )
     # In case of multiple models repeat annotations
     # and use model specific coordinates
     elif type(array) == AtomArrayStack:
-
-            atom_site_dict[key] = np.tile(value, reps=array.stack_depth())
+        atom_site = _repeat(atom_site, array.stack_depth())
         coord = np.reshape(
             array.coord, (array.stack_depth() * array.array_length(), 3)
         )
-
-
-
-
-            np.arange(1, array.stack_depth() + 1
+        atom_site["Cartn_x"] = np.copy(coord[:, 0])
+        atom_site["Cartn_y"] = np.copy(coord[:, 1])
+        atom_site["Cartn_z"] = np.copy(coord[:, 2])
+        atom_site["pdbx_PDB_model_num"] = np.repeat(
+            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
             repeats=array.array_length(),
         )
-        atom_site_dict["pdbx_PDB_model_num"] = models
     else:
         raise ValueError("Structure must be AtomArray or AtomArrayStack")
     if not "atom_id" in annot_categories:
         # Count from 1
-
-            1, len(
-        )
-
+        atom_site["id"] = np.arange(
+            1, len(atom_site["group_PDB"]) + 1
+        )
+    block["atom_site"] = atom_site
 
     # Write box into file
     if array.box is not None:
@@ -612,14 +812,38 @@ def set_structure(pdbx_file, array, data_block=None):
         else:
             box = array.box
         len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
-
-
-
-
-
-
-
-
+        cell = Category()
+        cell["length_a"] = len_a
+        cell["length_b"] = len_b
+        cell["length_c"] = len_c
+        cell["angle_alpha"] = np.rad2deg(alpha)
+        cell["angle_beta"] = np.rad2deg(beta)
+        cell["angle_gamma"] = np.rad2deg(gamma)
+        block["cell"] = cell
+
+
+def _get_or_create_block(pdbx_component, block_name):
+    if isinstance(pdbx_component, PDBxFile):
+        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
+        pdbx_component = pdbx_component.cif_file
+
+    Block = pdbx_component.subcomponent_class()
+
+    if isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
+        if block_name is None:
+            if len(pdbx_component) > 0:
+                block_name = next(iter(pdbx_component.keys()))
+            else:
+                # File is empty -> invent a new block name
+                block_name = "structure"
+
+        if block_name not in pdbx_component:
+            block = Block()
+            pdbx_component[block_name] = block
+        return pdbx_component[block_name]
+    else:
+        # Already a block
+        return pdbx_component
 
 
 def _determine_entity_id(chain_id):
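A hedged usage sketch of `set_structure()` after this rewrite, writing the same structure once as text CIF and once as BinaryCIF; the component `TYR` and the output file names are placeholders chosen for the example:

```python
# Illustration only: writing with the new file classes.
from biotite.structure.info import residue
from biotite.structure.io.pdbx import CIFFile, BinaryCIFFile, set_structure

atom_array = residue("TYR")      # any AtomArray works

cif = CIFFile()
set_structure(cif, atom_array)   # creates the data block if the file is empty
cif.write("out.cif")

bcif = BinaryCIFFile()
set_structure(bcif, atom_array)  # same call, binary encoding instead
bcif.write("out.bcif")
```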
@@ -635,10 +859,81 @@ def _determine_entity_id(chain_id):
             id_translation[chain_id[i]] = id
             entity_id[i] = id_translation[chain_id[i]]
             id += 1
-    return entity_id
+    return entity_id
+
+
+def _repeat(category, repetitions):
+    Category = type(category)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
+    category_dict = {}
+    for key, column in category.items():
+        if isinstance(column, BinaryCIFColumn):
+            data_encoding = column.data.encoding
+            # Optimization: The repeated string array has the same
+            # unique values, as the original string array
+            # -> Use same unique values (faster due to shorter array)
+            if isinstance(data_encoding[0], StringArrayEncoding):
+                data_encoding[0].strings = np.unique(column.data.array)
+            data = Data(np.tile(column.data.array, repetitions), data_encoding)
+        else:
+            data = Data(np.tile(column.data.array, repetitions))
+        mask = Data(np.tile(column.mask.array, repetitions)) \
+               if column.mask is not None else None
+        category_dict[key] = Column(data, mask)
+    return Category(category_dict)
 
 
-def
+def _set_inter_residue_bonds(array, atom_site):
+    """
+    Create the ``struct_conn`` category containing the inter-residue
+    bonds.
+    The involved atoms are identified by annotations from the
+    ``atom_site`` category.
+    """
+    COLUMNS = [
+        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
+        "pdbx_PDB_ins_code"
+    ]
+
+    Category = type(atom_site)
+    Column = Category.subcomponent_class()
+
+    bond_array = array.bonds.as_array()
+    # To save computation time call 'get_residue_starts_for()' only once
+    # with indices of the first and second atom of each bond
+    residue_starts_1, residue_starts_2 = get_residue_starts_for(
+        array, bond_array[:, :2].flatten()
+    ).reshape(-1, 2).T
+    # Filter out all intra-residue bonds
+    bond_array = bond_array[residue_starts_1 != residue_starts_2]
+
+    struct_conn = Category()
+    struct_conn["id"] = np.arange(1, len(bond_array) + 1)
+    struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
+    struct_conn["pdbx_value_order"] = Column(
+        np.array(
+            [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]
+        ),
+        np.where(
+            bond_array[:, 2] == BondType.ANY,
+            MaskValue.MISSING, MaskValue.PRESENT,
+        )
+    )
+    # Write the identifying annotation...
+    for col_name in COLUMNS:
+        annot = atom_site[col_name].as_array()
+        # ...for each bond partner
+        for i in range(2):
+            atom_indices = bond_array[:, i]
+            struct_conn[_get_struct_conn_col_name(col_name, i+1)] \
+                = annot[atom_indices]
+    return struct_conn
+
+
+def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
+                  res_name=None):
     """
     Create an :class:`AtomArray` for a chemical component from the
     ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
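`_repeat()` above tiles every per-atom column once per model, while the model number column is built with `np.repeat`; a tiny NumPy sketch of the two layouts (toy values):

```python
# Illustration only: per-atom columns vs. the model number column.
import numpy as np

n_models, n_atoms = 3, 4
atom_names = np.array(["N", "CA", "C", "O"])

print(np.tile(atom_names, n_models))                   # N CA C O | N CA C O | N CA C O
print(np.repeat(np.arange(1, n_models + 1), n_atoms))  # 1 1 1 1 2 2 2 2 3 3 3 3
```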
@@ -646,26 +941,37 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
 
     Parameters
     ----------
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
+        The file object.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     use_ideal_coord : bool, optional
         If true, the *ideal* coordinates are read from the file
         (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
         originating from computations.
         If set to false, alternative coordinates are read
         (``model_Cartn_<dim>_`` fields).
-
+    res_name : str
+        In rare cases the categories may contain rows for multiple
+        components.
+        In this case, the component with the given residue name is
+        read.
+        By default, all rows would be read in this case.
+
     Returns
     -------
     array : AtomArray
         The parsed chemical component.
-
+
     Examples
     --------
 
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(
     ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
     ... )
     >>> comp = get_component(file)
@@ -695,26 +1001,31 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
     HET 0 TYR HH H -0.123 -0.399 -5.059
     HET 0 TYR HXT H -1.333 -0.030 4.784
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        atom_category = block["chem_comp_atom"]
+    except KeyError:
         raise InvalidFileError("Missing 'chem_comp_atom' category in file")
-
-
-
+    if res_name is not None:
+        atom_category = _filter(
+            atom_category, atom_category["comp_id"].as_array() == res_name
+        )
+        if len(atom_category) == 0:
+            raise KeyError(
+                f"No rows with residue name '{res_name}' found in "
+                f"'chem_comp_atom' category"
+            )
 
-    array = AtomArray(
+    array = AtomArray(atom_category.row_count)
 
     array.hetero[:] = True
-    array.res_name =
-    array.atom_name =
-    array.element =
+    array.res_name = atom_category["comp_id"].as_array("U5")
+    array.atom_name = atom_category["atom_id"].as_array("U6")
+    array.element = atom_category["type_symbol"].as_array("U2")
     array.add_annotation("charge", int)
-    array.charge =
-
-    )
-
+    array.charge = atom_category["charge"].as_array(int, 0)
+
     coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
     alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
     if not use_ideal_coord:
@@ -722,7 +1033,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
         coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
     try:
         for i, field in enumerate(coord_fields):
-            array.coord[:,i] =
+            array.coord[:,i] = atom_category[field].as_array(np.float32)
     except KeyError as err:
         key = err.args[0]
         warnings.warn(
@@ -731,9 +1042,15 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
             UserWarning
         )
         for i, field in enumerate(alt_coord_fields):
-            array.coord[:,i] =
-
-
+            array.coord[:,i] = atom_category[field].as_array(np.float32)
+
+    try:
+        bond_category = block["chem_comp_bond"]
+        if res_name is not None:
+            bond_category = _filter(
+                bond_category, bond_category["comp_id"].as_array() == res_name
+            )
+    except KeyError:
         warnings.warn(
             f"Category 'chem_comp_bond' not found. "
             f"No bonds will be parsed",
@@ -742,12 +1059,14 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
     else:
         bonds = BondList(array.array_length())
         for atom1, atom2, order, aromatic_flag in zip(
-
-
+            bond_category["atom_id_1"].as_array(str),
+            bond_category["atom_id_2"].as_array(str),
+            bond_category["value_order"].as_array(str),
+            bond_category["pdbx_aromatic_flag"].as_array(str)
         ):
             atom_i = np.where(array.atom_name == atom1)[0][0]
             atom_j = np.where(array.atom_name == atom2)[0][0]
-            bond_type =
+            bond_type = COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag]
             bonds.add_bond(atom_i, atom_j, bond_type)
         array.bonds = bonds
 
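A usage sketch of `get_component()` with the new `res_name` filter; `components.cif` is a placeholder for any CCD-style file that lists several components:

```python
# Illustration only: picking one component out of a multi-component file.
from biotite.structure.io.pdbx import CIFFile, get_component

ccd = CIFFile.read("components.cif")       # placeholder path
tyr = get_component(ccd, res_name="TYR")   # only rows with comp_id == "TYR"
print(tyr.array_length(), tyr.bonds.get_bond_count())
```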
@@ -766,15 +1085,22 @@ def set_component(pdbx_file, array, data_block=None):
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     array : AtomArray
         The chemical component to be written.
         Must contain only a single residue.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the file is empty, a new data will be created.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     """
+    block = _get_or_create_block(pdbx_file, data_block)
+    Category = block.subcomponent_class()
+
     if get_residue_count(array) > 1:
         raise BadStructureError(
             "The input atom array must comprise only one residue"
@@ -787,45 +1113,44 @@ def set_component(pdbx_file, array, data_block=None):
     else:
         charge = np.full(array.array_length(), "?", dtype="U2")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    atom_cat = Category()
+    atom_cat["comp_id"] = np.full(array.array_length(), res_name)
+    atom_cat["atom_id"] = np.copy(array.atom_name)
+    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
+    atom_cat["type_symbol"] = np.copy(array.element)
+    atom_cat["charge"] = charge
+    atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
+    atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
+    atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
+    atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
+    atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
+    atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
+    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
+    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
+    atom_cat["pdbx_ordinal"] = np.arange(
         1, array.array_length() + 1
     ).astype(str)
-
+    block["chem_comp_atom"] = atom_cat

     if array.bonds is not None:
         bond_array = array.bonds.as_array()
         order_flags = []
         aromatic_flags = []
         for bond_type in bond_array[:,2]:
-            order_flag, aromatic_flag =
+            order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
             order_flags.append(order_flag)
             aromatic_flags.append(aromatic_flag)

-
-
-
-
-
-
-
+        bond_cat = Category()
+        bond_cat["comp_id"] = np.full(len(bond_array), res_name)
+        bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
+        bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
+        bond_cat["value_order"] = np.array(order_flags)
+        bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
+        bond_cat["pdbx_ordinal"] = np.arange(
             1, len(bond_array) + 1
         ).astype(str)
-
-
+        block["chem_comp_bond"] = bond_cat

 def list_assemblies(pdbx_file, data_block=None):
     """
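
Note: with the category objects above, writing a component is symmetric to reading one. A
short usage sketch, assuming `biotite.structure.info.residue()` is available to obtain a
single-residue `AtomArray` with bonds and that `CIFFile.write()` serializes the file:

    import biotite.structure.info as info
    from biotite.structure.io.pdbx import CIFFile, set_component

    component = info.residue("ALA")      # single-residue AtomArray with a BondList
    cif = CIFFile()                      # empty file: a new data block is created
    set_component(cif, component, data_block="ALA")
    cif.write("ALA.cif")                 # writes chem_comp_atom and chem_comp_bond
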
@@ -838,23 +1163,25 @@ def list_assemblies(pdbx_file, data_block=None):

     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     data_block : str, optional
         The name of the data block.
-
+        Default is the first (and most times only) data block of the
         file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.

     Returns
     -------
     assemblies : dict of str -> str
         A dictionary that maps an assembly ID to a description of the
         corresponding assembly.
-
+
     Examples
     --------
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
     >>> assembly_ids = list_assemblies(file)
     >>> for key, val in assembly_ids.items():
     ...     print(f"'{key}' : '{val}'")
@@ -865,21 +1192,24 @@ def list_assemblies(pdbx_file, data_block=None):
     '5' : 'icosahedral asymmetric unit, std point frame'
     '6' : 'crystal asymmetric unit, crystal frame'
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        assembly_category = block["pdbx_struct_assembly"]
+    except KeyError:
         raise InvalidFileError("File has no 'pdbx_struct_assembly' category")
     return {
         id: details
         for id, details in zip(
-            assembly_category["id"],
+            assembly_category["id"].as_array(str),
+            assembly_category["details"].as_array(str)
         )
     }


 def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
-                 altloc="first", extra_fields=None, use_author_fields=True
+                 altloc="first", extra_fields=None, use_author_fields=True,
+                 include_bonds=False):
     """
     Build the given biological assembly.

@@ -890,7 +1220,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,

     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     assembly_id : str
         The assembly to build.
@@ -907,8 +1237,10 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         contains only one model.
     data_block : str, optional
         The name of the data block.
-
+        Default is the first (and most times only) data block of the
         file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     altloc : {'first', 'occupancy', 'all'}
         This parameter defines how *altloc* IDs are handled:
         - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -940,36 +1272,46 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         If `use_author_fields` is true, the annotation arrays will be
         read from the ``auth_xxx`` fields (if applicable),
         otherwise from the ``label_xxx`` fields.
+    include_bonds : bool, optional
+        If set to true, a :class:`BondList` will be created for the
+        resulting :class:`AtomArray` containing the bond information
+        from the file.
+        Bonds whose order could not be determined from the
+        *Chemical Component Dictionary*
+        (especially inter-residue bonds)
+        have :attr:`BondType.ANY`, since the PDB format itself does
+        not support bond orders.

     Returns
     -------
     assembly : AtomArray or AtomArrayStack
         The assembly. The return type depends on the `model` parameter.
-
+
     Examples
     --------

     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
     >>> assembly = get_assembly(file, model=1)
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        assembly_gen_category = block["pdbx_struct_assembly_gen"]
+    except KeyError:
         raise InvalidFileError(
             "File has no 'pdbx_struct_assembly_gen' category"
         )

-
-        "pdbx_struct_oper_list"
-
-    if struct_oper_category is None:
+    try:
+        struct_oper_category = block["pdbx_struct_oper_list"]
+    except KeyError:
         raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

+    assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
     if assembly_id is None:
-        assembly_id =
-    elif assembly_id not in
+        assembly_id = assembly_ids[0]
+    elif assembly_id not in assembly_ids:
         raise KeyError(f"File has no Assembly ID '{assembly_id}'")

     ### Calculate all possible transformations
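
Note: the new `include_bonds` flag is simply forwarded to `get_structure()` (see the hunk
adding new line 1337 below), so assemblies can now carry a `BondList`. A usage sketch based
on the docstring example above; the `1f2n.cif` path is the same placeholder used there:

    from biotite.structure.io.pdbx import CIFFile, get_assembly, list_assemblies

    file = CIFFile.read("1f2n.cif")          # placeholder path
    print(list_assemblies(file))             # maps assembly IDs to descriptions
    assembly = get_assembly(file, model=1, include_bonds=True)
    print(assembly.bonds is not None)        # True: bonds were parsed
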
@@ -982,6 +1324,8 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
     if "label_asym_id" in extra_fields:
         extra_fields_and_asym = extra_fields
     else:
+        # The operations apply on asym IDs
+        # -> they need to be included to select the correct atoms
         extra_fields_and_asym = extra_fields + ["label_asym_id"]
     structure = get_structure(
         pdbx_file,
@@ -990,14 +1334,15 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         altloc,
         extra_fields_and_asym,
         use_author_fields,
+        include_bonds
     )

     ### Get transformations and apply them to the affected asym IDs
     assembly = None
     for id, op_expr, asym_id_expr in zip(
-        assembly_gen_category["assembly_id"],
-        assembly_gen_category["oper_expression"],
-        assembly_gen_category["asym_id_list"],
+        assembly_gen_category["assembly_id"].as_array(str),
+        assembly_gen_category["oper_expression"].as_array(str),
+        assembly_gen_category["asym_id_list"].as_array(str),
     ):
         # Find the operation expressions for given assembly ID
         # We already asserted that the ID is actually present
@@ -1017,12 +1362,12 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
             assembly = sub_assembly
         else:
             assembly += sub_assembly
-
+
     # Remove 'label_asym_id', if it was not included in the original
     # user-supplied 'extra_fields'
     if "label_asym_id" not in extra_fields:
         assembly.del_annotation("label_asym_id")
-
+
     return assembly


@@ -1056,19 +1401,20 @@ def _get_transformations(struct_oper):
     translation for each operation ID in ``pdbx_struct_oper_list``.
     """
     transformation_dict = {}
-    for index, id in enumerate(struct_oper["id"]):
+    for index, id in enumerate(struct_oper["id"].as_array(str)):
         rotation_matrix = np.array(
             [
                 [
-
+                    struct_oper[f"matrix[{i}][{j}]"].as_array(float)[index]
                     for j in (1, 2, 3)
                 ]
                 for i in (1, 2, 3)
             ]
         )
-        translation_vector = np.array(
-
-
+        translation_vector = np.array([
+            struct_oper[f"vector[{i}]"].as_array(float)[index]
+            for i in (1, 2, 3)
+        ])
         transformation_dict[id] = (rotation_matrix, translation_vector)
     return transformation_dict

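
Note: `_get_transformations()` returns one `(rotation_matrix, translation_vector)` pair per
operation ID. Applying such a pair to an `(n, 3)` coordinate array is the usual affine step;
a self-contained sketch (not biotite's internal helper, which lies outside these hunks):

    import numpy as np

    def apply_transformation(coord, rotation, translation):
        """Rotate row-vector coordinates, then translate them."""
        return coord @ rotation.T + translation

    coord = np.array([[1.0, 2.0, 3.0]])
    identity = (np.eye(3), np.zeros(3))            # a no-op operation
    print(apply_transformation(coord, *identity))  # [[1. 2. 3.]]
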
@@ -1112,6 +1458,8 @@ def _convert_string_to_sequence(string, stype):
     ``_proteinseq_type_list`` or to ``NucleotideSequence`` if `stype` is
     contained in ``_nucleotideseq_type_list``.
     """
+    # sequence may be stored as multiline string
+    string = string.replace("\n", "")
     if stype in _proteinseq_type_list:
         return ProteinSequence(string)
     elif stype in _nucleotideseq_type_list: