biotite 0.39.0__cp310-cp310-macosx_11_0_arm64.whl → 0.41.0__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +3 -3
- biotite/application/dssp/app.py +18 -18
- biotite/database/pubchem/download.py +23 -23
- biotite/database/pubchem/query.py +7 -7
- biotite/database/rcsb/download.py +19 -14
- biotite/file.py +17 -9
- biotite/sequence/align/banded.c +258 -237
- biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
- biotite/sequence/align/cigar.py +60 -15
- biotite/sequence/align/kmeralphabet.c +243 -222
- biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.c +215 -196
- biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpp +233 -205
- biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
- biotite/sequence/align/localgapped.c +258 -237
- biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/localungapped.c +235 -214
- biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
- biotite/sequence/align/multiple.c +255 -234
- biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
- biotite/sequence/align/pairwise.c +274 -253
- biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
- biotite/sequence/align/permutation.c +215 -196
- biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
- biotite/sequence/align/selector.c +217 -197
- biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
- biotite/sequence/align/tracetable.c +215 -195
- biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
- biotite/sequence/annotation.py +2 -2
- biotite/sequence/codec.c +235 -214
- biotite/sequence/codec.cpython-310-darwin.so +0 -0
- biotite/sequence/io/fasta/convert.py +27 -24
- biotite/sequence/phylo/nj.c +215 -196
- biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/tree.c +227 -202
- biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
- biotite/sequence/phylo/upgma.c +215 -196
- biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
- biotite/structure/__init__.py +2 -0
- biotite/structure/basepairs.py +7 -12
- biotite/structure/bonds.c +1437 -1279
- biotite/structure/bonds.cpython-310-darwin.so +0 -0
- biotite/structure/celllist.c +217 -197
- biotite/structure/celllist.cpython-310-darwin.so +0 -0
- biotite/structure/charges.c +1052 -1101
- biotite/structure/charges.cpython-310-darwin.so +0 -0
- biotite/structure/dotbracket.py +2 -0
- biotite/structure/filter.py +30 -37
- biotite/structure/info/__init__.py +5 -8
- biotite/structure/info/atoms.py +31 -68
- biotite/structure/info/bonds.py +47 -101
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1663 -0
- biotite/structure/info/ccd/carbohydrates.txt +1135 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +798 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +21 -20
- biotite/structure/info/misc.py +78 -25
- biotite/structure/info/standardize.py +17 -12
- biotite/structure/integrity.py +19 -70
- biotite/structure/io/__init__.py +2 -4
- biotite/structure/io/ctab.py +12 -106
- biotite/structure/io/general.py +167 -181
- biotite/structure/io/gro/file.py +16 -16
- biotite/structure/io/mmtf/__init__.py +3 -0
- biotite/structure/io/mmtf/convertarray.c +219 -198
- biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.c +217 -197
- biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.c +225 -204
- biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.c +215 -196
- biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
- biotite/structure/io/mmtf/file.py +34 -26
- biotite/structure/io/mol/__init__.py +4 -2
- biotite/structure/io/mol/convert.py +71 -7
- biotite/structure/io/mol/ctab.py +414 -0
- biotite/structure/io/mol/header.py +116 -0
- biotite/structure/io/mol/{file.py → mol.py} +69 -82
- biotite/structure/io/mol/sdf.py +909 -0
- biotite/structure/io/npz/__init__.py +3 -0
- biotite/structure/io/npz/file.py +21 -18
- biotite/structure/io/pdb/__init__.py +3 -3
- biotite/structure/io/pdb/file.py +89 -34
- biotite/structure/io/pdb/hybrid36.c +63 -43
- biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +12 -6
- biotite/structure/io/pdbx/bcif.py +648 -0
- biotite/structure/io/pdbx/cif.py +1032 -0
- biotite/structure/io/pdbx/component.py +246 -0
- biotite/structure/io/pdbx/convert.py +858 -386
- biotite/structure/io/pdbx/encoding.c +112813 -0
- biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/molecules.py +151 -151
- biotite/structure/repair.py +253 -0
- biotite/structure/sasa.c +215 -196
- biotite/structure/sasa.cpython-310-darwin.so +0 -0
- biotite/structure/sequence.py +112 -0
- biotite/structure/superimpose.py +618 -116
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/METADATA +3 -3
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/RECORD +109 -103
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/WHEEL +1 -1
- biotite/structure/info/amino_acids.json +0 -1556
- biotite/structure/info/amino_acids.py +0 -42
- biotite/structure/info/carbohydrates.json +0 -1122
- biotite/structure/info/carbohydrates.py +0 -39
- biotite/structure/info/intra_bonds.msgpack +0 -0
- biotite/structure/info/link_types.msgpack +0 -1
- biotite/structure/info/nucleotides.json +0 -772
- biotite/structure/info/nucleotides.py +0 -39
- biotite/structure/info/residue_masses.msgpack +0 -0
- biotite/structure/info/residue_names.msgpack +0 -3
- biotite/structure/info/residues.msgpack +0 -0
- biotite/structure/io/pdbx/file.py +0 -652
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/LICENSE.rst +0 -0
- {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/top_level.txt +0 -0
|
@@ -17,21 +17,50 @@ __all__ = [
|
|
|
17
17
|
|
|
18
18
|
import itertools
|
|
19
19
|
import warnings
|
|
20
|
-
from collections import OrderedDict
|
|
21
20
|
import numpy as np
|
|
22
21
|
from ....file import InvalidFileError
|
|
23
22
|
from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
|
|
24
23
|
from ...atoms import AtomArray, AtomArrayStack, repeat
|
|
25
|
-
from ...bonds import BondList, BondType
|
|
24
|
+
from ...bonds import BondList, BondType, connect_via_residue_names
|
|
26
25
|
from ...box import unitcell_from_vectors, vectors_from_unitcell
|
|
27
26
|
from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
|
|
28
|
-
from ...residues import get_residue_count
|
|
27
|
+
from ...residues import get_residue_count, get_residue_starts_for
|
|
29
28
|
from ...error import BadStructureError
|
|
30
29
|
from ...util import matrix_rotate
|
|
30
|
+
from .legacy import PDBxFile
|
|
31
|
+
from .component import MaskValue
|
|
32
|
+
from .cif import CIFFile, CIFBlock
|
|
33
|
+
from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
|
|
34
|
+
from .encoding import StringArrayEncoding
|
|
31
35
|
|
|
32
36
|
|
|
33
|
-
#
|
|
34
|
-
|
|
37
|
+
# Connection types in `struct_conn` category that refer to covalent bonds
|
|
38
|
+
PDBX_COVALENT_TYPES = [
|
|
39
|
+
"covale", "covale_base", "covale_phosphate", "covale_sugar",
|
|
40
|
+
"disulf", "modres", "modres_link", "metalc"
|
|
41
|
+
]
|
|
42
|
+
# Map 'struct_conn' bond orders to 'BondType'...
|
|
43
|
+
PDBX_BOND_ORDER_TO_TYPE = {
|
|
44
|
+
"": BondType.ANY,
|
|
45
|
+
"sing": BondType.SINGLE,
|
|
46
|
+
"doub": BondType.DOUBLE,
|
|
47
|
+
"trip": BondType.TRIPLE,
|
|
48
|
+
"quad": BondType.QUADRUPLE,
|
|
49
|
+
}
|
|
50
|
+
# ...and vice versa
|
|
51
|
+
PDBX_BOND_TYPE_TO_ORDER = {
|
|
52
|
+
# 'ANY' is masked later, it is merely added here to avoid a KeyError
|
|
53
|
+
BondType.ANY: "",
|
|
54
|
+
BondType.SINGLE: "sing",
|
|
55
|
+
BondType.DOUBLE: "doub",
|
|
56
|
+
BondType.TRIPLE: "trip",
|
|
57
|
+
BondType.QUADRUPLE: "quad",
|
|
58
|
+
BondType.AROMATIC_SINGLE: "sing",
|
|
59
|
+
BondType.AROMATIC_DOUBLE: "doub",
|
|
60
|
+
BondType.AROMATIC_TRIPLE: "trip",
|
|
61
|
+
}
|
|
62
|
+
# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
|
|
63
|
+
COMP_BOND_ORDER_TO_TYPE = {
|
|
35
64
|
("SING", "N") : BondType.SINGLE,
|
|
36
65
|
("DOUB", "N") : BondType.DOUBLE,
|
|
37
66
|
("TRIP", "N") : BondType.TRIPLE,
|
|
@@ -41,11 +70,10 @@ BOND_ORDER_TO_BOND_TYPE = {
|
|
|
41
70
|
("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
|
|
42
71
|
}
|
|
43
72
|
# ...and vice versa
|
|
44
|
-
|
|
45
|
-
bond_type: order for order, bond_type in
|
|
73
|
+
COMP_BOND_TYPE_TO_ORDER = {
|
|
74
|
+
bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
|
|
46
75
|
}
|
|
47
76
|
|
|
48
|
-
|
|
49
77
|
_proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
|
|
50
78
|
_nucleotideseq_type_list = [
|
|
51
79
|
"polydeoxyribonucleotide",
|
|
@@ -61,6 +89,27 @@ _other_type_list = [
|
|
|
61
89
|
]
|
|
62
90
|
|
|
63
91
|
|
|
92
|
+
def _filter(category, index):
|
|
93
|
+
"""
|
|
94
|
+
Reduce the ``atom_site`` category to the values for the given
|
|
95
|
+
model.
|
|
96
|
+
"""
|
|
97
|
+
Category = type(category)
|
|
98
|
+
Column = Category.subcomponent_class()
|
|
99
|
+
Data = Column.subcomponent_class()
|
|
100
|
+
|
|
101
|
+
return Category({
|
|
102
|
+
key: Column(
|
|
103
|
+
Data(column.data.array[index]),
|
|
104
|
+
(
|
|
105
|
+
Data(column.mask.array[index])
|
|
106
|
+
if column.mask is not None else None
|
|
107
|
+
)
|
|
108
|
+
)
|
|
109
|
+
for key, column in category.items()
|
|
110
|
+
})
|
|
111
|
+
|
|
112
|
+
|
|
64
113
|
def get_sequence(pdbx_file, data_block=None):
|
|
65
114
|
"""
|
|
66
115
|
Get the protein and nucleotide sequences from the
|
|
@@ -74,11 +123,14 @@ def get_sequence(pdbx_file, data_block=None):
|
|
|
74
123
|
|
|
75
124
|
Parameters
|
|
76
125
|
----------
|
|
77
|
-
pdbx_file :
|
|
126
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
78
127
|
The file object.
|
|
79
|
-
data_block :
|
|
80
|
-
The name of the data block.
|
|
81
|
-
(and most times only) data block of the
|
|
128
|
+
data_block : str, optional
|
|
129
|
+
The name of the data block.
|
|
130
|
+
Default is the first (and most times only) data block of the
|
|
131
|
+
file.
|
|
132
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
133
|
+
this parameter is ignored.
|
|
82
134
|
|
|
83
135
|
Returns
|
|
84
136
|
-------
|
|
@@ -86,50 +138,55 @@ def get_sequence(pdbx_file, data_block=None):
|
|
|
86
138
|
The protein and nucleotide sequences for each entity
|
|
87
139
|
(equivalent to chains in most cases).
|
|
88
140
|
"""
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
141
|
+
block = _get_block(pdbx_file, data_block)
|
|
142
|
+
|
|
143
|
+
poly_category= block["entity_poly"]
|
|
144
|
+
seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
|
|
145
|
+
seq_type = poly_category["type"].as_array(str)
|
|
92
146
|
sequences = []
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
sequences.append(sequence)
|
|
98
|
-
else:
|
|
99
|
-
sequences.append(_convert_string_to_sequence(seq_string, seq_type))
|
|
147
|
+
for string, stype in zip(seq_string, seq_type):
|
|
148
|
+
sequence = _convert_string_to_sequence(string, stype)
|
|
149
|
+
if sequence is not None:
|
|
150
|
+
sequences.append(sequence)
|
|
100
151
|
return sequences
|
|
101
152
|
|
|
102
153
|
|
|
103
|
-
def get_model_count(
|
|
154
|
+
def get_model_count(pdbx_file, data_block=None):
|
|
104
155
|
"""
|
|
105
156
|
Get the number of models contained in a :class:`PDBxFile`.
|
|
106
157
|
|
|
107
158
|
Parameters
|
|
108
159
|
----------
|
|
109
|
-
|
|
160
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
110
161
|
The file object.
|
|
111
162
|
data_block : str, optional
|
|
112
|
-
The name of the data block.
|
|
113
|
-
(and most times only) data block of the
|
|
163
|
+
The name of the data block.
|
|
164
|
+
Default is the first (and most times only) data block of the
|
|
165
|
+
file.
|
|
166
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
167
|
+
this parameter is ignored.
|
|
114
168
|
|
|
115
169
|
Returns
|
|
116
170
|
-------
|
|
117
171
|
model_count : int
|
|
118
172
|
The number of models.
|
|
119
173
|
"""
|
|
120
|
-
|
|
121
|
-
return len(_get_model_starts(
|
|
174
|
+
block = _get_block(pdbx_file, data_block)
|
|
175
|
+
return len(_get_model_starts(
|
|
176
|
+
block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
|
|
177
|
+
))
|
|
122
178
|
|
|
123
179
|
|
|
124
180
|
def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
125
|
-
extra_fields=None, use_author_fields=True
|
|
181
|
+
extra_fields=None, use_author_fields=True,
|
|
182
|
+
include_bonds=False):
|
|
126
183
|
"""
|
|
127
184
|
Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
|
|
128
185
|
``atom_site`` category in a :class:`PDBxFile`.
|
|
129
186
|
|
|
130
187
|
Parameters
|
|
131
188
|
----------
|
|
132
|
-
pdbx_file :
|
|
189
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
133
190
|
The file object.
|
|
134
191
|
model : int, optional
|
|
135
192
|
If this parameter is given, the function will return an
|
|
@@ -141,8 +198,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
|
141
198
|
containing all models will be returned, even if the structure
|
|
142
199
|
contains only one model.
|
|
143
200
|
data_block : str, optional
|
|
144
|
-
The name of the data block.
|
|
145
|
-
(and most times only) data block of the
|
|
201
|
+
The name of the data block.
|
|
202
|
+
Default is the first (and most times only) data block of the
|
|
203
|
+
file.
|
|
204
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
205
|
+
this parameter is ignored.
|
|
146
206
|
altloc : {'first', 'occupancy', 'all'}
|
|
147
207
|
This parameter defines how *altloc* IDs are handled:
|
|
148
208
|
- ``'first'`` - Use atoms that have the first *altloc* ID
|
|
@@ -176,6 +236,15 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
|
176
236
|
otherwise from the ``label_xxx`` fields.
|
|
177
237
|
If the requested field is not available, the respective other
|
|
178
238
|
field is taken as fallback.
|
|
239
|
+
include_bonds : bool, optional
|
|
240
|
+
If set to true, a :class:`BondList` will be created for the
|
|
241
|
+
resulting :class:`AtomArray` containing the bond information
|
|
242
|
+
from the file.
|
|
243
|
+
Inter-residue bonds will be read from the ``struct_conn``
|
|
244
|
+
category.
|
|
245
|
+
Intra-residue bonds will be read from the ``chem_comp_bond`` category, if
|
|
246
|
+
available, otherwise they will be derived from the Chemical
|
|
247
|
+
Component Dictionary.
|
|
179
248
|
|
|
180
249
|
Returns
|
|
181
250
|
-------
|
|
@@ -186,31 +255,31 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
|
186
255
|
--------
|
|
187
256
|
|
|
188
257
|
>>> import os.path
|
|
189
|
-
>>> file =
|
|
258
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
|
|
190
259
|
>>> arr = get_structure(file, model=1)
|
|
191
260
|
>>> print(len(arr))
|
|
192
261
|
304
|
|
193
262
|
|
|
194
263
|
"""
|
|
195
|
-
|
|
264
|
+
block = _get_block(pdbx_file, data_block)
|
|
265
|
+
|
|
266
|
+
extra_fields = set() if extra_fields is None else set(extra_fields)
|
|
196
267
|
|
|
197
|
-
|
|
198
|
-
if
|
|
268
|
+
atom_site = block.get("atom_site")
|
|
269
|
+
if atom_site is None:
|
|
199
270
|
raise InvalidFileError("Missing 'atom_site' category in file")
|
|
200
|
-
|
|
201
|
-
models =
|
|
271
|
+
|
|
272
|
+
models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
|
|
202
273
|
model_starts = _get_model_starts(models)
|
|
203
274
|
model_count = len(model_starts)
|
|
204
275
|
atom_count = len(models)
|
|
205
276
|
|
|
206
277
|
if model is None:
|
|
207
278
|
# For a stack, the annotations are derived from the first model
|
|
208
|
-
|
|
279
|
+
model_atom_site = _filter_model(atom_site, model_starts, 1)
|
|
209
280
|
# Any field of the category would work here to get the length
|
|
210
|
-
model_length =
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
_fill_annotations(stack, model_dict, extra_fields, use_author_fields)
|
|
281
|
+
model_length = model_atom_site.row_count
|
|
282
|
+
atoms = AtomArrayStack(model_count, model_length)
|
|
214
283
|
|
|
215
284
|
# Check if each model has the same amount of atoms
|
|
216
285
|
# If not, raise exception
|
|
@@ -221,27 +290,17 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
|
221
290
|
"instead"
|
|
222
291
|
)
|
|
223
292
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
stack.coord[:, :, 1] = atom_site_dict["Cartn_y"].reshape(
|
|
231
|
-
(model_count, model_length)
|
|
232
|
-
)
|
|
233
|
-
stack.coord[:, :, 2] = atom_site_dict["Cartn_z"].reshape(
|
|
234
|
-
(model_count, model_length)
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
stack = _filter_altloc(stack, model_dict, altloc)
|
|
293
|
+
atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
|
|
294
|
+
.reshape((model_count, model_length))
|
|
295
|
+
atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
|
|
296
|
+
.reshape((model_count, model_length))
|
|
297
|
+
atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
|
|
298
|
+
.reshape((model_count, model_length))
|
|
238
299
|
|
|
239
|
-
box = _get_box(
|
|
300
|
+
box = _get_box(block)
|
|
240
301
|
if box is not None:
|
|
241
302
|
# Duplicate same box for each model
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
return stack
|
|
303
|
+
atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
|
|
245
304
|
|
|
246
305
|
else:
|
|
247
306
|
if model == 0:
|
|
@@ -254,47 +313,94 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
|
|
|
254
313
|
f"the given model {model} does not exist"
|
|
255
314
|
)
|
|
256
315
|
|
|
257
|
-
|
|
316
|
+
model_atom_site = _filter_model(atom_site, model_starts, model)
|
|
258
317
|
# Any field of the category would work here to get the length
|
|
259
|
-
model_length =
|
|
260
|
-
|
|
318
|
+
model_length = model_atom_site.row_count
|
|
319
|
+
atoms = AtomArray(model_length)
|
|
261
320
|
|
|
262
|
-
|
|
321
|
+
atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
|
|
322
|
+
atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
|
|
323
|
+
atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
|
|
263
324
|
|
|
264
|
-
|
|
265
|
-
model_starts = np.append(
|
|
266
|
-
model_starts, [len(atom_site_dict["group_PDB"])]
|
|
267
|
-
)
|
|
268
|
-
# Indexing starts at 0, but model number starts at 1
|
|
269
|
-
model_index = model - 1
|
|
270
|
-
start, stop = model_starts[model_index], model_starts[model_index + 1]
|
|
271
|
-
array.coord = np.zeros((model_length, 3), dtype=np.float32)
|
|
272
|
-
array.coord[:, 0] = atom_site_dict["Cartn_x"][start:stop].astype(
|
|
273
|
-
np.float32
|
|
274
|
-
)
|
|
275
|
-
array.coord[:, 1] = atom_site_dict["Cartn_y"][start:stop].astype(
|
|
276
|
-
np.float32
|
|
277
|
-
)
|
|
278
|
-
array.coord[:, 2] = atom_site_dict["Cartn_z"][start:stop].astype(
|
|
279
|
-
np.float32
|
|
280
|
-
)
|
|
325
|
+
atoms.box = _get_box(block)
|
|
281
326
|
|
|
282
|
-
|
|
327
|
+
# The below part is the same for both, AtomArray and AtomArrayStack
|
|
328
|
+
_fill_annotations(
|
|
329
|
+
atoms, model_atom_site, extra_fields, use_author_fields
|
|
330
|
+
)
|
|
331
|
+
if include_bonds:
|
|
332
|
+
if "chem_comp_bond" in block:
|
|
333
|
+
try:
|
|
334
|
+
custom_bond_dict = _parse_intra_residue_bonds(
|
|
335
|
+
block["chem_comp_bond"]
|
|
336
|
+
)
|
|
337
|
+
except KeyError:
|
|
338
|
+
warnings.warn(
|
|
339
|
+
"The 'chem_comp_bond' category has missing columns, "
|
|
340
|
+
"falling back to using Chemical Component Dictionary",
|
|
341
|
+
UserWarning
|
|
342
|
+
)
|
|
343
|
+
custom_bond_dict = None
|
|
344
|
+
bonds = connect_via_residue_names(
|
|
345
|
+
atoms, custom_bond_dict=custom_bond_dict
|
|
346
|
+
)
|
|
347
|
+
else:
|
|
348
|
+
bonds = connect_via_residue_names(atoms)
|
|
349
|
+
if "struct_conn" in block:
|
|
350
|
+
bonds = bonds.merge(_parse_inter_residue_bonds(
|
|
351
|
+
model_atom_site, block["struct_conn"]
|
|
352
|
+
))
|
|
353
|
+
atoms.bonds = bonds
|
|
354
|
+
atoms = _filter_altloc(atoms, model_atom_site, altloc)
|
|
355
|
+
|
|
356
|
+
return atoms
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _get_block(pdbx_component, block_name):
|
|
360
|
+
if isinstance(pdbx_component, PDBxFile):
|
|
361
|
+
# The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
|
|
362
|
+
pdbx_component = pdbx_component.cif_file
|
|
363
|
+
|
|
364
|
+
if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
|
|
365
|
+
# Determine block
|
|
366
|
+
if block_name is None:
|
|
367
|
+
return pdbx_component.block
|
|
368
|
+
else:
|
|
369
|
+
return pdbx_component[block_name]
|
|
370
|
+
else:
|
|
371
|
+
return pdbx_component
|
|
283
372
|
|
|
284
|
-
array.box = _get_box(pdbx_file, data_block)
|
|
285
373
|
|
|
286
|
-
|
|
374
|
+
def _get_or_fallback(category, key, fallback_key):
|
|
375
|
+
"""
|
|
376
|
+
Return column related to key in category if it exists,
|
|
377
|
+
otherwise try to get the column related to fallback key.
|
|
378
|
+
"""
|
|
379
|
+
if key not in category:
|
|
380
|
+
warnings.warn(
|
|
381
|
+
f"Attribute '{key}' not found within 'atom_site' category. "
|
|
382
|
+
f"The fallback attribute '{fallback_key}' will be used instead",
|
|
383
|
+
UserWarning
|
|
384
|
+
)
|
|
385
|
+
try:
|
|
386
|
+
return category[fallback_key]
|
|
387
|
+
except KeyError as key_exc:
|
|
388
|
+
raise InvalidFileError(
|
|
389
|
+
f"Fallback attribute '{fallback_key}' not found within "
|
|
390
|
+
"'atom_site' category"
|
|
391
|
+
) from key_exc
|
|
392
|
+
return category[key]
|
|
287
393
|
|
|
288
394
|
|
|
289
|
-
def _fill_annotations(array,
|
|
395
|
+
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
|
|
290
396
|
"""Fill atom_site annotations in atom array or atom array stack.
|
|
291
397
|
|
|
292
398
|
Parameters
|
|
293
399
|
----------
|
|
294
400
|
array : AtomArray or AtomArrayStack
|
|
295
401
|
Atom array or stack which will be annotated.
|
|
296
|
-
|
|
297
|
-
``atom_site``
|
|
402
|
+
atom_site : CIFCategory or BinaryCIFCategory
|
|
403
|
+
``atom_site`` category with values for one model.
|
|
298
404
|
extra_fields : list of str
|
|
299
405
|
Entry names, that are additionally added as annotation arrays.
|
|
300
406
|
use_author_fields : bool
|
|
@@ -302,121 +408,226 @@ def _fill_annotations(array, model_dict, extra_fields, use_author_fields):
|
|
|
302
408
|
instead of ``label_``.
|
|
303
409
|
"""
|
|
304
410
|
|
|
305
|
-
def get_or_fallback_from_dict(input_dict, key, fallback_key,
|
|
306
|
-
dict_name="input"):
|
|
307
|
-
"""
|
|
308
|
-
Return value related to key in input dict if it exists,
|
|
309
|
-
otherwise try to get the value related to fallback key."""
|
|
310
|
-
if key not in input_dict:
|
|
311
|
-
warnings.warn(
|
|
312
|
-
f"Attribute '{key}' not found within '{dict_name}' category. "
|
|
313
|
-
f"The fallback attribute '{fallback_key}' will be used instead",
|
|
314
|
-
UserWarning
|
|
315
|
-
)
|
|
316
|
-
try:
|
|
317
|
-
return input_dict[fallback_key]
|
|
318
|
-
except KeyError as key_exc:
|
|
319
|
-
raise InvalidFileError(
|
|
320
|
-
f"Fallback attribute '{fallback_key}' not found in "
|
|
321
|
-
"'{dict_name}' category"
|
|
322
|
-
) from key_exc
|
|
323
|
-
return input_dict[key]
|
|
324
|
-
|
|
325
|
-
def get_annotation_from_model(
|
|
326
|
-
model_dict,
|
|
327
|
-
annotation_name,
|
|
328
|
-
annotation_fallback=None,
|
|
329
|
-
as_type=None,
|
|
330
|
-
formatter=None,
|
|
331
|
-
):
|
|
332
|
-
"""Get and format annotation array from model dictionary."""
|
|
333
|
-
array = (
|
|
334
|
-
get_or_fallback_from_dict(
|
|
335
|
-
model_dict, annotation_name, annotation_fallback,
|
|
336
|
-
dict_name="atom_site"
|
|
337
|
-
)
|
|
338
|
-
if annotation_fallback is not None
|
|
339
|
-
else model_dict[annotation_name]
|
|
340
|
-
)
|
|
341
|
-
if as_type is not None:
|
|
342
|
-
array = array.astype(as_type)
|
|
343
|
-
return formatter(array) if formatter is not None else array
|
|
344
|
-
|
|
345
411
|
prefix, alt_prefix = (
|
|
346
412
|
("auth", "label") if use_author_fields else ("label", "auth")
|
|
347
413
|
)
|
|
348
414
|
|
|
349
|
-
|
|
350
|
-
"chain_id": (f"{prefix}_asym_id", f"{alt_prefix}_asym_id", "U4", None),
|
|
351
|
-
"res_id": (
|
|
352
|
-
f"{prefix}_seq_id",
|
|
353
|
-
f"{alt_prefix}_seq_id",
|
|
354
|
-
None,
|
|
355
|
-
lambda annot: np.array(
|
|
356
|
-
[-1 if elt in [".", "?"] else int(elt) for elt in annot]
|
|
357
|
-
),
|
|
358
|
-
),
|
|
359
|
-
"ins_code": (
|
|
360
|
-
"pdbx_PDB_ins_code",
|
|
361
|
-
None,
|
|
362
|
-
"U1",
|
|
363
|
-
lambda annot: np.array(
|
|
364
|
-
["" if elt in [".", "?"] else elt for elt in annot]
|
|
365
|
-
),
|
|
366
|
-
),
|
|
367
|
-
"res_name": (f"{prefix}_comp_id", f"{alt_prefix}_comp_id", "U5", None),
|
|
368
|
-
"hetero": ("group_PDB", None, None, lambda annot: annot == "HETATM"),
|
|
369
|
-
"atom_name": (
|
|
370
|
-
f"{prefix}_atom_id",
|
|
371
|
-
f"{alt_prefix}_atom_id",
|
|
372
|
-
"U6",
|
|
373
|
-
None,
|
|
374
|
-
),
|
|
375
|
-
"element": ("type_symbol", None, "U2", None),
|
|
376
|
-
"atom_id": ("id", None, int, None),
|
|
377
|
-
"b_factor": ("B_iso_or_equiv", None, float, None),
|
|
378
|
-
"occupancy": ("occupancy", None, float, None),
|
|
379
|
-
"charge": (
|
|
380
|
-
"pdbx_formal_charge",
|
|
381
|
-
None,
|
|
382
|
-
None,
|
|
383
|
-
lambda annot: np.array(
|
|
384
|
-
[
|
|
385
|
-
0 if charge in ["?", "."] else int(charge)
|
|
386
|
-
for charge in annot
|
|
387
|
-
],
|
|
388
|
-
dtype=int,
|
|
389
|
-
),
|
|
390
|
-
),
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
mandatory_annotations = [
|
|
415
|
+
array.set_annotation(
|
|
394
416
|
"chain_id",
|
|
417
|
+
_get_or_fallback(
|
|
418
|
+
atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
|
|
419
|
+
).as_array("U4")
|
|
420
|
+
)
|
|
421
|
+
array.set_annotation(
|
|
395
422
|
"res_id",
|
|
423
|
+
_get_or_fallback(
|
|
424
|
+
atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
|
|
425
|
+
).as_array(int, -1)
|
|
426
|
+
)
|
|
427
|
+
array.set_annotation(
|
|
396
428
|
"ins_code",
|
|
429
|
+
atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
|
|
430
|
+
)
|
|
431
|
+
array.set_annotation(
|
|
397
432
|
"res_name",
|
|
433
|
+
_get_or_fallback(
|
|
434
|
+
atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
|
|
435
|
+
).as_array("U5")
|
|
436
|
+
)
|
|
437
|
+
array.set_annotation(
|
|
398
438
|
"hetero",
|
|
439
|
+
atom_site["group_PDB"].as_array(str) == "HETATM"
|
|
440
|
+
)
|
|
441
|
+
array.set_annotation(
|
|
399
442
|
"atom_name",
|
|
443
|
+
_get_or_fallback(
|
|
444
|
+
atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
|
|
445
|
+
).as_array("U6")
|
|
446
|
+
)
|
|
447
|
+
array.set_annotation(
|
|
400
448
|
"element",
|
|
401
|
-
|
|
449
|
+
atom_site["type_symbol"].as_array("U2")
|
|
450
|
+
)
|
|
402
451
|
|
|
403
|
-
|
|
404
|
-
for annotation_name in mandatory_annotations + extra_fields:
|
|
452
|
+
if "atom_id" in extra_fields:
|
|
405
453
|
array.set_annotation(
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
)
|
|
454
|
+
"atom_id",
|
|
455
|
+
atom_site["id"].as_array(int)
|
|
456
|
+
)
|
|
457
|
+
extra_fields.remove("atom_id")
|
|
458
|
+
if "b_factor" in extra_fields:
|
|
459
|
+
array.set_annotation(
|
|
460
|
+
"b_factor",
|
|
461
|
+
atom_site["B_iso_or_equiv"].as_array(float)
|
|
462
|
+
)
|
|
463
|
+
extra_fields.remove("b_factor")
|
|
464
|
+
if "occupancy" in extra_fields:
|
|
465
|
+
array.set_annotation(
|
|
466
|
+
"occupancy",
|
|
467
|
+
atom_site["occupancy"].as_array(float)
|
|
468
|
+
)
|
|
469
|
+
extra_fields.remove("occupancy")
|
|
470
|
+
if "charge" in extra_fields:
|
|
471
|
+
array.set_annotation(
|
|
472
|
+
"charge",
|
|
473
|
+
atom_site["pdbx_formal_charge"].as_array(int, 0)
|
|
474
|
+
)
|
|
475
|
+
extra_fields.remove("charge")
|
|
476
|
+
|
|
477
|
+
# Handle all remaining custom fields
|
|
478
|
+
for field in extra_fields:
|
|
479
|
+
array.set_annotation(
|
|
480
|
+
field,
|
|
481
|
+
atom_site[field].as_array(str)
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _parse_intra_residue_bonds(chem_comp_bond):
|
|
486
|
+
"""
|
|
487
|
+
Create a :func:`connect_via_residue_names()` compatible
|
|
488
|
+
`custom_bond_dict` from the ``chem_comp_bond`` category.
|
|
489
|
+
"""
|
|
490
|
+
custom_bond_dict = {}
|
|
491
|
+
for res_name, atom_1, atom_2, order, aromatic_flag in zip(
|
|
492
|
+
chem_comp_bond["comp_id"].as_array(str),
|
|
493
|
+
chem_comp_bond["atom_id_1"].as_array(str),
|
|
494
|
+
chem_comp_bond["atom_id_2"].as_array(str),
|
|
495
|
+
chem_comp_bond["value_order"].as_array(str),
|
|
496
|
+
chem_comp_bond["pdbx_aromatic_flag"].as_array(str)
|
|
497
|
+
):
|
|
498
|
+
if res_name not in custom_bond_dict:
|
|
499
|
+
custom_bond_dict[res_name] = {}
|
|
500
|
+
bond_type = COMP_BOND_ORDER_TO_TYPE.get(
|
|
501
|
+
(order.upper(), aromatic_flag), BondType.ANY
|
|
502
|
+
)
|
|
503
|
+
custom_bond_dict[res_name][atom_1.item(), atom_2.item()] = bond_type
|
|
504
|
+
return custom_bond_dict
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def _parse_inter_residue_bonds(atom_site, struct_conn):
|
|
508
|
+
"""
|
|
509
|
+
Create inter-residue bonds by parsing the ``struct_conn`` category.
|
|
510
|
+
The atom indices of each bond are found by matching the bond labels
|
|
511
|
+
to the ``atom_site`` category.
|
|
512
|
+
"""
|
|
513
|
+
# Identity symmetry operation
|
|
514
|
+
IDENTITY = "1_555"
|
|
515
|
+
# Columns in 'atom_site' that should be matched by 'struct_conn'
|
|
516
|
+
COLUMNS = [
|
|
517
|
+
"label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
|
|
518
|
+
"label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
|
|
519
|
+
"pdbx_PDB_ins_code"
|
|
520
|
+
]
|
|
521
|
+
|
|
522
|
+
covale_mask = np.isin(
|
|
523
|
+
struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
|
|
524
|
+
)
|
|
525
|
+
if "ptnr1_symmetry" in struct_conn:
|
|
526
|
+
covale_mask &= (
|
|
527
|
+
struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
|
|
528
|
+
)
|
|
529
|
+
if "ptnr2_symmetry" in struct_conn:
|
|
530
|
+
covale_mask &= (
|
|
531
|
+
struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
|
|
532
|
+
)
|
|
533
|
+
|
|
534
|
+
atom_indices = [None] * 2
|
|
535
|
+
for i in range(2):
|
|
536
|
+
reference_arrays = []
|
|
537
|
+
query_arrays = []
|
|
538
|
+
for col_name in COLUMNS:
|
|
539
|
+
struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
|
|
540
|
+
if (
|
|
541
|
+
col_name not in atom_site
|
|
542
|
+
or struct_conn_col_name not in struct_conn
|
|
543
|
+
):
|
|
544
|
+
continue
|
|
545
|
+
# Ensure both arrays have the same dtype to allow comparison
|
|
546
|
+
reference = atom_site[col_name].as_array()
|
|
547
|
+
dtype = reference.dtype
|
|
548
|
+
query = struct_conn[struct_conn_col_name].as_array(dtype)
|
|
549
|
+
if np.issubdtype(reference.dtype, str):
|
|
550
|
+
# The mask value is not necessarily consistent
|
|
551
|
+
# between query and reference
|
|
552
|
+
# -> make it consistent
|
|
553
|
+
reference[reference == "?"] = "."
|
|
554
|
+
query[query == "?"] = "."
|
|
555
|
+
reference_arrays.append(reference)
|
|
556
|
+
query_arrays.append(query[covale_mask])
|
|
557
|
+
# Match the combination of 'label_asym_id', 'label_comp_id', etc.
|
|
558
|
+
# in 'atom_site' and 'struct_conn'
|
|
559
|
+
atom_indices[i] = _find_matches(query_arrays, reference_arrays)
|
|
560
|
+
atoms_indices_1 = atom_indices[0]
|
|
561
|
+
atoms_indices_2 = atom_indices[1]
|
|
562
|
+
|
|
563
|
+
# Some bonds in 'struct_conn' may not be found in 'atom_site'
|
|
564
|
+
# This is okay,
|
|
565
|
+
# as 'atom_site' might already be reduced to a single model
|
|
566
|
+
mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
|
|
567
|
+
atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
|
|
568
|
+
atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
|
|
569
|
+
|
|
570
|
+
# Interpret missing values as ANY bonds
|
|
571
|
+
bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
|
|
572
|
+
# Consecutively apply the same masks as applied to the atom indices
|
|
573
|
+
# Logical combination does not work here,
|
|
574
|
+
# as the second mask was created based on already filtered data
|
|
575
|
+
bond_order = bond_order[covale_mask][mapping_exists_mask]
|
|
576
|
+
bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
|
|
577
|
+
|
|
578
|
+
return BondList(
|
|
579
|
+
atom_site.row_count,
|
|
580
|
+
np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def _find_matches(query_arrays, reference_arrays):
|
|
585
|
+
"""
|
|
586
|
+
For each index in the `query_arrays` find the indices in the
|
|
587
|
+
`reference_arrays` where all query values the reference counterpart.
|
|
588
|
+
If no match is found for a query, the corresponding index is -1.
|
|
589
|
+
"""
|
|
590
|
+
match_masks_for_all_columns = np.stack([
|
|
591
|
+
query[:, np.newaxis] == reference[np.newaxis, :]
|
|
592
|
+
for query, reference in zip(query_arrays, reference_arrays)
|
|
593
|
+
], axis=-1)
|
|
594
|
+
match_masks = np.all(match_masks_for_all_columns, axis=-1)
|
|
595
|
+
query_matches, reference_matches = np.where(match_masks)
|
|
596
|
+
|
|
597
|
+
# Duplicate matches indicate that an atom from the query cannot
|
|
598
|
+
# be uniquely matched to an atom in the reference
|
|
599
|
+
unique_query_matches, counts = np.unique(query_matches, return_counts=True)
|
|
600
|
+
if np.any(counts > 1):
|
|
601
|
+
ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
|
|
602
|
+
raise InvalidFileError(
|
|
603
|
+
f"The covalent bond in the 'struct_conn' category at index "
|
|
604
|
+
f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
|
|
605
|
+
f"the 'atom_site' category"
|
|
414
606
|
)
|
|
415
607
|
|
|
608
|
+
# -1 indicates that no match was found in the reference
|
|
609
|
+
match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
|
|
610
|
+
match_indices[query_matches] = reference_matches
|
|
611
|
+
return match_indices
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _get_struct_conn_col_name(col_name, partner):
|
|
615
|
+
"""
|
|
616
|
+
For a column name in ``atom_site`` get the corresponding column name
|
|
617
|
+
in ``struct_conn``.
|
|
618
|
+
"""
|
|
619
|
+
if col_name == "label_alt_id":
|
|
620
|
+
return f"pdbx_ptnr{partner}_label_alt_id"
|
|
621
|
+
elif col_name.startswith("pdbx_"):
|
|
622
|
+
# Move 'pdbx_' to front
|
|
623
|
+
return f"pdbx_ptnr{partner}_{col_name[5:]}"
|
|
624
|
+
else:
|
|
625
|
+
return f"ptnr{partner}_{col_name}"
|
|
626
|
+
|
|
416
627
|
|
|
417
|
-
def _filter_altloc(array,
|
|
418
|
-
altloc_ids =
|
|
419
|
-
occupancy =
|
|
628
|
+
def _filter_altloc(array, atom_site, altloc):
|
|
629
|
+
altloc_ids = atom_site.get("label_alt_id")
|
|
630
|
+
occupancy = atom_site.get("occupancy")
|
|
420
631
|
|
|
421
632
|
# Filter altloc IDs and return
|
|
422
633
|
if altloc_ids is None:
|
|
@@ -425,14 +636,14 @@ def _filter_altloc(array, model_dict, altloc):
|
|
|
425
636
|
return array[
|
|
426
637
|
...,
|
|
427
638
|
filter_highest_occupancy_altloc(
|
|
428
|
-
array, altloc_ids, occupancy.
|
|
639
|
+
array, altloc_ids.as_array(str), occupancy.as_array(float)
|
|
429
640
|
),
|
|
430
641
|
]
|
|
431
642
|
# 'first' is also fallback if file has no occupancy information
|
|
432
643
|
elif altloc == "first":
|
|
433
|
-
return array[..., filter_first_altloc(array, altloc_ids)]
|
|
644
|
+
return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
|
|
434
645
|
elif altloc == "all":
|
|
435
|
-
array.set_annotation("altloc_id", altloc_ids)
|
|
646
|
+
array.set_annotation("altloc_id", altloc_ids.as_array(str))
|
|
436
647
|
return array
|
|
437
648
|
else:
|
|
438
649
|
raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
|
|
@@ -443,122 +654,154 @@ def _get_model_starts(model_array):
|
|
|
443
654
|
Get the start index for each model in the arrays of the
|
|
444
655
|
``atom_site`` category.
|
|
445
656
|
"""
|
|
446
|
-
|
|
657
|
+
_, indices = np.unique(model_array, return_index=True)
|
|
447
658
|
indices.sort()
|
|
448
659
|
return indices
|
|
449
660
|
|
|
450
661
|
|
|
451
|
-
def
|
|
662
|
+
def _filter_model(atom_site, model_starts, model):
    """
    Reduce the ``atom_site`` category to the values for the given
    model.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        The category to be filtered.
    model_starts : ndarray, dtype=int
        The row index where each model starts.
    model : int
        The model number, counted from 1.

    Returns
    -------
    filtered : CIFCategory or BinaryCIFCategory
        The result of ``_filter()`` for the row range of the model.
    """
    # Append exclusive stop
    model_starts = np.append(model_starts, [atom_site.row_count])
    # Indexing starts at 0, but model number starts at 1
    model_index = model - 1
    index = slice(model_starts[model_index], model_starts[model_index + 1])
    return _filter(atom_site, index)
|
|
468
679
|
|
|
469
680
|
|
|
470
|
-
def _get_box(
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
else:
|
|
474
|
-
cell_dict = pdbx_file.get((data_block, "cell"))
|
|
475
|
-
if cell_dict is None:
|
|
681
|
+
def _get_box(block):
|
|
682
|
+
cell = block.get("cell")
|
|
683
|
+
if cell is None:
|
|
476
684
|
return None
|
|
477
685
|
try:
|
|
478
686
|
len_a, len_b, len_c = [
|
|
479
|
-
float(
|
|
687
|
+
float(cell[length].as_item())
|
|
480
688
|
for length in ["length_a", "length_b", "length_c"]
|
|
481
689
|
]
|
|
690
|
+
alpha, beta, gamma = [
|
|
691
|
+
np.deg2rad(float(cell[angle].as_item()))
|
|
692
|
+
for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
|
|
693
|
+
]
|
|
482
694
|
except ValueError:
|
|
483
695
|
# 'cell_dict' has no proper unit cell values, e.g. '?'
|
|
484
696
|
return None
|
|
485
|
-
alpha, beta, gamma = [
|
|
486
|
-
np.deg2rad(float(cell_dict[angle]))
|
|
487
|
-
for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
|
|
488
|
-
]
|
|
489
697
|
return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
|
|
490
698
|
|
|
491
699
|
|
|
492
|
-
def set_structure(pdbx_file, array, data_block=None):
|
|
700
|
+
def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
|
|
493
701
|
"""
|
|
494
702
|
Set the ``atom_site`` category with atom information from an
|
|
495
703
|
:class:`AtomArray` or :class:`AtomArrayStack`.
|
|
496
704
|
|
|
497
705
|
This will save the coordinates, the mandatory annotation categories
|
|
498
706
|
and the optional annotation categories
|
|
499
|
-
``
|
|
707
|
+
``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
|
|
500
708
|
If the atom array (stack) contains the annotation ``'atom_id'``,
|
|
501
709
|
these values will be used for atom numbering instead of continuous
|
|
502
710
|
numbering.
|
|
711
|
+
Furthermore, inter-residue bonds will be written into the
|
|
712
|
+
``struct_conn`` category.
|
|
503
713
|
|
|
504
714
|
Parameters
|
|
505
715
|
----------
|
|
506
|
-
pdbx_file :
|
|
716
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
507
717
|
The file object.
|
|
508
718
|
array : AtomArray or AtomArrayStack
|
|
509
719
|
The structure to be written. If a stack is given, each array in
|
|
510
720
|
the stack will be in a separate model.
|
|
511
721
|
data_block : str, optional
|
|
512
|
-
The name of the data block.
|
|
513
|
-
(and most times only) data block of the
|
|
722
|
+
The name of the data block.
|
|
723
|
+
Default is the first (and most times only) data block of the
|
|
724
|
+
file.
|
|
725
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
726
|
+
this parameter is ignored.
|
|
727
|
+
If the file is empty, a new data block will be created.
|
|
728
|
+
include_bonds : bool, optional
|
|
729
|
+
If set to true and `array` has associated ``bonds`` , the
|
|
730
|
+
intra-residue bonds will be written into the ``chem_comp_bond``
|
|
731
|
+
category.
|
|
732
|
+
Inter-residue bonds will be written into the ``struct_conn``
|
|
733
|
+
independent of this parameter.
|
|
734
|
+
|
|
735
|
+
Notes
|
|
736
|
+
-----
|
|
737
|
+
In some cases, the written inter-residue bonds cannot be read again
|
|
738
|
+
due to ambiguity to which atoms the bond refers.
|
|
739
|
+
This is the case, when two equal residues in the same chain have
|
|
740
|
+
the same (or a masked) `res_id`.
|
|
514
741
|
|
|
515
742
|
Examples
|
|
516
743
|
--------
|
|
517
744
|
|
|
518
745
|
>>> import os.path
|
|
519
|
-
>>> file =
|
|
520
|
-
>>> set_structure(file, atom_array
|
|
746
|
+
>>> file = CIFFile()
|
|
747
|
+
>>> set_structure(file, atom_array)
|
|
521
748
|
>>> file.write(os.path.join(path_to_directory, "structure.cif"))
|
|
522
749
|
|
|
523
750
|
"""
|
|
751
|
+
_check_non_empty(array)
|
|
752
|
+
|
|
753
|
+
block = _get_or_create_block(pdbx_file, data_block)
|
|
754
|
+
Category = block.subcomponent_class()
|
|
755
|
+
Column = Category.subcomponent_class()
|
|
756
|
+
|
|
524
757
|
# Fill PDBx columns from information
|
|
525
758
|
# in structures' attribute arrays as good as possible
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
# if an optional category exists
|
|
530
|
-
annot_categories = array.get_annotation_categories()
|
|
531
|
-
atom_site_dict["group_PDB"] = np.array(
|
|
532
|
-
["ATOM" if e == False else "HETATM" for e in array.hetero]
|
|
759
|
+
atom_site = Category()
|
|
760
|
+
atom_site["group_PDB"] = np.where(
|
|
761
|
+
array.hetero, "HETATM", "ATOM"
|
|
533
762
|
)
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
763
|
+
atom_site["type_symbol"] = np.copy(array.element)
|
|
764
|
+
atom_site["label_atom_id"] = np.copy(array.atom_name)
|
|
765
|
+
atom_site["label_alt_id"] = Column(
|
|
766
|
+
# AtomArrays do not store altloc atoms
|
|
767
|
+
np.full(array.array_length(), "."),
|
|
768
|
+
np.full(array.array_length(), MaskValue.INAPPLICABLE),
|
|
769
|
+
)
|
|
770
|
+
atom_site["label_comp_id"] = np.copy(array.res_name)
|
|
771
|
+
atom_site["label_asym_id"] = np.copy(array.chain_id)
|
|
772
|
+
atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
|
|
773
|
+
atom_site["label_seq_id"] = np.copy(array.res_id)
|
|
774
|
+
atom_site["pdbx_PDB_ins_code"] = Column(
|
|
775
|
+
np.copy(array.ins_code),
|
|
776
|
+
np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
|
|
777
|
+
)
|
|
778
|
+
atom_site["auth_seq_id"] = atom_site["label_seq_id"]
|
|
779
|
+
atom_site["auth_comp_id"] = atom_site["label_comp_id"]
|
|
780
|
+
atom_site["auth_asym_id"] = atom_site["label_asym_id"]
|
|
781
|
+
atom_site["auth_atom_id"] = atom_site["label_atom_id"]
|
|
546
782
|
|
|
783
|
+
annot_categories = array.get_annotation_categories()
|
|
547
784
|
if "atom_id" in annot_categories:
|
|
548
|
-
|
|
785
|
+
atom_site["id"] = np.copy(array.atom_id)
|
|
549
786
|
if "b_factor" in annot_categories:
|
|
550
|
-
|
|
551
|
-
[f"{b:.2f}" for b in array.b_factor]
|
|
552
|
-
)
|
|
787
|
+
atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
|
|
553
788
|
if "occupancy" in annot_categories:
|
|
554
|
-
|
|
555
|
-
[f"{occ:.2f}" for occ in array.occupancy]
|
|
556
|
-
)
|
|
789
|
+
atom_site["occupancy"] = np.copy(array.occupancy)
|
|
557
790
|
if "charge" in annot_categories:
|
|
558
|
-
|
|
559
|
-
[f"{c:+d}" if c != 0 else "?" for c in array.charge]
|
|
791
|
+
atom_site["pdbx_formal_charge"] = Column(
|
|
792
|
+
np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
|
|
793
|
+
np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
|
|
560
794
|
)
|
|
561
795
|
|
|
796
|
+
if array.bonds is not None:
|
|
797
|
+
struct_conn = _set_inter_residue_bonds(array, atom_site)
|
|
798
|
+
if struct_conn is not None:
|
|
799
|
+
block["struct_conn"] = struct_conn
|
|
800
|
+
if include_bonds:
|
|
801
|
+
chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
|
|
802
|
+
if chem_comp_bond is not None:
|
|
803
|
+
block["chem_comp_bond"] = chem_comp_bond
|
|
804
|
+
|
|
562
805
|
# In case of a single model handle each coordinate
|
|
563
806
|
# simply like a flattened array
|
|
564
807
|
if type(array) == AtomArray or (
|
|
@@ -566,42 +809,32 @@ def set_structure(pdbx_file, array, data_block=None):
|
|
|
566
809
|
):
|
|
567
810
|
# 'ravel' flattens coord without copy
|
|
568
811
|
# in case of stack with stack_depth = 1
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
)
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
)
|
|
575
|
-
atom_site_dict["Cartn_z"] = np.array(
|
|
576
|
-
[f"{c:.3f}" for c in np.ravel(array.coord[..., 2])]
|
|
577
|
-
)
|
|
578
|
-
atom_site_dict["pdbx_PDB_model_num"] = np.full(
|
|
579
|
-
array.array_length(), "1"
|
|
812
|
+
atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
|
|
813
|
+
atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
|
|
814
|
+
atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
|
|
815
|
+
atom_site["pdbx_PDB_model_num"] = np.ones(
|
|
816
|
+
array.array_length(), dtype=np.int32
|
|
580
817
|
)
|
|
581
818
|
# In case of multiple models repeat annotations
|
|
582
819
|
# and use model specific coordinates
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
atom_site_dict[key] = np.tile(value, reps=array.stack_depth())
|
|
820
|
+
else:
|
|
821
|
+
atom_site = _repeat(atom_site, array.stack_depth())
|
|
586
822
|
coord = np.reshape(
|
|
587
823
|
array.coord, (array.stack_depth() * array.array_length(), 3)
|
|
588
824
|
)
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
np.arange(1, array.stack_depth() + 1
|
|
825
|
+
atom_site["Cartn_x"] = np.copy(coord[:, 0])
|
|
826
|
+
atom_site["Cartn_y"] = np.copy(coord[:, 1])
|
|
827
|
+
atom_site["Cartn_z"] = np.copy(coord[:, 2])
|
|
828
|
+
atom_site["pdbx_PDB_model_num"] = np.repeat(
|
|
829
|
+
np.arange(1, array.stack_depth() + 1, dtype=np.int32),
|
|
594
830
|
repeats=array.array_length(),
|
|
595
831
|
)
|
|
596
|
-
atom_site_dict["pdbx_PDB_model_num"] = models
|
|
597
|
-
else:
|
|
598
|
-
raise ValueError("Structure must be AtomArray or AtomArrayStack")
|
|
599
832
|
if not "atom_id" in annot_categories:
|
|
600
833
|
# Count from 1
|
|
601
|
-
|
|
602
|
-
1, len(
|
|
603
|
-
)
|
|
604
|
-
|
|
834
|
+
atom_site["id"] = np.arange(
|
|
835
|
+
1, len(atom_site["group_PDB"]) + 1
|
|
836
|
+
)
|
|
837
|
+
block["atom_site"] = atom_site
|
|
605
838
|
|
|
606
839
|
# Write box into file
|
|
607
840
|
if array.box is not None:
|
|
@@ -612,14 +845,52 @@ def set_structure(pdbx_file, array, data_block=None):
|
|
|
612
845
|
else:
|
|
613
846
|
box = array.box
|
|
614
847
|
len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
848
|
+
cell = Category()
|
|
849
|
+
cell["length_a"] = len_a
|
|
850
|
+
cell["length_b"] = len_b
|
|
851
|
+
cell["length_c"] = len_c
|
|
852
|
+
cell["angle_alpha"] = np.rad2deg(alpha)
|
|
853
|
+
cell["angle_beta"] = np.rad2deg(beta)
|
|
854
|
+
cell["angle_gamma"] = np.rad2deg(gamma)
|
|
855
|
+
block["cell"] = cell
|
|
856
|
+
|
|
857
|
+
|
|
858
|
+
def _check_non_empty(array):
    """
    Ensure that the given structure is of a supported type and
    contains at least one atom (and, for a stack, at least one model).

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure to check.

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def _get_or_create_block(pdbx_component, block_name):
    """
    Get the block to write into from a file or block object.

    Parameters
    ----------
    pdbx_component : PDBxFile or CIFFile or BinaryCIFFile or CIFBlock or BinaryCIFBlock
        The file or block object.
    block_name : str or None
        The name of the block within the file.
        If ``None``, the first block of the file is used, or the name
        ``"structure"`` if the file contains no block.
        Ignored if `pdbx_component` is not a file.

    Returns
    -------
    block : CIFBlock or BinaryCIFBlock
        The requested block.
        If the file has no block with the given name, a new empty
        block is added to the file under that name.
    """
    if isinstance(pdbx_component, PDBxFile):
        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
        pdbx_component = pdbx_component.cif_file

    Block = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        block_name = (
            next(iter(pdbx_component.keys()))
            if len(pdbx_component) > 0
            # File is empty -> invent a new block name
            else "structure"
        )

    if block_name not in pdbx_component:
        pdbx_component[block_name] = Block()
    return pdbx_component[block_name]
|
|
623
894
|
|
|
624
895
|
|
|
625
896
|
def _determine_entity_id(chain_id):
|
|
@@ -635,10 +906,155 @@ def _determine_entity_id(chain_id):
|
|
|
635
906
|
id_translation[chain_id[i]] = id
|
|
636
907
|
entity_id[i] = id_translation[chain_id[i]]
|
|
637
908
|
id += 1
|
|
638
|
-
return entity_id
|
|
909
|
+
return entity_id
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def _repeat(category, repetitions):
    """
    Create a category in which the rows of the given category are
    repeated (tiled) the given number of times.

    Parameters
    ----------
    category : CIFCategory or BinaryCIFCategory
        The category to be repeated.
    repetitions : int
        The number of repetitions passed to :func:`numpy.tile()`.

    Returns
    -------
    repeated : CIFCategory or BinaryCIFCategory
        A new category of the same type as the input.
    """
    Category = type(category)
    Column = Category.subcomponent_class()
    Data = Column.subcomponent_class()

    repeated_columns = {}
    for name, column in category.items():
        if isinstance(column, BinaryCIFColumn):
            # NOTE: the encoding objects are taken from the input
            # column, i.e. they are reused and not copied
            encoding = column.data.encoding
            # Optimization: Tiling does not introduce new unique values
            # -> determine the unique strings from the shorter
            # original array
            if isinstance(encoding[0], StringArrayEncoding):
                encoding[0].strings = np.unique(column.data.array)
            data = Data(np.tile(column.data.array, repetitions), encoding)
        else:
            data = Data(np.tile(column.data.array, repetitions))

        if column.mask is None:
            mask = None
        else:
            mask = Data(np.tile(column.mask.array, repetitions))

        repeated_columns[name] = Column(data, mask)
    return Category(repeated_columns)
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def _set_intra_residue_bonds(array, atom_site):
    """
    Build the ``chem_comp_bond`` category from the intra-residue bonds
    of the given structure.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure whose bonds are written.
    atom_site : CIFCategory or BinaryCIFCategory
        Only used to infer the :class:`Category` type of the result.

    Returns
    -------
    chem_comp_bond : CIFCategory or BinaryCIFCategory or None
        The created category.
        ``None`` if there is no intra-residue bond.

    Raises
    ------
    BadStructureError
        If any residue name or atom name is an empty string.
    """
    if (array.res_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty residue name, "
            "but it is required to write intra-residue bonds"
        )
    if (array.atom_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty atom name, "
            "but it is required to write intra-residue bonds"
        )

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "intra")
    n_bonds = len(bond_array)
    if n_bonds == 0:
        return None

    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    bond_types = bond_array[:, 2]
    any_mask = bond_types == BondType.ANY

    value_order = np.zeros(n_bonds, dtype="U4")
    aromatic_flag = np.zeros(n_bonds, dtype="U1")
    # ANY bonds will be masked anyway -> only the other bonds get a value
    for i in np.flatnonzero(~any_mask):
        value_order[i], aromatic_flag[i] = COMP_BOND_TYPE_TO_ORDER[
            bond_types[i]
        ]

    chem_comp_bond = Category()
    # Both atoms of an intra-residue bond belong to the same residue
    # -> the residue name can be taken from the first atom
    chem_comp_bond["comp_id"] = array.res_name[first_atoms]
    chem_comp_bond["atom_id_1"] = array.atom_name[first_atoms]
    chem_comp_bond["atom_id_2"] = array.atom_name[second_atoms]
    chem_comp_bond["value_order"] = Column(
        value_order,
        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
    )
    chem_comp_bond["pdbx_aromatic_flag"] = Column(
        aromatic_flag,
        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
    )
    # BondList does not contain stereo information
    # -> all values are missing
    chem_comp_bond["pdbx_stereo_config"] = Column(
        np.zeros(n_bonds, dtype="U1"),
        np.full(n_bonds, MaskValue.MISSING)
    )
    chem_comp_bond["pdbx_ordinal"] = np.arange(
        1, n_bonds + 1, dtype=np.int32
    )
    return chem_comp_bond
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
def _set_inter_residue_bonds(array, atom_site):
    """
    Build the ``struct_conn`` category from the inter-residue bonds
    of the given structure.

    The atoms involved in each bond are identified by their values in
    selected columns of the ``atom_site`` category.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure whose bonds are written.
    atom_site : CIFCategory or BinaryCIFCategory
        The category providing the identifying annotations.
        It also determines the :class:`Category` type of the result.

    Returns
    -------
    struct_conn : CIFCategory or BinaryCIFCategory or None
        The created category.
        ``None`` if there is no inter-residue bond.
    """
    COLUMNS = [
        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
        "pdbx_PDB_ins_code"
    ]

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "inter")
    n_bonds = len(bond_array)
    if n_bonds == 0:
        return None

    bond_types = bond_array[:, 2]
    struct_conn = Category()
    struct_conn["id"] = np.arange(1, n_bonds + 1)
    struct_conn["conn_type_id"] = np.full(n_bonds, "covale")
    # The order of ANY bonds is marked as missing
    struct_conn["pdbx_value_order"] = Column(
        np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_types]),
        np.where(
            bond_types == BondType.ANY,
            MaskValue.MISSING, MaskValue.PRESENT,
        )
    )
    # Copy each identifying annotation for both bond partners
    for col_name in COLUMNS:
        annot = atom_site[col_name].as_array()
        for partner in (1, 2):
            target_name = _get_struct_conn_col_name(col_name, partner)
            struct_conn[target_name] = annot[bond_array[:, partner - 1]]
    return struct_conn
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def _filter_bonds(array, connection):
    """
    Get the bond array of a structure, reduced to either the
    intra-residue or the inter-residue bonds.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure with associated bonds.
    connection : {'intra', 'inter'}
        Which kind of bonds to keep.

    Returns
    -------
    bond_array : ndarray
        The rows of ``array.bonds.as_array()`` for the selected bonds.

    Raises
    ------
    ValueError
        If `connection` is neither ``'intra'`` nor ``'inter'``.
    """
    bond_array = array.bonds.as_array()
    # To save computation time 'get_residue_starts_for()' is called
    # only once for the first and second atom of all bonds together
    atom_indices = bond_array[:, :2].flatten()
    residue_starts = get_residue_starts_for(array, atom_indices)
    residue_starts_1, residue_starts_2 = residue_starts.reshape(-1, 2).T

    if connection == "intra":
        return bond_array[residue_starts_1 == residue_starts_2]
    if connection == "inter":
        return bond_array[residue_starts_1 != residue_starts_2]
    raise ValueError("Invalid 'connection' option")
|
|
639
1054
|
|
|
640
1055
|
|
|
641
|
-
def get_component(pdbx_file, data_block=None, use_ideal_coord=True
|
|
1056
|
+
def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
|
|
1057
|
+
res_name=None):
|
|
642
1058
|
"""
|
|
643
1059
|
Create an :class:`AtomArray` for a chemical component from the
|
|
644
1060
|
``chem_comp_atom`` and, if available, the ``chem_comp_bond``
|
|
@@ -646,26 +1062,37 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
|
|
|
646
1062
|
|
|
647
1063
|
Parameters
|
|
648
1064
|
----------
|
|
1065
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
1066
|
+
The file object.
|
|
649
1067
|
data_block : str, optional
|
|
650
|
-
The name of the data block.
|
|
651
|
-
(and most times only) data block of the
|
|
1068
|
+
The name of the data block.
|
|
1069
|
+
Default is the first (and most times only) data block of the
|
|
1070
|
+
file.
|
|
1071
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1072
|
+
this parameter is ignored.
|
|
652
1073
|
use_ideal_coord : bool, optional
|
|
653
1074
|
If true, the *ideal* coordinates are read from the file
|
|
654
1075
|
(``pdbx_model_Cartn_<dim>_ideal`` fields), typically
|
|
655
1076
|
originating from computations.
|
|
656
1077
|
If set to false, alternative coordinates are read
|
|
657
1078
|
(``model_Cartn_<dim>_`` fields).
|
|
658
|
-
|
|
1079
|
+
res_name : str, optional
|
|
1080
|
+
In rare cases the categories may contain rows for multiple
|
|
1081
|
+
components.
|
|
1082
|
+
In this case, the component with the given residue name is
|
|
1083
|
+
read.
|
|
1084
|
+
By default, all rows would be read in this case.
|
|
1085
|
+
|
|
659
1086
|
Returns
|
|
660
1087
|
-------
|
|
661
1088
|
array : AtomArray
|
|
662
1089
|
The parsed chemical component.
|
|
663
|
-
|
|
1090
|
+
|
|
664
1091
|
Examples
|
|
665
1092
|
--------
|
|
666
1093
|
|
|
667
1094
|
>>> import os.path
|
|
668
|
-
>>> file =
|
|
1095
|
+
>>> file = CIFFile.read(
|
|
669
1096
|
... os.path.join(path_to_structures, "molecules", "TYR.cif")
|
|
670
1097
|
... )
|
|
671
1098
|
>>> comp = get_component(file)
|
|
@@ -695,26 +1122,31 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
|
|
|
695
1122
|
HET 0 TYR HH H -0.123 -0.399 -5.059
|
|
696
1123
|
HET 0 TYR HXT H -1.333 -0.030 4.784
|
|
697
1124
|
"""
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
1125
|
+
block = _get_block(pdbx_file, data_block)
|
|
1126
|
+
|
|
1127
|
+
try:
|
|
1128
|
+
atom_category = block["chem_comp_atom"]
|
|
1129
|
+
except KeyError:
|
|
702
1130
|
raise InvalidFileError("Missing 'chem_comp_atom' category in file")
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
1131
|
+
if res_name is not None:
|
|
1132
|
+
atom_category = _filter(
|
|
1133
|
+
atom_category, atom_category["comp_id"].as_array() == res_name
|
|
1134
|
+
)
|
|
1135
|
+
if atom_category.row_count == 0:
|
|
1136
|
+
raise KeyError(
|
|
1137
|
+
f"No rows with residue name '{res_name}' found in "
|
|
1138
|
+
f"'chem_comp_atom' category"
|
|
1139
|
+
)
|
|
706
1140
|
|
|
707
|
-
array = AtomArray(
|
|
1141
|
+
array = AtomArray(atom_category.row_count)
|
|
708
1142
|
|
|
709
1143
|
array.hetero[:] = True
|
|
710
|
-
array.res_name =
|
|
711
|
-
array.atom_name =
|
|
712
|
-
array.element =
|
|
1144
|
+
array.res_name = atom_category["comp_id"].as_array("U5")
|
|
1145
|
+
array.atom_name = atom_category["atom_id"].as_array("U6")
|
|
1146
|
+
array.element = atom_category["type_symbol"].as_array("U2")
|
|
713
1147
|
array.add_annotation("charge", int)
|
|
714
|
-
array.charge =
|
|
715
|
-
|
|
716
|
-
)
|
|
717
|
-
|
|
1148
|
+
array.charge = atom_category["charge"].as_array(int, 0)
|
|
1149
|
+
|
|
718
1150
|
coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
|
|
719
1151
|
alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
|
|
720
1152
|
if not use_ideal_coord:
|
|
@@ -722,7 +1154,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
|
|
|
722
1154
|
coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
|
|
723
1155
|
try:
|
|
724
1156
|
for i, field in enumerate(coord_fields):
|
|
725
|
-
array.coord[:,i] =
|
|
1157
|
+
array.coord[:,i] = atom_category[field].as_array(np.float32)
|
|
726
1158
|
except KeyError as err:
|
|
727
1159
|
key = err.args[0]
|
|
728
1160
|
warnings.warn(
|
|
@@ -731,9 +1163,15 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
|
|
|
731
1163
|
UserWarning
|
|
732
1164
|
)
|
|
733
1165
|
for i, field in enumerate(alt_coord_fields):
|
|
734
|
-
array.coord[:,i] =
|
|
735
|
-
|
|
736
|
-
|
|
1166
|
+
array.coord[:,i] = atom_category[field].as_array(np.float32)
|
|
1167
|
+
|
|
1168
|
+
try:
|
|
1169
|
+
bond_category = block["chem_comp_bond"]
|
|
1170
|
+
if res_name is not None:
|
|
1171
|
+
bond_category = _filter(
|
|
1172
|
+
bond_category, bond_category["comp_id"].as_array() == res_name
|
|
1173
|
+
)
|
|
1174
|
+
except KeyError:
|
|
737
1175
|
warnings.warn(
|
|
738
1176
|
f"Category 'chem_comp_bond' not found. "
|
|
739
1177
|
f"No bonds will be parsed",
|
|
@@ -742,12 +1180,14 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
|
|
|
742
1180
|
else:
|
|
743
1181
|
bonds = BondList(array.array_length())
|
|
744
1182
|
for atom1, atom2, order, aromatic_flag in zip(
|
|
745
|
-
|
|
746
|
-
|
|
1183
|
+
bond_category["atom_id_1"].as_array(str),
|
|
1184
|
+
bond_category["atom_id_2"].as_array(str),
|
|
1185
|
+
bond_category["value_order"].as_array(str),
|
|
1186
|
+
bond_category["pdbx_aromatic_flag"].as_array(str)
|
|
747
1187
|
):
|
|
748
1188
|
atom_i = np.where(array.atom_name == atom1)[0][0]
|
|
749
1189
|
atom_j = np.where(array.atom_name == atom2)[0][0]
|
|
750
|
-
bond_type =
|
|
1190
|
+
bond_type = COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag]
|
|
751
1191
|
bonds.add_bond(atom_i, atom_j, bond_type)
|
|
752
1192
|
array.bonds = bonds
|
|
753
1193
|
|
|
@@ -766,15 +1206,24 @@ def set_component(pdbx_file, array, data_block=None):
|
|
|
766
1206
|
|
|
767
1207
|
Parameters
|
|
768
1208
|
----------
|
|
769
|
-
pdbx_file :
|
|
1209
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
770
1210
|
The file object.
|
|
771
1211
|
array : AtomArray
|
|
772
1212
|
The chemical component to be written.
|
|
773
1213
|
Must contain only a single residue.
|
|
774
1214
|
data_block : str, optional
|
|
775
|
-
The name of the data block.
|
|
776
|
-
(and most times only) data block of the
|
|
1215
|
+
The name of the data block.
|
|
1216
|
+
Default is the first (and most times only) data block of the
|
|
1217
|
+
file.
|
|
1218
|
+
If the file is empty, a new data block will be created.
|
|
1219
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1220
|
+
this parameter is ignored.
|
|
777
1221
|
"""
|
|
1222
|
+
_check_non_empty(array)
|
|
1223
|
+
|
|
1224
|
+
block = _get_or_create_block(pdbx_file, data_block)
|
|
1225
|
+
Category = block.subcomponent_class()
|
|
1226
|
+
|
|
778
1227
|
if get_residue_count(array) > 1:
|
|
779
1228
|
raise BadStructureError(
|
|
780
1229
|
"The input atom array must comprise only one residue"
|
|
@@ -787,45 +1236,44 @@ def set_component(pdbx_file, array, data_block=None):
|
|
|
787
1236
|
else:
|
|
788
1237
|
charge = np.full(array.array_length(), "?", dtype="U2")
|
|
789
1238
|
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
1239
|
+
atom_cat = Category()
|
|
1240
|
+
atom_cat["comp_id"] = np.full(array.array_length(), res_name)
|
|
1241
|
+
atom_cat["atom_id"] = np.copy(array.atom_name)
|
|
1242
|
+
atom_cat["alt_atom_id"] = atom_cat["atom_id"]
|
|
1243
|
+
atom_cat["type_symbol"] = np.copy(array.element)
|
|
1244
|
+
atom_cat["charge"] = charge
|
|
1245
|
+
atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
|
|
1246
|
+
atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
|
|
1247
|
+
atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
|
|
1248
|
+
atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
|
|
1249
|
+
atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
|
|
1250
|
+
atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
|
|
1251
|
+
atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
|
|
1252
|
+
atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
|
|
1253
|
+
atom_cat["pdbx_ordinal"] = np.arange(
|
|
805
1254
|
1, array.array_length() + 1
|
|
806
1255
|
).astype(str)
|
|
807
|
-
|
|
1256
|
+
block["chem_comp_atom"] = atom_cat
|
|
808
1257
|
|
|
809
|
-
if array.bonds is not None:
|
|
1258
|
+
if array.bonds is not None and array.bonds.get_bond_count() > 0:
|
|
810
1259
|
bond_array = array.bonds.as_array()
|
|
811
1260
|
order_flags = []
|
|
812
1261
|
aromatic_flags = []
|
|
813
1262
|
for bond_type in bond_array[:,2]:
|
|
814
|
-
order_flag, aromatic_flag =
|
|
1263
|
+
order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
|
|
815
1264
|
order_flags.append(order_flag)
|
|
816
1265
|
aromatic_flags.append(aromatic_flag)
|
|
817
1266
|
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
1267
|
+
bond_cat = Category()
|
|
1268
|
+
bond_cat["comp_id"] = np.full(len(bond_array), res_name)
|
|
1269
|
+
bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
|
|
1270
|
+
bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
|
|
1271
|
+
bond_cat["value_order"] = np.array(order_flags)
|
|
1272
|
+
bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
|
|
1273
|
+
bond_cat["pdbx_ordinal"] = np.arange(
|
|
825
1274
|
1, len(bond_array) + 1
|
|
826
1275
|
).astype(str)
|
|
827
|
-
|
|
828
|
-
|
|
1276
|
+
block["chem_comp_bond"] = bond_cat
|
|
829
1277
|
|
|
830
1278
|
def list_assemblies(pdbx_file, data_block=None):
|
|
831
1279
|
"""
|
|
@@ -838,23 +1286,25 @@ def list_assemblies(pdbx_file, data_block=None):
|
|
|
838
1286
|
|
|
839
1287
|
Parameters
|
|
840
1288
|
----------
|
|
841
|
-
pdbx_file :
|
|
1289
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
842
1290
|
The file object.
|
|
843
1291
|
data_block : str, optional
|
|
844
1292
|
The name of the data block.
|
|
845
|
-
|
|
1293
|
+
Default is the first (and most times only) data block of the
|
|
846
1294
|
file.
|
|
1295
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1296
|
+
this parameter is ignored.
|
|
847
1297
|
|
|
848
1298
|
Returns
|
|
849
1299
|
-------
|
|
850
1300
|
assemblies : dict of str -> str
|
|
851
1301
|
A dictionary that maps an assembly ID to a description of the
|
|
852
1302
|
corresponding assembly.
|
|
853
|
-
|
|
1303
|
+
|
|
854
1304
|
Examples
|
|
855
1305
|
--------
|
|
856
1306
|
>>> import os.path
|
|
857
|
-
>>> file =
|
|
1307
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
|
|
858
1308
|
>>> assembly_ids = list_assemblies(file)
|
|
859
1309
|
>>> for key, val in assembly_ids.items():
|
|
860
1310
|
... print(f"'{key}' : '{val}'")
|
|
@@ -865,21 +1315,24 @@ def list_assemblies(pdbx_file, data_block=None):
|
|
|
865
1315
|
'5' : 'icosahedral asymmetric unit, std point frame'
|
|
866
1316
|
'6' : 'crystal asymmetric unit, crystal frame'
|
|
867
1317
|
"""
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
1318
|
+
block = _get_block(pdbx_file, data_block)
|
|
1319
|
+
|
|
1320
|
+
try:
|
|
1321
|
+
assembly_category = block["pdbx_struct_assembly"]
|
|
1322
|
+
except KeyError:
|
|
872
1323
|
raise InvalidFileError("File has no 'pdbx_struct_assembly' category")
|
|
873
1324
|
return {
|
|
874
1325
|
id: details
|
|
875
1326
|
for id, details in zip(
|
|
876
|
-
assembly_category["id"],
|
|
1327
|
+
assembly_category["id"].as_array(str),
|
|
1328
|
+
assembly_category["details"].as_array(str)
|
|
877
1329
|
)
|
|
878
1330
|
}
|
|
879
1331
|
|
|
880
1332
|
|
|
881
1333
|
def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
882
|
-
altloc="first", extra_fields=None, use_author_fields=True
|
|
1334
|
+
altloc="first", extra_fields=None, use_author_fields=True,
|
|
1335
|
+
include_bonds=False):
|
|
883
1336
|
"""
|
|
884
1337
|
Build the given biological assembly.
|
|
885
1338
|
|
|
@@ -890,7 +1343,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
890
1343
|
|
|
891
1344
|
Parameters
|
|
892
1345
|
----------
|
|
893
|
-
pdbx_file :
|
|
1346
|
+
pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
|
|
894
1347
|
The file object.
|
|
895
1348
|
assembly_id : str
|
|
896
1349
|
The assembly to build.
|
|
@@ -907,8 +1360,10 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
907
1360
|
contains only one model.
|
|
908
1361
|
data_block : str, optional
|
|
909
1362
|
The name of the data block.
|
|
910
|
-
|
|
1363
|
+
Default is the first (and most times only) data block of the
|
|
911
1364
|
file.
|
|
1365
|
+
If the data block object is passed directly to `pdbx_file`,
|
|
1366
|
+
this parameter is ignored.
|
|
912
1367
|
altloc : {'first', 'occupancy', 'all'}
|
|
913
1368
|
This parameter defines how *altloc* IDs are handled:
|
|
914
1369
|
- ``'first'`` - Use atoms that have the first *altloc* ID
|
|
@@ -940,36 +1395,46 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
940
1395
|
If `use_author_fields` is true, the annotation arrays will be
|
|
941
1396
|
read from the ``auth_xxx`` fields (if applicable),
|
|
942
1397
|
otherwise from the the ``label_xxx`` fields.
|
|
1398
|
+
include_bonds : bool, optional
|
|
1399
|
+
If set to true, a :class:`BondList` will be created for the
|
|
1400
|
+
resulting :class:`AtomArray` containing the bond information
|
|
1401
|
+
from the file.
|
|
1402
|
+
Bonds, whose order could not be determined from the
|
|
1403
|
+
*Chemical Component Dictionary*
|
|
1404
|
+
(e.g. especially inter-residue bonds),
|
|
1405
|
+
have :attr:`BondType.ANY`, since the PDB format itself does
|
|
1406
|
+
not support bond orders.
|
|
943
1407
|
|
|
944
1408
|
Returns
|
|
945
1409
|
-------
|
|
946
1410
|
assembly : AtomArray or AtomArrayStack
|
|
947
1411
|
The assembly. The return type depends on the `model` parameter.
|
|
948
|
-
|
|
1412
|
+
|
|
949
1413
|
Examples
|
|
950
1414
|
--------
|
|
951
1415
|
|
|
952
1416
|
>>> import os.path
|
|
953
|
-
>>> file =
|
|
1417
|
+
>>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
|
|
954
1418
|
>>> assembly = get_assembly(file, model=1)
|
|
955
1419
|
"""
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1420
|
+
block = _get_block(pdbx_file, data_block)
|
|
1421
|
+
|
|
1422
|
+
try:
|
|
1423
|
+
assembly_gen_category = block["pdbx_struct_assembly_gen"]
|
|
1424
|
+
except KeyError:
|
|
960
1425
|
raise InvalidFileError(
|
|
961
1426
|
"File has no 'pdbx_struct_assembly_gen' category"
|
|
962
1427
|
)
|
|
963
1428
|
|
|
964
|
-
|
|
965
|
-
"pdbx_struct_oper_list"
|
|
966
|
-
|
|
967
|
-
if struct_oper_category is None:
|
|
1429
|
+
try:
|
|
1430
|
+
struct_oper_category = block["pdbx_struct_oper_list"]
|
|
1431
|
+
except KeyError:
|
|
968
1432
|
raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")
|
|
969
1433
|
|
|
1434
|
+
assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
|
|
970
1435
|
if assembly_id is None:
|
|
971
|
-
assembly_id =
|
|
972
|
-
elif assembly_id not in
|
|
1436
|
+
assembly_id = assembly_ids[0]
|
|
1437
|
+
elif assembly_id not in assembly_ids:
|
|
973
1438
|
raise KeyError(f"File has no Assembly ID '{assembly_id}'")
|
|
974
1439
|
|
|
975
1440
|
### Calculate all possible transformations
|
|
@@ -982,6 +1447,8 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
982
1447
|
if "label_asym_id" in extra_fields:
|
|
983
1448
|
extra_fields_and_asym = extra_fields
|
|
984
1449
|
else:
|
|
1450
|
+
# The operations apply on asym IDs
|
|
1451
|
+
# -> they need to be included to select the correct atoms
|
|
985
1452
|
extra_fields_and_asym = extra_fields + ["label_asym_id"]
|
|
986
1453
|
structure = get_structure(
|
|
987
1454
|
pdbx_file,
|
|
@@ -990,14 +1457,15 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
990
1457
|
altloc,
|
|
991
1458
|
extra_fields_and_asym,
|
|
992
1459
|
use_author_fields,
|
|
1460
|
+
include_bonds
|
|
993
1461
|
)
|
|
994
1462
|
|
|
995
1463
|
### Get transformations and apply them to the affected asym IDs
|
|
996
1464
|
assembly = None
|
|
997
1465
|
for id, op_expr, asym_id_expr in zip(
|
|
998
|
-
assembly_gen_category["assembly_id"],
|
|
999
|
-
assembly_gen_category["oper_expression"],
|
|
1000
|
-
assembly_gen_category["asym_id_list"],
|
|
1466
|
+
assembly_gen_category["assembly_id"].as_array(str),
|
|
1467
|
+
assembly_gen_category["oper_expression"].as_array(str),
|
|
1468
|
+
assembly_gen_category["asym_id_list"].as_array(str),
|
|
1001
1469
|
):
|
|
1002
1470
|
# Find the operation expressions for given assembly ID
|
|
1003
1471
|
# We already asserted that the ID is actually present
|
|
@@ -1017,12 +1485,12 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
|
|
|
1017
1485
|
assembly = sub_assembly
|
|
1018
1486
|
else:
|
|
1019
1487
|
assembly += sub_assembly
|
|
1020
|
-
|
|
1488
|
+
|
|
1021
1489
|
# Remove 'label_asym_id', if it was not included in the original
|
|
1022
1490
|
# user-supplied 'extra_fields'
|
|
1023
1491
|
if "label_asym_id" not in extra_fields:
|
|
1024
1492
|
assembly.del_annotation("label_asym_id")
|
|
1025
|
-
|
|
1493
|
+
|
|
1026
1494
|
return assembly
|
|
1027
1495
|
|
|
1028
1496
|
|
|
@@ -1056,19 +1524,20 @@ def _get_transformations(struct_oper):
|
|
|
1056
1524
|
translation for each operation ID in ``pdbx_struct_oper_list``.
|
|
1057
1525
|
"""
|
|
1058
1526
|
transformation_dict = {}
|
|
1059
|
-
for index, id in enumerate(struct_oper["id"]):
|
|
1527
|
+
for index, id in enumerate(struct_oper["id"].as_array(str)):
|
|
1060
1528
|
rotation_matrix = np.array(
|
|
1061
1529
|
[
|
|
1062
1530
|
[
|
|
1063
|
-
|
|
1531
|
+
struct_oper[f"matrix[{i}][{j}]"].as_array(float)[index]
|
|
1064
1532
|
for j in (1, 2, 3)
|
|
1065
1533
|
]
|
|
1066
1534
|
for i in (1, 2, 3)
|
|
1067
1535
|
]
|
|
1068
1536
|
)
|
|
1069
|
-
translation_vector = np.array(
|
|
1070
|
-
|
|
1071
|
-
|
|
1537
|
+
translation_vector = np.array([
|
|
1538
|
+
struct_oper[f"vector[{i}]"].as_array(float)[index]
|
|
1539
|
+
for i in (1, 2, 3)
|
|
1540
|
+
])
|
|
1072
1541
|
transformation_dict[id] = (rotation_matrix, translation_vector)
|
|
1073
1542
|
return transformation_dict
|
|
1074
1543
|
|
|
@@ -1082,25 +1551,26 @@ def _parse_operation_expression(expression):
|
|
|
1082
1551
|
# Split groups by parentheses:
|
|
1083
1552
|
# use the opening parenthesis as delimiter
|
|
1084
1553
|
# and just remove the closing parenthesis
|
|
1554
|
+
# example: '(X0)(1-10,21-25)' from 1a34
|
|
1085
1555
|
expressions_per_step = expression.replace(")", "").split("(")
|
|
1086
1556
|
expressions_per_step = [e for e in expressions_per_step if len(e) > 0]
|
|
1087
1557
|
# Important: Operations are applied from right to left
|
|
1088
1558
|
expressions_per_step.reverse()
|
|
1089
1559
|
|
|
1090
1560
|
operations = []
|
|
1091
|
-
for
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1561
|
+
for one_step_expr in expressions_per_step:
|
|
1562
|
+
one_step_op_ids = []
|
|
1563
|
+
for expr in one_step_expr.split(","):
|
|
1564
|
+
if "-" in expr:
|
|
1565
|
+
# Range of operation IDs, they must be integers
|
|
1566
|
+
first, last = expr.split("-")
|
|
1567
|
+
one_step_op_ids.extend(
|
|
1568
|
+
[str(id) for id in range(int(first), int(last) + 1)]
|
|
1569
|
+
)
|
|
1570
|
+
else:
|
|
1571
|
+
# Single operation ID
|
|
1572
|
+
one_step_op_ids.append(expr)
|
|
1573
|
+
operations.append(one_step_op_ids)
|
|
1104
1574
|
|
|
1105
1575
|
# Cartesian product of operations
|
|
1106
1576
|
return list(itertools.product(*operations))
|
|
@@ -1112,6 +1582,8 @@ def _convert_string_to_sequence(string, stype):
|
|
|
1112
1582
|
``proteinseq_type_list`` or to ``NucleotideSequence`` if `stype` is
|
|
1113
1583
|
contained in ``_nucleotideseq_type_list``.
|
|
1114
1584
|
"""
|
|
1585
|
+
# sequence may be stored as multiline string
|
|
1586
|
+
string = string.replace("\n", "")
|
|
1115
1587
|
if stype in _proteinseq_type_list:
|
|
1116
1588
|
return ProteinSequence(string)
|
|
1117
1589
|
elif stype in _nucleotideseq_type_list:
|