biotite 0.39.0__cp310-cp310-win_amd64.whl → 0.40.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (104) hide show
  1. biotite/__init__.py +3 -3
  2. biotite/application/dssp/app.py +18 -18
  3. biotite/database/rcsb/download.py +19 -14
  4. biotite/sequence/align/banded.c +258 -237
  5. biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
  6. biotite/sequence/align/kmeralphabet.c +243 -222
  7. biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
  8. biotite/sequence/align/kmersimilarity.c +215 -196
  9. biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
  10. biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
  11. biotite/sequence/align/kmertable.cpp +233 -205
  12. biotite/sequence/align/localgapped.c +258 -237
  13. biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
  14. biotite/sequence/align/localungapped.c +235 -214
  15. biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
  16. biotite/sequence/align/multiple.c +255 -234
  17. biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
  18. biotite/sequence/align/pairwise.c +274 -253
  19. biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
  20. biotite/sequence/align/permutation.c +215 -196
  21. biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
  22. biotite/sequence/align/selector.c +217 -197
  23. biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
  24. biotite/sequence/align/tracetable.c +215 -195
  25. biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
  26. biotite/sequence/codec.c +235 -214
  27. biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
  28. biotite/sequence/phylo/nj.c +215 -196
  29. biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
  30. biotite/sequence/phylo/tree.c +227 -202
  31. biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
  32. biotite/sequence/phylo/upgma.c +215 -196
  33. biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
  34. biotite/structure/basepairs.py +7 -12
  35. biotite/structure/bonds.c +1175 -1226
  36. biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
  37. biotite/structure/celllist.c +217 -197
  38. biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
  39. biotite/structure/charges.c +1052 -1101
  40. biotite/structure/charges.cp310-win_amd64.pyd +0 -0
  41. biotite/structure/filter.py +30 -37
  42. biotite/structure/info/__init__.py +5 -8
  43. biotite/structure/info/atoms.py +25 -67
  44. biotite/structure/info/bonds.py +46 -100
  45. biotite/structure/info/ccd/README.rst +8 -0
  46. biotite/structure/info/ccd/amino_acids.txt +1646 -0
  47. biotite/structure/info/ccd/carbohydrates.txt +1133 -0
  48. biotite/structure/info/ccd/components.bcif +0 -0
  49. biotite/structure/info/ccd/nucleotides.txt +797 -0
  50. biotite/structure/info/ccd.py +95 -0
  51. biotite/structure/info/groups.py +90 -0
  52. biotite/structure/info/masses.py +21 -20
  53. biotite/structure/info/misc.py +11 -22
  54. biotite/structure/info/standardize.py +17 -12
  55. biotite/structure/io/__init__.py +2 -4
  56. biotite/structure/io/ctab.py +1 -1
  57. biotite/structure/io/general.py +37 -43
  58. biotite/structure/io/mmtf/__init__.py +3 -0
  59. biotite/structure/io/mmtf/convertarray.c +219 -198
  60. biotite/structure/io/mmtf/convertarray.cp310-win_amd64.pyd +0 -0
  61. biotite/structure/io/mmtf/convertfile.c +217 -197
  62. biotite/structure/io/mmtf/convertfile.cp310-win_amd64.pyd +0 -0
  63. biotite/structure/io/mmtf/decode.c +225 -204
  64. biotite/structure/io/mmtf/decode.cp310-win_amd64.pyd +0 -0
  65. biotite/structure/io/mmtf/encode.c +215 -196
  66. biotite/structure/io/mmtf/encode.cp310-win_amd64.pyd +0 -0
  67. biotite/structure/io/mmtf/file.py +34 -26
  68. biotite/structure/io/npz/__init__.py +3 -0
  69. biotite/structure/io/npz/file.py +21 -18
  70. biotite/structure/io/pdb/__init__.py +3 -3
  71. biotite/structure/io/pdb/file.py +5 -3
  72. biotite/structure/io/pdb/hybrid36.c +63 -43
  73. biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
  74. biotite/structure/io/pdbqt/file.py +32 -32
  75. biotite/structure/io/pdbx/__init__.py +13 -6
  76. biotite/structure/io/pdbx/bcif.py +649 -0
  77. biotite/structure/io/pdbx/cif.py +1028 -0
  78. biotite/structure/io/pdbx/component.py +243 -0
  79. biotite/structure/io/pdbx/convert.py +707 -359
  80. biotite/structure/io/pdbx/encoding.c +112813 -0
  81. biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
  82. biotite/structure/io/pdbx/error.py +14 -0
  83. biotite/structure/io/pdbx/legacy.py +267 -0
  84. biotite/structure/molecules.py +151 -151
  85. biotite/structure/sasa.c +215 -196
  86. biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
  87. biotite/structure/superimpose.py +158 -115
  88. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
  89. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/RECORD +92 -90
  90. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
  91. biotite/structure/info/amino_acids.json +0 -1556
  92. biotite/structure/info/amino_acids.py +0 -42
  93. biotite/structure/info/carbohydrates.json +0 -1122
  94. biotite/structure/info/carbohydrates.py +0 -39
  95. biotite/structure/info/intra_bonds.msgpack +0 -0
  96. biotite/structure/info/link_types.msgpack +0 -1
  97. biotite/structure/info/nucleotides.json +0 -772
  98. biotite/structure/info/nucleotides.py +0 -39
  99. biotite/structure/info/residue_masses.msgpack +0 -0
  100. biotite/structure/info/residue_names.msgpack +0 -3
  101. biotite/structure/info/residues.msgpack +0 -0
  102. biotite/structure/io/pdbx/file.py +0 -652
  103. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
  104. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
@@ -17,21 +17,50 @@ __all__ = [
17
17
 
18
18
  import itertools
19
19
  import warnings
20
- from collections import OrderedDict
21
20
  import numpy as np
22
21
  from ....file import InvalidFileError
23
22
  from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
24
23
  from ...atoms import AtomArray, AtomArrayStack, repeat
25
- from ...bonds import BondList, BondType
24
+ from ...bonds import BondList, BondType, connect_via_residue_names
26
25
  from ...box import unitcell_from_vectors, vectors_from_unitcell
27
26
  from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
28
- from ...residues import get_residue_count
27
+ from ...residues import get_residue_count, get_residue_starts_for
29
28
  from ...error import BadStructureError
30
29
  from ...util import matrix_rotate
30
+ from .legacy import PDBxFile
31
+ from .component import MaskValue
32
+ from .cif import CIFFile, CIFBlock
33
+ from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
34
+ from .encoding import StringArrayEncoding
31
35
 
32
36
 
33
- # Map 'chem_comp_bond' bond orders to 'BondType'...
34
- BOND_ORDER_TO_BOND_TYPE = {
37
+ # Cond types in `struct_conn` category that refer to covalent bonds
38
+ PDBX_COVALENT_TYPES = [
39
+ "covale", "covale_base", "covale_phosphate", "covale_sugar",
40
+ "disulf", "modres", "modres_link", "metalc"
41
+ ]
42
+ # Map 'struct_conn' bond orders to 'BondType'...
43
+ PDBX_BOND_ORDER_TO_TYPE = {
44
+ "": BondType.ANY,
45
+ "sing": BondType.SINGLE,
46
+ "doub": BondType.DOUBLE,
47
+ "trip": BondType.TRIPLE,
48
+ "quad": BondType.QUADRUPLE,
49
+ }
50
+ # ...and vice versa
51
+ PDBX_BOND_TYPE_TO_ORDER = {
52
+ # 'ANY' is masked later, it is merely added here to avoid a KeyError
53
+ BondType.ANY: "",
54
+ BondType.SINGLE: "sing",
55
+ BondType.DOUBLE: "doub",
56
+ BondType.TRIPLE: "trip",
57
+ BondType.QUADRUPLE: "quad",
58
+ BondType.AROMATIC_SINGLE: "sing",
59
+ BondType.AROMATIC_DOUBLE: "doub",
60
+ BondType.AROMATIC_TRIPLE: "trip",
61
+ }
62
+ # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
63
+ COMP_BOND_ORDER_TO_TYPE = {
35
64
  ("SING", "N") : BondType.SINGLE,
36
65
  ("DOUB", "N") : BondType.DOUBLE,
37
66
  ("TRIP", "N") : BondType.TRIPLE,
@@ -41,11 +70,10 @@ BOND_ORDER_TO_BOND_TYPE = {
41
70
  ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
42
71
  }
43
72
  # ...and vice versa
44
- BOND_TYPE_TO_BOND_ORDER = {
45
- bond_type: order for order, bond_type in BOND_ORDER_TO_BOND_TYPE.items()
73
+ COMP_BOND_TYPE_TO_ORDER = {
74
+ bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
46
75
  }
47
76
 
48
-
49
77
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
50
78
  _nucleotideseq_type_list = [
51
79
  "polydeoxyribonucleotide",
@@ -61,6 +89,27 @@ _other_type_list = [
61
89
  ]
62
90
 
63
91
 
92
+ def _filter(category, index):
93
+ """
94
+ Reduce the given category to the rows selected by the given
95
+ index.
96
+ """
97
+ Category = type(category)
98
+ Column = Category.subcomponent_class()
99
+ Data = Column.subcomponent_class()
100
+
101
+ return Category({
102
+ key: Column(
103
+ Data(column.data.array[index]),
104
+ (
105
+ Data(column.mask.array[index])
106
+ if column.mask is not None else None
107
+ )
108
+ )
109
+ for key, column in category.items()
110
+ })
111
+
112
+
64
113
  def get_sequence(pdbx_file, data_block=None):
65
114
  """
66
115
  Get the protein and nucleotide sequences from the
@@ -74,11 +123,14 @@ def get_sequence(pdbx_file, data_block=None):
74
123
 
75
124
  Parameters
76
125
  ----------
77
- pdbx_file : PDBxFile
126
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
78
127
  The file object.
79
- data_block : string, optional
80
- The name of the data block. Default is the first
81
- (and most times only) data block of the file.
128
+ data_block : str, optional
129
+ The name of the data block.
130
+ Default is the first (and most times only) data block of the
131
+ file.
132
+ If the data block object is passed directly to `pdbx_file`,
133
+ this parameter is ignored.
82
134
 
83
135
  Returns
84
136
  -------
@@ -86,50 +138,55 @@ def get_sequence(pdbx_file, data_block=None):
86
138
  The protein and nucleotide sequences for each entity
87
139
  (equivalent to chains in most cases).
88
140
  """
89
- poly_dict = pdbx_file.get_category("entity_poly", data_block)
90
- seq_string = poly_dict["pdbx_seq_one_letter_code_can"]
91
- seq_type = poly_dict["type"]
141
+ block = _get_block(pdbx_file, data_block)
142
+
143
+ poly_category= block["entity_poly"]
144
+ seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
145
+ seq_type = poly_category["type"].as_array(str)
92
146
  sequences = []
93
- if isinstance(seq_string, np.ndarray):
94
- for string, stype in zip(seq_string, seq_type):
95
- sequence = _convert_string_to_sequence(string, stype)
96
- if sequence is not None:
97
- sequences.append(sequence)
98
- else:
99
- sequences.append(_convert_string_to_sequence(seq_string, seq_type))
147
+ for string, stype in zip(seq_string, seq_type):
148
+ sequence = _convert_string_to_sequence(string, stype)
149
+ if sequence is not None:
150
+ sequences.append(sequence)
100
151
  return sequences
101
152
 
102
153
 
103
- def get_model_count(file, data_block=None):
154
+ def get_model_count(pdbx_file, data_block=None):
104
155
  """
105
156
  Get the number of models contained in a :class:`PDBxFile`.
106
157
 
107
158
  Parameters
108
159
  ----------
109
- file : PDBxFile
160
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
110
161
  The file object.
111
162
  data_block : str, optional
112
- The name of the data block. Default is the first
113
- (and most times only) data block of the file.
163
+ The name of the data block.
164
+ Default is the first (and most times only) data block of the
165
+ file.
166
+ If the data block object is passed directly to `pdbx_file`,
167
+ this parameter is ignored.
114
168
 
115
169
  Returns
116
170
  -------
117
171
  model_count : int
118
172
  The number of models.
119
173
  """
120
- atom_site_dict = file.get_category("atom_site", data_block)
121
- return len(_get_model_starts(atom_site_dict["pdbx_PDB_model_num"]))
174
+ block = _get_block(pdbx_file, data_block)
175
+ return len(_get_model_starts(
176
+ block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
177
+ ))
122
178
 
123
179
 
124
180
  def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
125
- extra_fields=None, use_author_fields=True):
181
+ extra_fields=None, use_author_fields=True,
182
+ include_bonds=False):
126
183
  """
127
184
  Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
128
185
  ``atom_site`` category in a :class:`PDBxFile`.
129
186
 
130
187
  Parameters
131
188
  ----------
132
- pdbx_file : PDBxFile
189
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
133
190
  The file object.
134
191
  model : int, optional
135
192
  If this parameter is given, the function will return an
@@ -141,8 +198,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
141
198
  containing all models will be returned, even if the structure
142
199
  contains only one model.
143
200
  data_block : str, optional
144
- The name of the data block. Default is the first
145
- (and most times only) data block of the file.
201
+ The name of the data block.
202
+ Default is the first (and most times only) data block of the
203
+ file.
204
+ If the data block object is passed directly to `pdbx_file`,
205
+ this parameter is ignored.
146
206
  altloc : {'first', 'occupancy', 'all'}
147
207
  This parameter defines how *altloc* IDs are handled:
148
208
  - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -176,6 +236,15 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
176
236
  otherwise from the the ``label_xxx`` fields.
177
237
  If the requested field is not available, the respective other
178
238
  field is taken as fallback.
239
+ include_bonds : bool, optional
240
+ If set to true, a :class:`BondList` will be created for the
241
+ resulting :class:`AtomArray` containing the bond information
242
+ from the file.
243
+ Bonds, whose order could not be determined from the
244
+ *Chemical Component Dictionary*
245
+ (e.g. especially inter-residue bonds),
246
+ have :attr:`BondType.ANY`, since the PDB format itself does
247
+ not support bond orders.
179
248
 
180
249
  Returns
181
250
  -------
@@ -186,31 +255,35 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
186
255
  --------
187
256
 
188
257
  >>> import os.path
189
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1l2y.cif"))
258
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
190
259
  >>> arr = get_structure(file, model=1)
191
260
  >>> print(len(arr))
192
261
  304
193
262
 
194
263
  """
195
- extra_fields = [] if extra_fields is None else extra_fields
264
+ block = _get_block(pdbx_file, data_block)
265
+
266
+ extra_fields = set() if extra_fields is None else set(extra_fields)
196
267
 
197
- atom_site_dict = pdbx_file.get_category("atom_site", data_block)
198
- if atom_site_dict is None:
268
+ atom_site = block.get("atom_site")
269
+ if atom_site is None:
199
270
  raise InvalidFileError("Missing 'atom_site' category in file")
200
-
201
- models = atom_site_dict["pdbx_PDB_model_num"]
271
+
272
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
202
273
  model_starts = _get_model_starts(models)
203
274
  model_count = len(model_starts)
204
275
  atom_count = len(models)
205
276
 
206
277
  if model is None:
207
278
  # For a stack, the annotations are derived from the first model
208
- model_dict = _get_model_dict(atom_site_dict, model_starts, 1)
279
+ model_atom_site = _filter_model(atom_site, model_starts, 1)
209
280
  # Any field of the category would work here to get the length
210
- model_length = len(model_dict["group_PDB"])
281
+ model_length = model_atom_site.row_count
211
282
  stack = AtomArrayStack(model_count, model_length)
212
283
 
213
- _fill_annotations(stack, model_dict, extra_fields, use_author_fields)
284
+ _fill_annotations(
285
+ stack, model_atom_site, extra_fields, use_author_fields
286
+ )
214
287
 
215
288
  # Check if each model has the same amount of atoms
216
289
  # If not, raise exception
@@ -221,22 +294,24 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
221
294
  "instead"
222
295
  )
223
296
 
224
- stack.coord = np.zeros(
225
- (model_count, model_length, 3), dtype=np.float32
226
- )
227
- stack.coord[:, :, 0] = atom_site_dict["Cartn_x"].reshape(
228
- (model_count, model_length)
229
- )
230
- stack.coord[:, :, 1] = atom_site_dict["Cartn_y"].reshape(
231
- (model_count, model_length)
232
- )
233
- stack.coord[:, :, 2] = atom_site_dict["Cartn_z"].reshape(
234
- (model_count, model_length)
235
- )
297
+ stack.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
298
+ .reshape((model_count, model_length))
299
+ stack.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
300
+ .reshape((model_count, model_length))
301
+ stack.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
302
+ .reshape((model_count, model_length))
303
+
304
+ if include_bonds:
305
+ bonds = connect_via_residue_names(stack)
306
+ if "struct_conn" in block:
307
+ bonds = bonds.merge(_parse_inter_residue_bonds(
308
+ model_atom_site, block["struct_conn"]
309
+ ))
310
+ stack.bonds = bonds
236
311
 
237
- stack = _filter_altloc(stack, model_dict, altloc)
312
+ stack = _filter_altloc(stack, model_atom_site, altloc)
238
313
 
239
- box = _get_box(pdbx_file, data_block)
314
+ box = _get_box(block)
240
315
  if box is not None:
241
316
  # Duplicate same box for each model
242
317
  stack.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
@@ -254,169 +329,284 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
254
329
  f"the given model {model} does not exist"
255
330
  )
256
331
 
257
- model_dict = _get_model_dict(atom_site_dict, model_starts, model)
332
+ model_atom_site = _filter_model(atom_site, model_starts, model)
258
333
  # Any field of the category would work here to get the length
259
- model_length = len(model_dict["group_PDB"])
334
+ model_length = model_atom_site.row_count
260
335
  array = AtomArray(model_length)
261
336
 
262
- _fill_annotations(array, model_dict, extra_fields, use_author_fields)
263
-
264
- # Append exclusive stop
265
- model_starts = np.append(
266
- model_starts, [len(atom_site_dict["group_PDB"])]
267
- )
268
- # Indexing starts at 0, but model number starts at 1
269
- model_index = model - 1
270
- start, stop = model_starts[model_index], model_starts[model_index + 1]
271
- array.coord = np.zeros((model_length, 3), dtype=np.float32)
272
- array.coord[:, 0] = atom_site_dict["Cartn_x"][start:stop].astype(
273
- np.float32
274
- )
275
- array.coord[:, 1] = atom_site_dict["Cartn_y"][start:stop].astype(
276
- np.float32
277
- )
278
- array.coord[:, 2] = atom_site_dict["Cartn_z"][start:stop].astype(
279
- np.float32
337
+ _fill_annotations(
338
+ array, model_atom_site, extra_fields, use_author_fields
280
339
  )
281
340
 
282
- array = _filter_altloc(array, model_dict, altloc)
341
+ array.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
342
+ array.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
343
+ array.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
344
+
345
+ if include_bonds:
346
+ bonds = connect_via_residue_names(array)
347
+ if "struct_conn" in block:
348
+ bonds = bonds.merge(_parse_inter_residue_bonds(
349
+ model_atom_site, block["struct_conn"]
350
+ ))
351
+ array.bonds = bonds
352
+
353
+ array = _filter_altloc(array, model_atom_site, altloc)
283
354
 
284
- array.box = _get_box(pdbx_file, data_block)
355
+ array.box = _get_box(block)
285
356
 
286
357
  return array
287
358
 
288
359
 
289
- def _fill_annotations(array, model_dict, extra_fields, use_author_fields):
290
- """Fill atom_site annotations in atom array or atom array stack.
360
+ def _get_block(pdbx_component, block_name):
361
+ if isinstance(pdbx_component, PDBxFile):
362
+ # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
363
+ pdbx_component = pdbx_component.cif_file
364
+
365
+ if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
366
+ # Determine block
367
+ if block_name is None:
368
+ return pdbx_component.block
369
+ else:
370
+ return pdbx_component[block_name]
371
+ else:
372
+ return pdbx_component
291
373
 
292
- Parameters
293
- ----------
294
- array : AtomArray or AtomArrayStack
295
- Atom array or stack which will be annotated.
296
- model_dict : dict(str, ndarray)
297
- ``atom_site`` dictionary with values for one model.
298
- extra_fields : list of str
299
- Entry names, that are additionally added as annotation arrays.
300
- use_author_fields : bool
301
- Define if alternate fields prefixed with ``auth_`` should be used
302
- instead of ``label_``.
303
- """
304
374
 
305
- def get_or_fallback_from_dict(input_dict, key, fallback_key,
306
- dict_name="input"):
375
+ def _get_or_fallback(category, key, fallback_key, cat_name="input"):
307
376
  """
308
- Return value related to key in input dict if it exists,
309
- otherwise try to get the value related to fallback key."""
310
- if key not in input_dict:
377
+ Return column related to key in category if it exists,
378
+ otherwise try to get the column related to fallback key.
379
+ """
380
+ if key not in category:
311
381
  warnings.warn(
312
- f"Attribute '{key}' not found within '{dict_name}' category. "
382
+ f"Attribute '{key}' not found within '{cat_name}' category. "
313
383
  f"The fallback attribute '{fallback_key}' will be used instead",
314
384
  UserWarning
315
385
  )
316
386
  try:
317
- return input_dict[fallback_key]
387
+ return category[fallback_key]
318
388
  except KeyError as key_exc:
319
389
  raise InvalidFileError(
320
390
  f"Fallback attribute '{fallback_key}' not found in "
321
391
  "'{dict_name}' category"
322
392
  ) from key_exc
323
- return input_dict[key]
324
-
325
- def get_annotation_from_model(
326
- model_dict,
327
- annotation_name,
328
- annotation_fallback=None,
329
- as_type=None,
330
- formatter=None,
331
- ):
332
- """Get and format annotation array from model dictionary."""
333
- array = (
334
- get_or_fallback_from_dict(
335
- model_dict, annotation_name, annotation_fallback,
336
- dict_name="atom_site"
337
- )
338
- if annotation_fallback is not None
339
- else model_dict[annotation_name]
340
- )
341
- if as_type is not None:
342
- array = array.astype(as_type)
343
- return formatter(array) if formatter is not None else array
393
+ return category[key]
394
+
395
+
396
+ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
397
+ """Fill atom_site annotations in atom array or atom array stack.
398
+
399
+ Parameters
400
+ ----------
401
+ array : AtomArray or AtomArrayStack
402
+ Atom array or stack which will be annotated.
403
+ atom_site : CIFCategory or BinaryCIFCategory
404
+ ``atom_site`` category with values for one model.
405
+ extra_fields : list of str
406
+ Entry names, that are additionally added as annotation arrays.
407
+ use_author_fields : bool
408
+ Define if alternate fields prefixed with ``auth_`` should be used
409
+ instead of ``label_``.
410
+ """
344
411
 
345
412
  prefix, alt_prefix = (
346
413
  ("auth", "label") if use_author_fields else ("label", "auth")
347
414
  )
348
415
 
349
- annotation_data = {
350
- "chain_id": (f"{prefix}_asym_id", f"{alt_prefix}_asym_id", "U4", None),
351
- "res_id": (
352
- f"{prefix}_seq_id",
353
- f"{alt_prefix}_seq_id",
354
- None,
355
- lambda annot: np.array(
356
- [-1 if elt in [".", "?"] else int(elt) for elt in annot]
357
- ),
358
- ),
359
- "ins_code": (
360
- "pdbx_PDB_ins_code",
361
- None,
362
- "U1",
363
- lambda annot: np.array(
364
- ["" if elt in [".", "?"] else elt for elt in annot]
365
- ),
366
- ),
367
- "res_name": (f"{prefix}_comp_id", f"{alt_prefix}_comp_id", "U5", None),
368
- "hetero": ("group_PDB", None, None, lambda annot: annot == "HETATM"),
369
- "atom_name": (
370
- f"{prefix}_atom_id",
371
- f"{alt_prefix}_atom_id",
372
- "U6",
373
- None,
374
- ),
375
- "element": ("type_symbol", None, "U2", None),
376
- "atom_id": ("id", None, int, None),
377
- "b_factor": ("B_iso_or_equiv", None, float, None),
378
- "occupancy": ("occupancy", None, float, None),
379
- "charge": (
380
- "pdbx_formal_charge",
381
- None,
382
- None,
383
- lambda annot: np.array(
384
- [
385
- 0 if charge in ["?", "."] else int(charge)
386
- for charge in annot
387
- ],
388
- dtype=int,
389
- ),
390
- ),
391
- }
392
-
393
- mandatory_annotations = [
416
+ array.set_annotation(
394
417
  "chain_id",
418
+ _get_or_fallback(
419
+ atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
420
+ ).as_array("U4")
421
+ )
422
+ array.set_annotation(
395
423
  "res_id",
424
+ _get_or_fallback(
425
+ atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
426
+ ).as_array(int, -1)
427
+ )
428
+ array.set_annotation(
396
429
  "ins_code",
430
+ atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
431
+ )
432
+ array.set_annotation(
397
433
  "res_name",
434
+ _get_or_fallback(
435
+ atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
436
+ ).as_array("U5")
437
+ )
438
+ array.set_annotation(
398
439
  "hetero",
440
+ atom_site["group_PDB"].as_array(str) == "HETATM"
441
+ )
442
+ array.set_annotation(
399
443
  "atom_name",
444
+ _get_or_fallback(
445
+ atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
446
+ ).as_array("U6")
447
+ )
448
+ array.set_annotation(
400
449
  "element",
401
- ]
450
+ atom_site["type_symbol"].as_array("U2")
451
+ )
402
452
 
403
- # Iterate over mandatory annotations and given extra_fields
404
- for annotation_name in mandatory_annotations + extra_fields:
453
+ if "atom_id" in extra_fields:
405
454
  array.set_annotation(
406
- annotation_name,
407
- get_annotation_from_model(
408
- model_dict, *annotation_data[annotation_name]
409
- )
410
- if annotation_name in annotation_data
411
- else get_annotation_from_model(
412
- model_dict, annotation_name, as_type=str
413
- ),
455
+ "atom_id",
456
+ atom_site["id"].as_array(int)
457
+ )
458
+ extra_fields.remove("atom_id")
459
+ if "b_factor" in extra_fields:
460
+ array.set_annotation(
461
+ "b_factor",
462
+ atom_site["B_iso_or_equiv"].as_array(float)
463
+ )
464
+ extra_fields.remove("b_factor")
465
+ if "occupancy" in extra_fields:
466
+ array.set_annotation(
467
+ "occupancy",
468
+ atom_site["occupancy"].as_array(float)
469
+ )
470
+ extra_fields.remove("occupancy")
471
+ if "charge" in extra_fields:
472
+ array.set_annotation(
473
+ "charge",
474
+ atom_site["pdbx_formal_charge"].as_array(int, 0)
475
+ )
476
+ extra_fields.remove("charge")
477
+
478
+ # Handle all remaining custom fields
479
+ for field in extra_fields:
480
+ array.set_annotation(
481
+ field,
482
+ atom_site[field].as_array(str)
483
+ )
484
+
485
+
486
+ def _parse_inter_residue_bonds(atom_site, struct_conn):
487
+ """
488
+ Create inter-residue bonds by parsing the ``struct_conn`` category.
489
+ The atom indices of each bond are found by matching the bond labels
490
+ to the ``atom_site`` category.
491
+ """
492
+ # Identity symmetry operation
493
+ IDENTITY = "1_555"
494
+ # Columns in 'atom_site' that should be matched by 'struct_conn'
495
+ COLUMNS = [
496
+ "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
497
+ "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
498
+ "pdbx_PDB_ins_code"
499
+ ]
500
+
501
+ covale_mask = np.isin(
502
+ struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
503
+ )
504
+ if "ptnr1_symmetry" in struct_conn:
505
+ covale_mask &= (
506
+ struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
507
+ )
508
+ if "ptnr2_symmetry" in struct_conn:
509
+ covale_mask &= (
510
+ struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
511
+ )
512
+
513
+ atom_indices = [None] * 2
514
+ for i in range(2):
515
+ reference_arrays = []
516
+ query_arrays = []
517
+ for col_name in COLUMNS:
518
+ struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
519
+ if (
520
+ col_name not in atom_site
521
+ or struct_conn_col_name not in struct_conn
522
+ ):
523
+ continue
524
+ # Ensure both arrays have the same dtype to allow comparison
525
+ reference = atom_site[col_name].as_array()
526
+ dtype = reference.dtype
527
+ query = struct_conn[struct_conn_col_name].as_array(dtype)
528
+ if np.issubdtype(reference.dtype, str):
529
+ # The mask value is not necessarily consistent
530
+ # between query and reference
531
+ # -> make it consistent
532
+ reference[reference == "?"] = "."
533
+ query[query == "?"] = "."
534
+ reference_arrays.append(reference)
535
+ query_arrays.append(query[covale_mask])
536
+ # Match the combination of 'label_asym_id', 'label_comp_id', etc.
537
+ # in 'atom_site' and 'struct_conn'
538
+ atom_indices[i] = _find_matches(query_arrays, reference_arrays)
539
+ atoms_indices_1 = atom_indices[0]
540
+ atoms_indices_2 = atom_indices[1]
541
+
542
+ # Some bonds in 'struct_conn' may not be found in 'atom_site'
543
+ # This is okay,
544
+ # as 'atom_site' might already be reduced to a single model
545
+ mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
546
+ atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
547
+ atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
548
+
549
+ # Interpret missing values as ANY bonds
550
+ bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
551
+ # Consecutively apply the same masks as applied to the atom indices
552
+ # Logical combination does not work here,
553
+ # as the second mask was created based on already filtered data
554
+ bond_order = bond_order[covale_mask][mapping_exists_mask]
555
+ bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
556
+
557
+ return BondList(
558
+ atom_site.row_count,
559
+ np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
560
+ )
561
+
562
+
563
+ def _find_matches(query_arrays, reference_arrays):
564
+ """
565
+ For each index in the `query_arrays` find the indices in the
566
+ `reference_arrays` where all query values match the reference counterpart.
567
+ If no match is found for a query, the corresponding index is -1.
568
+ """
569
+ match_masks_for_all_columns = np.stack([
570
+ query[:, np.newaxis] == reference[np.newaxis, :]
571
+ for query, reference in zip(query_arrays, reference_arrays)
572
+ ], axis=-1)
573
+ match_masks = np.all(match_masks_for_all_columns, axis=-1)
574
+ query_matches, reference_matches = np.where(match_masks)
575
+
576
+ # Duplicate matches indicate that an atom from the query cannot
577
+ # be uniquely matched to an atom in the reference
578
+ unique_query_matches, counts = np.unique(query_matches, return_counts=True)
579
+ if np.any(counts > 1):
580
+ ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
581
+ raise InvalidFileError(
582
+ f"The covalent bond in the 'struct_conn' category at index "
583
+ f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
584
+ f"the 'atom_site' category"
414
585
  )
415
586
 
587
+ # -1 indicates that no match was found in the reference
588
+ match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
589
+ match_indices[query_matches] = reference_matches
590
+ return match_indices
416
591
 
417
- def _filter_altloc(array, model_dict, altloc):
418
- altloc_ids = model_dict.get("label_alt_id")
419
- occupancy = model_dict.get("occupancy")
592
+
593
+ def _get_struct_conn_col_name(col_name, partner):
594
+ """
595
+ For a column name in ``atom_site`` get the corresponding column name
596
+ in ``struct_conn``.
597
+ """
598
+ if col_name == "label_alt_id":
599
+ return f"pdbx_ptnr{partner}_label_alt_id"
600
+ elif col_name.startswith("pdbx_"):
601
+ # Move 'pdbx_' to front
602
+ return f"pdbx_ptnr{partner}_{col_name[5:]}"
603
+ else:
604
+ return f"ptnr{partner}_{col_name}"
605
+
606
+
607
+ def _filter_altloc(array, atom_site, altloc):
608
+ altloc_ids = atom_site.get("label_alt_id")
609
+ occupancy = atom_site.get("occupancy")
420
610
 
421
611
  # Filter altloc IDs and return
422
612
  if altloc_ids is None:
@@ -425,14 +615,14 @@ def _filter_altloc(array, model_dict, altloc):
425
615
  return array[
426
616
  ...,
427
617
  filter_highest_occupancy_altloc(
428
- array, altloc_ids, occupancy.astype(float)
618
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
429
619
  ),
430
620
  ]
431
621
  # 'first' is also fallback if file has no occupancy information
432
622
  elif altloc == "first":
433
- return array[..., filter_first_altloc(array, altloc_ids)]
623
+ return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
434
624
  elif altloc == "all":
435
- array.set_annotation("altloc_id", altloc_ids)
625
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
436
626
  return array
437
627
  else:
438
628
  raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
@@ -443,49 +633,46 @@ def _get_model_starts(model_array):
443
633
  Get the start index for each model in the arrays of the
444
634
  ``atom_site`` category.
445
635
  """
446
- models, indices = np.unique(model_array, return_index=True)
636
+ _, indices = np.unique(model_array, return_index=True)
447
637
  indices.sort()
448
638
  return indices
449
639
 
450
640
 
451
- def _get_model_dict(atom_site_dict, model_starts, model):
641
+ def _filter_model(atom_site, model_starts, model):
452
642
  """
453
- Reduce the ``atom_site`` dictionary to the values for the given
643
+ Reduce the ``atom_site`` category to the values for the given
454
644
  model.
455
645
  """
646
+ Category = type(atom_site)
647
+ Column = Category.subcomponent_class()
648
+ Data = Column.subcomponent_class()
649
+
456
650
  # Append exclusive stop
457
651
  model_starts = np.append(
458
- model_starts, [len(atom_site_dict["pdbx_PDB_model_num"])]
652
+ model_starts, [atom_site.row_count]
459
653
  )
460
- model_dict = {}
461
654
  # Indexing starts at 0, but model number starts at 1
462
655
  model_index = model - 1
463
- for key in atom_site_dict.keys():
464
- model_dict[key] = atom_site_dict[key][
465
- model_starts[model_index] : model_starts[model_index + 1]
466
- ]
467
- return model_dict
656
+ index = slice(model_starts[model_index], model_starts[model_index + 1])
657
+ return _filter(atom_site, index)
468
658
 
469
659
 
470
- def _get_box(pdbx_file, data_block):
471
- if data_block is None:
472
- cell_dict = pdbx_file.get("cell")
473
- else:
474
- cell_dict = pdbx_file.get((data_block, "cell"))
475
- if cell_dict is None:
660
+ def _get_box(block):
661
+ cell = block.get("cell")
662
+ if cell is None:
476
663
  return None
477
664
  try:
478
665
  len_a, len_b, len_c = [
479
- float(cell_dict[length])
666
+ float(cell[length].as_item())
480
667
  for length in ["length_a", "length_b", "length_c"]
481
668
  ]
669
+ alpha, beta, gamma = [
670
+ np.deg2rad(float(cell[angle].as_item()))
671
+ for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
672
+ ]
482
673
  except ValueError:
483
674
  # 'cell_dict' has no proper unit cell values, e.g. '?'
484
675
  return None
485
- alpha, beta, gamma = [
486
- np.deg2rad(float(cell_dict[angle]))
487
- for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
488
- ]
489
676
  return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
490
677
 
491
678
 
@@ -496,69 +683,90 @@ def set_structure(pdbx_file, array, data_block=None):
496
683
 
497
684
  This will save the coordinates, the mandatory annotation categories
498
685
  and the optional annotation categories
499
- ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
686
+ ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
500
687
  If the atom array (stack) contains the annotation ``'atom_id'``,
501
688
  these values will be used for atom numbering instead of continuous
502
689
  numbering.
690
+ Furthermore, inter-residue bonds will be written into the
691
+ ``struct_conn`` category.
503
692
 
504
693
  Parameters
505
694
  ----------
506
- pdbx_file : PDBxFile
695
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
507
696
  The file object.
508
697
  array : AtomArray or AtomArrayStack
509
698
  The structure to be written. If a stack is given, each array in
510
699
  the stack will be in a separate model.
511
700
  data_block : str, optional
512
- The name of the data block. Default is the first
513
- (and most times only) data block of the file.
701
+ The name of the data block.
702
+ Default is the first (and most times only) data block of the
703
+ file.
704
+ If the data block object is passed directly to `pdbx_file`,
705
+ this parameter is ignored.
706
+ If the file is empty, a new data will be created.
707
+
708
+ Notes
709
+ -----
710
+ In some cases, the written inter-residue bonds cannot be read again
711
+ due to ambiguity to which atoms the bond refers.
712
+ This is the case, when two equal residues in the same chain have
713
+ the same (or a masked) `res_id`.
514
714
 
515
715
  Examples
516
716
  --------
517
717
 
518
718
  >>> import os.path
519
- >>> file = PDBxFile()
520
- >>> set_structure(file, atom_array, data_block="structure")
719
+ >>> file = CIFFile()
720
+ >>> set_structure(file, atom_array)
521
721
  >>> file.write(os.path.join(path_to_directory, "structure.cif"))
522
722
 
523
723
  """
724
+ block = _get_or_create_block(pdbx_file, data_block)
725
+ Category = block.subcomponent_class()
726
+ Column = Category.subcomponent_class()
727
+
524
728
  # Fill PDBx columns from information
525
729
  # in structures' attribute arrays as good as possible
526
- # Use OrderedDict in order to ensure the usually used column order.
527
- atom_site_dict = OrderedDict()
528
- # Save list of annotation categories for checks,
529
- # if an optional category exists
530
- annot_categories = array.get_annotation_categories()
531
- atom_site_dict["group_PDB"] = np.array(
532
- ["ATOM" if e == False else "HETATM" for e in array.hetero]
730
+ atom_site = Category()
731
+ atom_site["group_PDB"] = np.where(
732
+ array.hetero, "HETATM", "ATOM"
733
+ )
734
+ atom_site["type_symbol"] = np.copy(array.element)
735
+ atom_site["label_atom_id"] = np.copy(array.atom_name)
736
+ atom_site["label_alt_id"] = Column(
737
+ # AtomArrays do not store altloc atoms
738
+ np.full(array.array_length(), "."),
739
+ np.full(array.array_length(), MaskValue.INAPPLICABLE),
740
+ )
741
+ atom_site["label_comp_id"] = np.copy(array.res_name)
742
+ atom_site["label_asym_id"] = np.copy(array.chain_id)
743
+ atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
744
+ atom_site["label_seq_id"] = np.copy(array.res_id)
745
+ atom_site["pdbx_PDB_ins_code"] = Column(
746
+ np.copy(array.ins_code),
747
+ np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
533
748
  )
534
- atom_site_dict["type_symbol"] = np.copy(array.element)
535
- atom_site_dict["label_atom_id"] = np.copy(array.atom_name)
536
- atom_site_dict["label_alt_id"] = np.full(array.array_length(), ".")
537
- atom_site_dict["label_comp_id"] = np.copy(array.res_name)
538
- atom_site_dict["label_asym_id"] = np.copy(array.chain_id)
539
- atom_site_dict["label_entity_id"] = _determine_entity_id(array.chain_id)
540
- atom_site_dict["label_seq_id"] = np.array([str(e) for e in array.res_id])
541
- atom_site_dict["pdbx_PDB_ins_code"] = array.ins_code
542
- atom_site_dict["auth_seq_id"] = atom_site_dict["label_seq_id"]
543
- atom_site_dict["auth_comp_id"] = atom_site_dict["label_comp_id"]
544
- atom_site_dict["auth_asym_id"] = atom_site_dict["label_asym_id"]
545
- atom_site_dict["auth_atom_id"] = atom_site_dict["label_atom_id"]
749
+ atom_site["auth_seq_id"] = atom_site["label_seq_id"]
750
+ atom_site["auth_comp_id"] = atom_site["label_comp_id"]
751
+ atom_site["auth_asym_id"] = atom_site["label_asym_id"]
752
+ atom_site["auth_atom_id"] = atom_site["label_atom_id"]
546
753
 
754
+ annot_categories = array.get_annotation_categories()
547
755
  if "atom_id" in annot_categories:
548
- atom_site_dict["id"] = array.atom_id.astype(str)
756
+ atom_site["id"] = np.copy(array.atom_id)
549
757
  if "b_factor" in annot_categories:
550
- atom_site_dict["B_iso_or_equiv"] = np.array(
551
- [f"{b:.2f}" for b in array.b_factor]
552
- )
758
+ atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
553
759
  if "occupancy" in annot_categories:
554
- atom_site_dict["occupancy"] = np.array(
555
- [f"{occ:.2f}" for occ in array.occupancy]
556
- )
760
+ atom_site["occupancy"] = np.copy(array.occupancy)
557
761
  if "charge" in annot_categories:
558
- atom_site_dict["pdbx_formal_charge"] = np.array(
559
- [f"{c:+d}" if c != 0 else "?" for c in array.charge]
762
+ atom_site["pdbx_formal_charge"] = Column(
763
+ np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
764
+ np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
560
765
  )
561
766
 
767
+ if array.bonds is not None:
768
+ block["struct_conn"] = _set_inter_residue_bonds(array, atom_site)
769
+
562
770
  # In case of a single model handle each coordinate
563
771
  # simply like a flattened array
564
772
  if type(array) == AtomArray or (
@@ -566,42 +774,34 @@ def set_structure(pdbx_file, array, data_block=None):
566
774
  ):
567
775
  # 'ravel' flattens coord without copy
568
776
  # in case of stack with stack_depth = 1
569
- atom_site_dict["Cartn_x"] = np.array(
570
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 0])]
571
- )
572
- atom_site_dict["Cartn_y"] = np.array(
573
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 1])]
574
- )
575
- atom_site_dict["Cartn_z"] = np.array(
576
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 2])]
577
- )
578
- atom_site_dict["pdbx_PDB_model_num"] = np.full(
579
- array.array_length(), "1"
777
+ atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
778
+ atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
779
+ atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
780
+ atom_site["pdbx_PDB_model_num"] = np.ones(
781
+ array.array_length(), dtype=np.int32
580
782
  )
581
783
  # In case of multiple models repeat annotations
582
784
  # and use model specific coordinates
583
785
  elif type(array) == AtomArrayStack:
584
- for key, value in atom_site_dict.items():
585
- atom_site_dict[key] = np.tile(value, reps=array.stack_depth())
786
+ atom_site = _repeat(atom_site, array.stack_depth())
586
787
  coord = np.reshape(
587
788
  array.coord, (array.stack_depth() * array.array_length(), 3)
588
789
  )
589
- atom_site_dict["Cartn_x"] = np.array([f"{c:.3f}" for c in coord[:, 0]])
590
- atom_site_dict["Cartn_y"] = np.array([f"{c:.3f}" for c in coord[:, 1]])
591
- atom_site_dict["Cartn_z"] = np.array([f"{c:.3f}" for c in coord[:, 2]])
592
- models = np.repeat(
593
- np.arange(1, array.stack_depth() + 1).astype(str),
790
+ atom_site["Cartn_x"] = np.copy(coord[:, 0])
791
+ atom_site["Cartn_y"] = np.copy(coord[:, 1])
792
+ atom_site["Cartn_z"] = np.copy(coord[:, 2])
793
+ atom_site["pdbx_PDB_model_num"] = np.repeat(
794
+ np.arange(1, array.stack_depth() + 1, dtype=np.int32),
594
795
  repeats=array.array_length(),
595
796
  )
596
- atom_site_dict["pdbx_PDB_model_num"] = models
597
797
  else:
598
798
  raise ValueError("Structure must be AtomArray or AtomArrayStack")
599
799
  if not "atom_id" in annot_categories:
600
800
  # Count from 1
601
- atom_site_dict["id"] = np.arange(
602
- 1, len(atom_site_dict["group_PDB"]) + 1
603
- ).astype("U6")
604
- pdbx_file.set_category("atom_site", atom_site_dict, data_block)
801
+ atom_site["id"] = np.arange(
802
+ 1, len(atom_site["group_PDB"]) + 1
803
+ )
804
+ block["atom_site"] = atom_site
605
805
 
606
806
  # Write box into file
607
807
  if array.box is not None:
@@ -612,14 +812,38 @@ def set_structure(pdbx_file, array, data_block=None):
612
812
  else:
613
813
  box = array.box
614
814
  len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
615
- cell_dict = OrderedDict()
616
- cell_dict["length_a"] = "{:6.3f}".format(len_a)
617
- cell_dict["length_b"] = "{:6.3f}".format(len_b)
618
- cell_dict["length_c"] = "{:6.3f}".format(len_c)
619
- cell_dict["angle_alpha"] = "{:5.3f}".format(np.rad2deg(alpha))
620
- cell_dict["angle_beta"] = "{:5.3f}".format(np.rad2deg(beta))
621
- cell_dict["angle_gamma"] = "{:5.3f}".format(np.rad2deg(gamma))
622
- pdbx_file.set_category("cell", cell_dict, data_block)
815
+ cell = Category()
816
+ cell["length_a"] = len_a
817
+ cell["length_b"] = len_b
818
+ cell["length_c"] = len_c
819
+ cell["angle_alpha"] = np.rad2deg(alpha)
820
+ cell["angle_beta"] = np.rad2deg(beta)
821
+ cell["angle_gamma"] = np.rad2deg(gamma)
822
+ block["cell"] = cell
823
+
824
+
825
+ def _get_or_create_block(pdbx_component, block_name):
826
+ if isinstance(pdbx_component, PDBxFile):
827
+ # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
828
+ pdbx_component = pdbx_component.cif_file
829
+
830
+ Block = pdbx_component.subcomponent_class()
831
+
832
+ if isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
833
+ if block_name is None:
834
+ if len(pdbx_component) > 0:
835
+ block_name = next(iter(pdbx_component.keys()))
836
+ else:
837
+ # File is empty -> invent a new block name
838
+ block_name = "structure"
839
+
840
+ if block_name not in pdbx_component:
841
+ block = Block()
842
+ pdbx_component[block_name] = block
843
+ return pdbx_component[block_name]
844
+ else:
845
+ # Already a block
846
+ return pdbx_component
623
847
 
624
848
 
625
849
  def _determine_entity_id(chain_id):
@@ -635,10 +859,81 @@ def _determine_entity_id(chain_id):
635
859
  id_translation[chain_id[i]] = id
636
860
  entity_id[i] = id_translation[chain_id[i]]
637
861
  id += 1
638
- return entity_id.astype(str)
862
+ return entity_id
863
+
864
+
865
+ def _repeat(category, repetitions):
866
+ Category = type(category)
867
+ Column = Category.subcomponent_class()
868
+ Data = Column.subcomponent_class()
869
+
870
+ category_dict = {}
871
+ for key, column in category.items():
872
+ if isinstance(column, BinaryCIFColumn):
873
+ data_encoding = column.data.encoding
874
+ # Optimization: The repeated string array has the same
875
+ # unique values, as the original string array
876
+ # -> Use same unique values (faster due to shorter array)
877
+ if isinstance(data_encoding[0], StringArrayEncoding):
878
+ data_encoding[0].strings = np.unique(column.data.array)
879
+ data = Data(np.tile(column.data.array, repetitions), data_encoding)
880
+ else:
881
+ data = Data(np.tile(column.data.array, repetitions))
882
+ mask = Data(np.tile(column.mask.array, repetitions)) \
883
+ if column.mask is not None else None
884
+ category_dict[key] = Column(data, mask)
885
+ return Category(category_dict)
639
886
 
640
887
 
641
- def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
888
+ def _set_inter_residue_bonds(array, atom_site):
889
+ """
890
+ Create the ``struct_conn`` category containing the inter-residue
891
+ bonds.
892
+ The involved atoms are identified by annotations from the
893
+ ``atom_site`` category.
894
+ """
895
+ COLUMNS = [
896
+ "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
897
+ "pdbx_PDB_ins_code"
898
+ ]
899
+
900
+ Category = type(atom_site)
901
+ Column = Category.subcomponent_class()
902
+
903
+ bond_array = array.bonds.as_array()
904
+ # To save computation time call 'get_residue_starts_for()' only once
905
+ # with indices of the first and second atom of each bond
906
+ residue_starts_1, residue_starts_2 = get_residue_starts_for(
907
+ array, bond_array[:, :2].flatten()
908
+ ).reshape(-1, 2).T
909
+ # Filter out all intra-residue bonds
910
+ bond_array = bond_array[residue_starts_1 != residue_starts_2]
911
+
912
+ struct_conn = Category()
913
+ struct_conn["id"] = np.arange(1, len(bond_array) + 1)
914
+ struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
915
+ struct_conn["pdbx_value_order"] = Column(
916
+ np.array(
917
+ [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]
918
+ ),
919
+ np.where(
920
+ bond_array[:, 2] == BondType.ANY,
921
+ MaskValue.MISSING, MaskValue.PRESENT,
922
+ )
923
+ )
924
+ # Write the identifying annotation...
925
+ for col_name in COLUMNS:
926
+ annot = atom_site[col_name].as_array()
927
+ # ...for each bond partner
928
+ for i in range(2):
929
+ atom_indices = bond_array[:, i]
930
+ struct_conn[_get_struct_conn_col_name(col_name, i+1)] \
931
+ = annot[atom_indices]
932
+ return struct_conn
933
+
934
+
935
+ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
936
+ res_name=None):
642
937
  """
643
938
  Create an :class:`AtomArray` for a chemical component from the
644
939
  ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -646,26 +941,37 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
646
941
 
647
942
  Parameters
648
943
  ----------
944
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
945
+ The file object.
649
946
  data_block : str, optional
650
- The name of the data block. Default is the first
651
- (and most times only) data block of the file.
947
+ The name of the data block.
948
+ Default is the first (and most times only) data block of the
949
+ file.
950
+ If the data block object is passed directly to `pdbx_file`,
951
+ this parameter is ignored.
652
952
  use_ideal_coord : bool, optional
653
953
  If true, the *ideal* coordinates are read from the file
654
954
  (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
655
955
  originating from computations.
656
956
  If set to false, alternative coordinates are read
657
957
  (``model_Cartn_<dim>_`` fields).
658
-
958
+ res_name : str
959
+ In rare cases the categories may contain rows for multiple
960
+ components.
961
+ In this case, the component with the given residue name is
962
+ read.
963
+ By default, all rows would be read in this case.
964
+
659
965
  Returns
660
966
  -------
661
967
  array : AtomArray
662
968
  The parsed chemical component.
663
-
969
+
664
970
  Examples
665
971
  --------
666
972
 
667
973
  >>> import os.path
668
- >>> file = PDBxFile.read(
974
+ >>> file = CIFFile.read(
669
975
  ... os.path.join(path_to_structures, "molecules", "TYR.cif")
670
976
  ... )
671
977
  >>> comp = get_component(file)
@@ -695,26 +1001,31 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
695
1001
  HET 0 TYR HH H -0.123 -0.399 -5.059
696
1002
  HET 0 TYR HXT H -1.333 -0.030 4.784
697
1003
  """
698
- atom_dict = pdbx_file.get_category(
699
- "chem_comp_atom", block=data_block, expect_looped=True
700
- )
701
- if atom_dict is None:
1004
+ block = _get_block(pdbx_file, data_block)
1005
+
1006
+ try:
1007
+ atom_category = block["chem_comp_atom"]
1008
+ except KeyError:
702
1009
  raise InvalidFileError("Missing 'chem_comp_atom' category in file")
703
- bond_dict = pdbx_file.get_category(
704
- "chem_comp_bond", block=data_block, expect_looped=True
705
- )
1010
+ if res_name is not None:
1011
+ atom_category = _filter(
1012
+ atom_category, atom_category["comp_id"].as_array() == res_name
1013
+ )
1014
+ if len(atom_category) == 0:
1015
+ raise KeyError(
1016
+ f"No rows with residue name '{res_name}' found in "
1017
+ f"'chem_comp_atom' category"
1018
+ )
706
1019
 
707
- array = AtomArray(len(list(atom_dict.values())[0]))
1020
+ array = AtomArray(atom_category.row_count)
708
1021
 
709
1022
  array.hetero[:] = True
710
- array.res_name = atom_dict["comp_id"]
711
- array.atom_name = atom_dict["atom_id"]
712
- array.element = atom_dict["type_symbol"]
1023
+ array.res_name = atom_category["comp_id"].as_array("U5")
1024
+ array.atom_name = atom_category["atom_id"].as_array("U6")
1025
+ array.element = atom_category["type_symbol"].as_array("U2")
713
1026
  array.add_annotation("charge", int)
714
- array.charge = np.array(
715
- [int(c) if c != "?" else 0 for c in atom_dict["charge"]]
716
- )
717
-
1027
+ array.charge = atom_category["charge"].as_array(int, 0)
1028
+
718
1029
  coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
719
1030
  alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
720
1031
  if not use_ideal_coord:
@@ -722,7 +1033,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
722
1033
  coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
723
1034
  try:
724
1035
  for i, field in enumerate(coord_fields):
725
- array.coord[:,i] = atom_dict[field]
1036
+ array.coord[:,i] = atom_category[field].as_array(np.float32)
726
1037
  except KeyError as err:
727
1038
  key = err.args[0]
728
1039
  warnings.warn(
@@ -731,9 +1042,15 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
731
1042
  UserWarning
732
1043
  )
733
1044
  for i, field in enumerate(alt_coord_fields):
734
- array.coord[:,i] = atom_dict[field]
735
-
736
- if bond_dict is None:
1045
+ array.coord[:,i] = atom_category[field].as_array(np.float32)
1046
+
1047
+ try:
1048
+ bond_category = block["chem_comp_bond"]
1049
+ if res_name is not None:
1050
+ bond_category = _filter(
1051
+ bond_category, bond_category["comp_id"].as_array() == res_name
1052
+ )
1053
+ except KeyError:
737
1054
  warnings.warn(
738
1055
  f"Category 'chem_comp_bond' not found. "
739
1056
  f"No bonds will be parsed",
@@ -742,12 +1059,14 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
742
1059
  else:
743
1060
  bonds = BondList(array.array_length())
744
1061
  for atom1, atom2, order, aromatic_flag in zip(
745
- bond_dict["atom_id_1"], bond_dict["atom_id_2"],
746
- bond_dict["value_order"], bond_dict["pdbx_aromatic_flag"]
1062
+ bond_category["atom_id_1"].as_array(str),
1063
+ bond_category["atom_id_2"].as_array(str),
1064
+ bond_category["value_order"].as_array(str),
1065
+ bond_category["pdbx_aromatic_flag"].as_array(str)
747
1066
  ):
748
1067
  atom_i = np.where(array.atom_name == atom1)[0][0]
749
1068
  atom_j = np.where(array.atom_name == atom2)[0][0]
750
- bond_type = BOND_ORDER_TO_BOND_TYPE[order, aromatic_flag]
1069
+ bond_type = COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag]
751
1070
  bonds.add_bond(atom_i, atom_j, bond_type)
752
1071
  array.bonds = bonds
753
1072
 
@@ -766,15 +1085,22 @@ def set_component(pdbx_file, array, data_block=None):
766
1085
 
767
1086
  Parameters
768
1087
  ----------
769
- pdbx_file : PDBxFile
1088
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
770
1089
  The file object.
771
1090
  array : AtomArray
772
1091
  The chemical component to be written.
773
1092
  Must contain only a single residue.
774
1093
  data_block : str, optional
775
- The name of the data block. Default is the first
776
- (and most times only) data block of the file.
1094
+ The name of the data block.
1095
+ Default is the first (and most times only) data block of the
1096
+ file.
1097
+ If the file is empty, a new data will be created.
1098
+ If the data block object is passed directly to `pdbx_file`,
1099
+ this parameter is ignored.
777
1100
  """
1101
+ block = _get_or_create_block(pdbx_file, data_block)
1102
+ Category = block.subcomponent_class()
1103
+
778
1104
  if get_residue_count(array) > 1:
779
1105
  raise BadStructureError(
780
1106
  "The input atom array must comprise only one residue"
@@ -787,45 +1113,44 @@ def set_component(pdbx_file, array, data_block=None):
787
1113
  else:
788
1114
  charge = np.full(array.array_length(), "?", dtype="U2")
789
1115
 
790
- chem_comp_dict = OrderedDict()
791
- chem_comp_dict["comp_id"] = np.full(array.array_length(), res_name)
792
- chem_comp_dict["atom_id"] = np.copy(array.atom_name)
793
- chem_comp_dict["alt_atom_id"] = chem_comp_dict["atom_id"]
794
- chem_comp_dict["type_symbol"] = np.copy(array.element)
795
- chem_comp_dict["charge"] = charge
796
- chem_comp_dict["model_Cartn_x"] = np.copy(array.coord[:, 0])
797
- chem_comp_dict["model_Cartn_y"] = np.copy(array.coord[:, 1])
798
- chem_comp_dict["model_Cartn_z"] = np.copy(array.coord[:, 2])
799
- chem_comp_dict["pdbx_model_Cartn_x_ideal"] = chem_comp_dict["model_Cartn_x"]
800
- chem_comp_dict["pdbx_model_Cartn_y_ideal"] = chem_comp_dict["model_Cartn_y"]
801
- chem_comp_dict["pdbx_model_Cartn_z_ideal"] = chem_comp_dict["model_Cartn_z"]
802
- chem_comp_dict["pdbx_component_atom_id"] = chem_comp_dict["atom_id"]
803
- chem_comp_dict["pdbx_component_comp_id"] = chem_comp_dict["comp_id"]
804
- chem_comp_dict["pdbx_ordinal"] = np.arange(
1116
+ atom_cat = Category()
1117
+ atom_cat["comp_id"] = np.full(array.array_length(), res_name)
1118
+ atom_cat["atom_id"] = np.copy(array.atom_name)
1119
+ atom_cat["alt_atom_id"] = atom_cat["atom_id"]
1120
+ atom_cat["type_symbol"] = np.copy(array.element)
1121
+ atom_cat["charge"] = charge
1122
+ atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
1123
+ atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
1124
+ atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
1125
+ atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
1126
+ atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
1127
+ atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
1128
+ atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
1129
+ atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
1130
+ atom_cat["pdbx_ordinal"] = np.arange(
805
1131
  1, array.array_length() + 1
806
1132
  ).astype(str)
807
- pdbx_file.set_category("chem_comp_atom", chem_comp_dict, data_block)
1133
+ block["chem_comp_atom"] = atom_cat
808
1134
 
809
1135
  if array.bonds is not None:
810
1136
  bond_array = array.bonds.as_array()
811
1137
  order_flags = []
812
1138
  aromatic_flags = []
813
1139
  for bond_type in bond_array[:,2]:
814
- order_flag, aromatic_flag = BOND_TYPE_TO_BOND_ORDER[bond_type]
1140
+ order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
815
1141
  order_flags.append(order_flag)
816
1142
  aromatic_flags.append(aromatic_flag)
817
1143
 
818
- chem_comp_bond_dict = OrderedDict()
819
- chem_comp_bond_dict["comp_id"] = np.full(len(bond_array), res_name)
820
- chem_comp_bond_dict["atom_id_1"] = array.atom_name[bond_array[:,0]]
821
- chem_comp_bond_dict["atom_id_2"] = array.atom_name[bond_array[:,1]]
822
- chem_comp_bond_dict["value_order"] = np.array(order_flags)
823
- chem_comp_bond_dict["pdbx_aromatic_flag"] = np.array(aromatic_flags)
824
- chem_comp_bond_dict["pdbx_ordinal"] = np.arange(
1144
+ bond_cat = Category()
1145
+ bond_cat["comp_id"] = np.full(len(bond_array), res_name)
1146
+ bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
1147
+ bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
1148
+ bond_cat["value_order"] = np.array(order_flags)
1149
+ bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
1150
+ bond_cat["pdbx_ordinal"] = np.arange(
825
1151
  1, len(bond_array) + 1
826
1152
  ).astype(str)
827
- pdbx_file.set_category("chem_comp_bond", chem_comp_bond_dict, data_block)
828
-
1153
+ block["chem_comp_bond"] = bond_cat
829
1154
 
830
1155
  def list_assemblies(pdbx_file, data_block=None):
831
1156
  """
@@ -838,23 +1163,25 @@ def list_assemblies(pdbx_file, data_block=None):
838
1163
 
839
1164
  Parameters
840
1165
  ----------
841
- pdbx_file : PDBxFile
1166
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
842
1167
  The file object.
843
1168
  data_block : str, optional
844
1169
  The name of the data block.
845
- Defaults to the first (and most times only) data block of the
1170
+ Default is the first (and most times only) data block of the
846
1171
  file.
1172
+ If the data block object is passed directly to `pdbx_file`,
1173
+ this parameter is ignored.
847
1174
 
848
1175
  Returns
849
1176
  -------
850
1177
  assemblies : dict of str -> str
851
1178
  A dictionary that maps an assembly ID to a description of the
852
1179
  corresponding assembly.
853
-
1180
+
854
1181
  Examples
855
1182
  --------
856
1183
  >>> import os.path
857
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1184
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
858
1185
  >>> assembly_ids = list_assemblies(file)
859
1186
  >>> for key, val in assembly_ids.items():
860
1187
  ... print(f"'{key}' : '{val}'")
@@ -865,21 +1192,24 @@ def list_assemblies(pdbx_file, data_block=None):
865
1192
  '5' : 'icosahedral asymmetric unit, std point frame'
866
1193
  '6' : 'crystal asymmetric unit, crystal frame'
867
1194
  """
868
- assembly_category = pdbx_file.get_category(
869
- "pdbx_struct_assembly", data_block, expect_looped=True
870
- )
871
- if assembly_category is None:
1195
+ block = _get_block(pdbx_file, data_block)
1196
+
1197
+ try:
1198
+ assembly_category = block["pdbx_struct_assembly"]
1199
+ except KeyError:
872
1200
  raise InvalidFileError("File has no 'pdbx_struct_assembly' category")
873
1201
  return {
874
1202
  id: details
875
1203
  for id, details in zip(
876
- assembly_category["id"], assembly_category["details"]
1204
+ assembly_category["id"].as_array(str),
1205
+ assembly_category["details"].as_array(str)
877
1206
  )
878
1207
  }
879
1208
 
880
1209
 
881
1210
  def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
882
- altloc="first", extra_fields=None, use_author_fields=True):
1211
+ altloc="first", extra_fields=None, use_author_fields=True,
1212
+ include_bonds=False):
883
1213
  """
884
1214
  Build the given biological assembly.
885
1215
 
@@ -890,7 +1220,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
890
1220
 
891
1221
  Parameters
892
1222
  ----------
893
- pdbx_file : PDBxFile
1223
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
894
1224
  The file object.
895
1225
  assembly_id : str
896
1226
  The assembly to build.
@@ -907,8 +1237,10 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
907
1237
  contains only one model.
908
1238
  data_block : str, optional
909
1239
  The name of the data block.
910
- Defaults to the first (and most times only) data block of the
1240
+ Default is the first (and most times only) data block of the
911
1241
  file.
1242
+ If the data block object is passed directly to `pdbx_file`,
1243
+ this parameter is ignored.
912
1244
  altloc : {'first', 'occupancy', 'all'}
913
1245
  This parameter defines how *altloc* IDs are handled:
914
1246
  - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -940,36 +1272,46 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
940
1272
  If `use_author_fields` is true, the annotation arrays will be
941
1273
  read from the ``auth_xxx`` fields (if applicable),
942
1274
  otherwise from the the ``label_xxx`` fields.
1275
+ include_bonds : bool, optional
1276
+ If set to true, a :class:`BondList` will be created for the
1277
+ resulting :class:`AtomArray` containing the bond information
1278
+ from the file.
1279
+ Bonds, whose order could not be determined from the
1280
+ *Chemical Component Dictionary*
1281
+ (e.g. especially inter-residue bonds),
1282
+ have :attr:`BondType.ANY`, since the PDB format itself does
1283
+ not support bond orders.
943
1284
 
944
1285
  Returns
945
1286
  -------
946
1287
  assembly : AtomArray or AtomArrayStack
947
1288
  The assembly. The return type depends on the `model` parameter.
948
-
1289
+
949
1290
  Examples
950
1291
  --------
951
1292
 
952
1293
  >>> import os.path
953
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1294
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
954
1295
  >>> assembly = get_assembly(file, model=1)
955
1296
  """
956
- assembly_gen_category = pdbx_file.get_category(
957
- "pdbx_struct_assembly_gen", data_block, expect_looped=True
958
- )
959
- if assembly_gen_category is None:
1297
+ block = _get_block(pdbx_file, data_block)
1298
+
1299
+ try:
1300
+ assembly_gen_category = block["pdbx_struct_assembly_gen"]
1301
+ except KeyError:
960
1302
  raise InvalidFileError(
961
1303
  "File has no 'pdbx_struct_assembly_gen' category"
962
1304
  )
963
1305
 
964
- struct_oper_category = pdbx_file.get_category(
965
- "pdbx_struct_oper_list", data_block, expect_looped=True
966
- )
967
- if struct_oper_category is None:
1306
+ try:
1307
+ struct_oper_category = block["pdbx_struct_oper_list"]
1308
+ except KeyError:
968
1309
  raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")
969
1310
 
1311
+ assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
970
1312
  if assembly_id is None:
971
- assembly_id = assembly_gen_category["assembly_id"][0]
972
- elif assembly_id not in assembly_gen_category["assembly_id"]:
1313
+ assembly_id = assembly_ids[0]
1314
+ elif assembly_id not in assembly_ids:
973
1315
  raise KeyError(f"File has no Assembly ID '{assembly_id}'")
974
1316
 
975
1317
  ### Calculate all possible transformations
@@ -982,6 +1324,8 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
982
1324
  if "label_asym_id" in extra_fields:
983
1325
  extra_fields_and_asym = extra_fields
984
1326
  else:
1327
+ # The operations apply on asym IDs
1328
+ # -> they need to be included to select the correct atoms
985
1329
  extra_fields_and_asym = extra_fields + ["label_asym_id"]
986
1330
  structure = get_structure(
987
1331
  pdbx_file,
@@ -990,14 +1334,15 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
990
1334
  altloc,
991
1335
  extra_fields_and_asym,
992
1336
  use_author_fields,
1337
+ include_bonds
993
1338
  )
994
1339
 
995
1340
  ### Get transformations and apply them to the affected asym IDs
996
1341
  assembly = None
997
1342
  for id, op_expr, asym_id_expr in zip(
998
- assembly_gen_category["assembly_id"],
999
- assembly_gen_category["oper_expression"],
1000
- assembly_gen_category["asym_id_list"],
1343
+ assembly_gen_category["assembly_id"].as_array(str),
1344
+ assembly_gen_category["oper_expression"].as_array(str),
1345
+ assembly_gen_category["asym_id_list"].as_array(str),
1001
1346
  ):
1002
1347
  # Find the operation expressions for given assembly ID
1003
1348
  # We already asserted that the ID is actually present
@@ -1017,12 +1362,12 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
1017
1362
  assembly = sub_assembly
1018
1363
  else:
1019
1364
  assembly += sub_assembly
1020
-
1365
+
1021
1366
  # Remove 'label_asym_id', if it was not included in the original
1022
1367
  # user-supplied 'extra_fields'
1023
1368
  if "label_asym_id" not in extra_fields:
1024
1369
  assembly.del_annotation("label_asym_id")
1025
-
1370
+
1026
1371
  return assembly
1027
1372
 
1028
1373
 
@@ -1056,19 +1401,20 @@ def _get_transformations(struct_oper):
1056
1401
  translation for each operation ID in ``pdbx_struct_oper_list``.
1057
1402
  """
1058
1403
  transformation_dict = {}
1059
- for index, id in enumerate(struct_oper["id"]):
1404
+ for index, id in enumerate(struct_oper["id"].as_array(str)):
1060
1405
  rotation_matrix = np.array(
1061
1406
  [
1062
1407
  [
1063
- float(struct_oper[f"matrix[{i}][{j}]"][index])
1408
+ struct_oper[f"matrix[{i}][{j}]"].as_array(float)[index]
1064
1409
  for j in (1, 2, 3)
1065
1410
  ]
1066
1411
  for i in (1, 2, 3)
1067
1412
  ]
1068
1413
  )
1069
- translation_vector = np.array(
1070
- [float(struct_oper[f"vector[{i}]"][index]) for i in (1, 2, 3)]
1071
- )
1414
+ translation_vector = np.array([
1415
+ struct_oper[f"vector[{i}]"].as_array(float)[index]
1416
+ for i in (1, 2, 3)
1417
+ ])
1072
1418
  transformation_dict[id] = (rotation_matrix, translation_vector)
1073
1419
  return transformation_dict
1074
1420
 
@@ -1112,6 +1458,8 @@ def _convert_string_to_sequence(string, stype):
1112
1458
  ``proteinseq_type_list`` or to ``NucleotideSequence`` if `stype` is
1113
1459
  contained in ``_nucleotideseq_type_list``.
1114
1460
  """
1461
+ # sequence may be stored as multiline string
1462
+ string = string.replace("\n", "")
1115
1463
  if stype in _proteinseq_type_list:
1116
1464
  return ProteinSequence(string)
1117
1465
  elif stype in _nucleotideseq_type_list: