biotite 0.39.0__cp312-cp312-macosx_11_0_arm64.whl → 0.41.0__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (121) hide show
  1. biotite/__init__.py +3 -3
  2. biotite/application/dssp/app.py +18 -18
  3. biotite/database/pubchem/download.py +23 -23
  4. biotite/database/pubchem/query.py +7 -7
  5. biotite/database/rcsb/download.py +19 -14
  6. biotite/file.py +17 -9
  7. biotite/sequence/align/banded.c +256 -235
  8. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  9. biotite/sequence/align/cigar.py +60 -15
  10. biotite/sequence/align/kmeralphabet.c +241 -220
  11. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  12. biotite/sequence/align/kmersimilarity.c +213 -194
  13. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  14. biotite/sequence/align/kmertable.cpp +231 -203
  15. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  16. biotite/sequence/align/localgapped.c +256 -235
  17. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  18. biotite/sequence/align/localungapped.c +233 -212
  19. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  20. biotite/sequence/align/multiple.c +253 -232
  21. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  22. biotite/sequence/align/pairwise.c +272 -251
  23. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  24. biotite/sequence/align/permutation.c +213 -194
  25. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  26. biotite/sequence/align/selector.c +215 -195
  27. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  28. biotite/sequence/align/tracetable.c +213 -193
  29. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  30. biotite/sequence/annotation.py +2 -2
  31. biotite/sequence/codec.c +233 -212
  32. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  33. biotite/sequence/io/fasta/convert.py +27 -24
  34. biotite/sequence/phylo/nj.c +213 -194
  35. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.c +225 -200
  37. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  38. biotite/sequence/phylo/upgma.c +213 -194
  39. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  40. biotite/structure/__init__.py +2 -0
  41. biotite/structure/basepairs.py +7 -12
  42. biotite/structure/bonds.c +1435 -1277
  43. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  44. biotite/structure/celllist.c +215 -195
  45. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  46. biotite/structure/charges.c +1050 -1099
  47. biotite/structure/charges.cpython-312-darwin.so +0 -0
  48. biotite/structure/dotbracket.py +2 -0
  49. biotite/structure/filter.py +30 -37
  50. biotite/structure/info/__init__.py +5 -8
  51. biotite/structure/info/atoms.py +31 -68
  52. biotite/structure/info/bonds.py +47 -101
  53. biotite/structure/info/ccd/README.rst +8 -0
  54. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  55. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  56. biotite/structure/info/ccd/components.bcif +0 -0
  57. biotite/structure/info/ccd/nucleotides.txt +798 -0
  58. biotite/structure/info/ccd.py +95 -0
  59. biotite/structure/info/groups.py +90 -0
  60. biotite/structure/info/masses.py +21 -20
  61. biotite/structure/info/misc.py +78 -25
  62. biotite/structure/info/standardize.py +17 -12
  63. biotite/structure/integrity.py +19 -70
  64. biotite/structure/io/__init__.py +2 -4
  65. biotite/structure/io/ctab.py +12 -106
  66. biotite/structure/io/general.py +167 -181
  67. biotite/structure/io/gro/file.py +16 -16
  68. biotite/structure/io/mmtf/__init__.py +3 -0
  69. biotite/structure/io/mmtf/convertarray.c +217 -196
  70. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  71. biotite/structure/io/mmtf/convertfile.c +215 -195
  72. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  73. biotite/structure/io/mmtf/decode.c +223 -202
  74. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  75. biotite/structure/io/mmtf/encode.c +213 -194
  76. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  77. biotite/structure/io/mmtf/file.py +34 -26
  78. biotite/structure/io/mol/__init__.py +4 -2
  79. biotite/structure/io/mol/convert.py +71 -7
  80. biotite/structure/io/mol/ctab.py +414 -0
  81. biotite/structure/io/mol/header.py +116 -0
  82. biotite/structure/io/mol/{file.py → mol.py} +69 -82
  83. biotite/structure/io/mol/sdf.py +909 -0
  84. biotite/structure/io/npz/__init__.py +3 -0
  85. biotite/structure/io/npz/file.py +21 -18
  86. biotite/structure/io/pdb/__init__.py +3 -3
  87. biotite/structure/io/pdb/file.py +89 -34
  88. biotite/structure/io/pdb/hybrid36.c +63 -43
  89. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  90. biotite/structure/io/pdbqt/file.py +32 -32
  91. biotite/structure/io/pdbx/__init__.py +12 -6
  92. biotite/structure/io/pdbx/bcif.py +648 -0
  93. biotite/structure/io/pdbx/cif.py +1032 -0
  94. biotite/structure/io/pdbx/component.py +246 -0
  95. biotite/structure/io/pdbx/convert.py +858 -386
  96. biotite/structure/io/pdbx/encoding.c +112803 -0
  97. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  98. biotite/structure/io/pdbx/legacy.py +267 -0
  99. biotite/structure/molecules.py +151 -151
  100. biotite/structure/repair.py +253 -0
  101. biotite/structure/sasa.c +213 -194
  102. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  103. biotite/structure/sequence.py +112 -0
  104. biotite/structure/superimpose.py +618 -116
  105. {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/METADATA +3 -3
  106. {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/RECORD +109 -103
  107. {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/WHEEL +1 -1
  108. biotite/structure/info/amino_acids.json +0 -1556
  109. biotite/structure/info/amino_acids.py +0 -42
  110. biotite/structure/info/carbohydrates.json +0 -1122
  111. biotite/structure/info/carbohydrates.py +0 -39
  112. biotite/structure/info/intra_bonds.msgpack +0 -0
  113. biotite/structure/info/link_types.msgpack +0 -1
  114. biotite/structure/info/nucleotides.json +0 -772
  115. biotite/structure/info/nucleotides.py +0 -39
  116. biotite/structure/info/residue_masses.msgpack +0 -0
  117. biotite/structure/info/residue_names.msgpack +0 -3
  118. biotite/structure/info/residues.msgpack +0 -0
  119. biotite/structure/io/pdbx/file.py +0 -652
  120. {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/LICENSE.rst +0 -0
  121. {biotite-0.39.0.dist-info → biotite-0.41.0.dist-info}/top_level.txt +0 -0
@@ -17,21 +17,50 @@ __all__ = [
17
17
 
18
18
  import itertools
19
19
  import warnings
20
- from collections import OrderedDict
21
20
  import numpy as np
22
21
  from ....file import InvalidFileError
23
22
  from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
24
23
  from ...atoms import AtomArray, AtomArrayStack, repeat
25
- from ...bonds import BondList, BondType
24
+ from ...bonds import BondList, BondType, connect_via_residue_names
26
25
  from ...box import unitcell_from_vectors, vectors_from_unitcell
27
26
  from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
28
- from ...residues import get_residue_count
27
+ from ...residues import get_residue_count, get_residue_starts_for
29
28
  from ...error import BadStructureError
30
29
  from ...util import matrix_rotate
30
+ from .legacy import PDBxFile
31
+ from .component import MaskValue
32
+ from .cif import CIFFile, CIFBlock
33
+ from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
34
+ from .encoding import StringArrayEncoding
31
35
 
32
36
 
33
- # Map 'chem_comp_bond' bond orders to 'BondType'...
34
- BOND_ORDER_TO_BOND_TYPE = {
37
+ # Bond types in `struct_conn` category that refer to covalent bonds
38
+ PDBX_COVALENT_TYPES = [
39
+ "covale", "covale_base", "covale_phosphate", "covale_sugar",
40
+ "disulf", "modres", "modres_link", "metalc"
41
+ ]
42
+ # Map 'struct_conn' bond orders to 'BondType'...
43
+ PDBX_BOND_ORDER_TO_TYPE = {
44
+ "": BondType.ANY,
45
+ "sing": BondType.SINGLE,
46
+ "doub": BondType.DOUBLE,
47
+ "trip": BondType.TRIPLE,
48
+ "quad": BondType.QUADRUPLE,
49
+ }
50
+ # ...and vice versa
51
+ PDBX_BOND_TYPE_TO_ORDER = {
52
+ # 'ANY' is masked later, it is merely added here to avoid a KeyError
53
+ BondType.ANY: "",
54
+ BondType.SINGLE: "sing",
55
+ BondType.DOUBLE: "doub",
56
+ BondType.TRIPLE: "trip",
57
+ BondType.QUADRUPLE: "quad",
58
+ BondType.AROMATIC_SINGLE: "sing",
59
+ BondType.AROMATIC_DOUBLE: "doub",
60
+ BondType.AROMATIC_TRIPLE: "trip",
61
+ }
62
+ # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
63
+ COMP_BOND_ORDER_TO_TYPE = {
35
64
  ("SING", "N") : BondType.SINGLE,
36
65
  ("DOUB", "N") : BondType.DOUBLE,
37
66
  ("TRIP", "N") : BondType.TRIPLE,
@@ -41,11 +70,10 @@ BOND_ORDER_TO_BOND_TYPE = {
41
70
  ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
42
71
  }
43
72
  # ...and vice versa
44
- BOND_TYPE_TO_BOND_ORDER = {
45
- bond_type: order for order, bond_type in BOND_ORDER_TO_BOND_TYPE.items()
73
+ COMP_BOND_TYPE_TO_ORDER = {
74
+ bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
46
75
  }
47
76
 
48
-
49
77
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
50
78
  _nucleotideseq_type_list = [
51
79
  "polydeoxyribonucleotide",
@@ -61,6 +89,27 @@ _other_type_list = [
61
89
  ]
62
90
 
63
91
 
92
+ def _filter(category, index):
93
+ """
94
+ Reduce the ``atom_site`` category to the values for the given
95
+ model.
96
+ """
97
+ Category = type(category)
98
+ Column = Category.subcomponent_class()
99
+ Data = Column.subcomponent_class()
100
+
101
+ return Category({
102
+ key: Column(
103
+ Data(column.data.array[index]),
104
+ (
105
+ Data(column.mask.array[index])
106
+ if column.mask is not None else None
107
+ )
108
+ )
109
+ for key, column in category.items()
110
+ })
111
+
112
+
64
113
  def get_sequence(pdbx_file, data_block=None):
65
114
  """
66
115
  Get the protein and nucleotide sequences from the
@@ -74,11 +123,14 @@ def get_sequence(pdbx_file, data_block=None):
74
123
 
75
124
  Parameters
76
125
  ----------
77
- pdbx_file : PDBxFile
126
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
78
127
  The file object.
79
- data_block : string, optional
80
- The name of the data block. Default is the first
81
- (and most times only) data block of the file.
128
+ data_block : str, optional
129
+ The name of the data block.
130
+ Default is the first (and most times only) data block of the
131
+ file.
132
+ If the data block object is passed directly to `pdbx_file`,
133
+ this parameter is ignored.
82
134
 
83
135
  Returns
84
136
  -------
@@ -86,50 +138,55 @@ def get_sequence(pdbx_file, data_block=None):
86
138
  The protein and nucleotide sequences for each entity
87
139
  (equivalent to chains in most cases).
88
140
  """
89
- poly_dict = pdbx_file.get_category("entity_poly", data_block)
90
- seq_string = poly_dict["pdbx_seq_one_letter_code_can"]
91
- seq_type = poly_dict["type"]
141
+ block = _get_block(pdbx_file, data_block)
142
+
143
+ poly_category= block["entity_poly"]
144
+ seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
145
+ seq_type = poly_category["type"].as_array(str)
92
146
  sequences = []
93
- if isinstance(seq_string, np.ndarray):
94
- for string, stype in zip(seq_string, seq_type):
95
- sequence = _convert_string_to_sequence(string, stype)
96
- if sequence is not None:
97
- sequences.append(sequence)
98
- else:
99
- sequences.append(_convert_string_to_sequence(seq_string, seq_type))
147
+ for string, stype in zip(seq_string, seq_type):
148
+ sequence = _convert_string_to_sequence(string, stype)
149
+ if sequence is not None:
150
+ sequences.append(sequence)
100
151
  return sequences
101
152
 
102
153
 
103
- def get_model_count(file, data_block=None):
154
+ def get_model_count(pdbx_file, data_block=None):
104
155
  """
105
156
  Get the number of models contained in a :class:`PDBxFile`.
106
157
 
107
158
  Parameters
108
159
  ----------
109
- file : PDBxFile
160
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
110
161
  The file object.
111
162
  data_block : str, optional
112
- The name of the data block. Default is the first
113
- (and most times only) data block of the file.
163
+ The name of the data block.
164
+ Default is the first (and most times only) data block of the
165
+ file.
166
+ If the data block object is passed directly to `pdbx_file`,
167
+ this parameter is ignored.
114
168
 
115
169
  Returns
116
170
  -------
117
171
  model_count : int
118
172
  The number of models.
119
173
  """
120
- atom_site_dict = file.get_category("atom_site", data_block)
121
- return len(_get_model_starts(atom_site_dict["pdbx_PDB_model_num"]))
174
+ block = _get_block(pdbx_file, data_block)
175
+ return len(_get_model_starts(
176
+ block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
177
+ ))
122
178
 
123
179
 
124
180
  def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
125
- extra_fields=None, use_author_fields=True):
181
+ extra_fields=None, use_author_fields=True,
182
+ include_bonds=False):
126
183
  """
127
184
  Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
128
185
  ``atom_site`` category in a :class:`PDBxFile`.
129
186
 
130
187
  Parameters
131
188
  ----------
132
- pdbx_file : PDBxFile
189
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
133
190
  The file object.
134
191
  model : int, optional
135
192
  If this parameter is given, the function will return an
@@ -141,8 +198,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
141
198
  containing all models will be returned, even if the structure
142
199
  contains only one model.
143
200
  data_block : str, optional
144
- The name of the data block. Default is the first
145
- (and most times only) data block of the file.
201
+ The name of the data block.
202
+ Default is the first (and most times only) data block of the
203
+ file.
204
+ If the data block object is passed directly to `pdbx_file`,
205
+ this parameter is ignored.
146
206
  altloc : {'first', 'occupancy', 'all'}
147
207
  This parameter defines how *altloc* IDs are handled:
148
208
  - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -176,6 +236,15 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
176
236
  otherwise from the the ``label_xxx`` fields.
177
237
  If the requested field is not available, the respective other
178
238
  field is taken as fallback.
239
+ include_bonds : bool, optional
240
+ If set to true, a :class:`BondList` will be created for the
241
+ resulting :class:`AtomArray` containing the bond information
242
+ from the file.
243
+ Inter-residue bonds, will be read from the ``struct_conn``
244
+ category.
245
+ Intra-residue bonds will be read from the ``chem_comp_bond``, if
246
+ available, otherwise they will be derived from the Chemical
247
+ Component Dictionary.
179
248
 
180
249
  Returns
181
250
  -------
@@ -186,31 +255,31 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
186
255
  --------
187
256
 
188
257
  >>> import os.path
189
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1l2y.cif"))
258
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
190
259
  >>> arr = get_structure(file, model=1)
191
260
  >>> print(len(arr))
192
261
  304
193
262
 
194
263
  """
195
- extra_fields = [] if extra_fields is None else extra_fields
264
+ block = _get_block(pdbx_file, data_block)
265
+
266
+ extra_fields = set() if extra_fields is None else set(extra_fields)
196
267
 
197
- atom_site_dict = pdbx_file.get_category("atom_site", data_block)
198
- if atom_site_dict is None:
268
+ atom_site = block.get("atom_site")
269
+ if atom_site is None:
199
270
  raise InvalidFileError("Missing 'atom_site' category in file")
200
-
201
- models = atom_site_dict["pdbx_PDB_model_num"]
271
+
272
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
202
273
  model_starts = _get_model_starts(models)
203
274
  model_count = len(model_starts)
204
275
  atom_count = len(models)
205
276
 
206
277
  if model is None:
207
278
  # For a stack, the annotations are derived from the first model
208
- model_dict = _get_model_dict(atom_site_dict, model_starts, 1)
279
+ model_atom_site = _filter_model(atom_site, model_starts, 1)
209
280
  # Any field of the category would work here to get the length
210
- model_length = len(model_dict["group_PDB"])
211
- stack = AtomArrayStack(model_count, model_length)
212
-
213
- _fill_annotations(stack, model_dict, extra_fields, use_author_fields)
281
+ model_length = model_atom_site.row_count
282
+ atoms = AtomArrayStack(model_count, model_length)
214
283
 
215
284
  # Check if each model has the same amount of atoms
216
285
  # If not, raise exception
@@ -221,27 +290,17 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
221
290
  "instead"
222
291
  )
223
292
 
224
- stack.coord = np.zeros(
225
- (model_count, model_length, 3), dtype=np.float32
226
- )
227
- stack.coord[:, :, 0] = atom_site_dict["Cartn_x"].reshape(
228
- (model_count, model_length)
229
- )
230
- stack.coord[:, :, 1] = atom_site_dict["Cartn_y"].reshape(
231
- (model_count, model_length)
232
- )
233
- stack.coord[:, :, 2] = atom_site_dict["Cartn_z"].reshape(
234
- (model_count, model_length)
235
- )
236
-
237
- stack = _filter_altloc(stack, model_dict, altloc)
293
+ atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
294
+ .reshape((model_count, model_length))
295
+ atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
296
+ .reshape((model_count, model_length))
297
+ atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
298
+ .reshape((model_count, model_length))
238
299
 
239
- box = _get_box(pdbx_file, data_block)
300
+ box = _get_box(block)
240
301
  if box is not None:
241
302
  # Duplicate same box for each model
242
- stack.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
243
-
244
- return stack
303
+ atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
245
304
 
246
305
  else:
247
306
  if model == 0:
@@ -254,47 +313,94 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
254
313
  f"the given model {model} does not exist"
255
314
  )
256
315
 
257
- model_dict = _get_model_dict(atom_site_dict, model_starts, model)
316
+ model_atom_site = _filter_model(atom_site, model_starts, model)
258
317
  # Any field of the category would work here to get the length
259
- model_length = len(model_dict["group_PDB"])
260
- array = AtomArray(model_length)
318
+ model_length = model_atom_site.row_count
319
+ atoms = AtomArray(model_length)
261
320
 
262
- _fill_annotations(array, model_dict, extra_fields, use_author_fields)
321
+ atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
322
+ atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
323
+ atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
263
324
 
264
- # Append exclusive stop
265
- model_starts = np.append(
266
- model_starts, [len(atom_site_dict["group_PDB"])]
267
- )
268
- # Indexing starts at 0, but model number starts at 1
269
- model_index = model - 1
270
- start, stop = model_starts[model_index], model_starts[model_index + 1]
271
- array.coord = np.zeros((model_length, 3), dtype=np.float32)
272
- array.coord[:, 0] = atom_site_dict["Cartn_x"][start:stop].astype(
273
- np.float32
274
- )
275
- array.coord[:, 1] = atom_site_dict["Cartn_y"][start:stop].astype(
276
- np.float32
277
- )
278
- array.coord[:, 2] = atom_site_dict["Cartn_z"][start:stop].astype(
279
- np.float32
280
- )
325
+ atoms.box = _get_box(block)
281
326
 
282
- array = _filter_altloc(array, model_dict, altloc)
327
+ # The below part is the same for both, AtomArray and AtomArrayStack
328
+ _fill_annotations(
329
+ atoms, model_atom_site, extra_fields, use_author_fields
330
+ )
331
+ if include_bonds:
332
+ if "chem_comp_bond" in block:
333
+ try:
334
+ custom_bond_dict = _parse_intra_residue_bonds(
335
+ block["chem_comp_bond"]
336
+ )
337
+ except KeyError:
338
+ warnings.warn(
339
+ "The 'chem_comp_bond' category has missing columns, "
340
+ "falling back to using Chemical Component Dictionary",
341
+ UserWarning
342
+ )
343
+ custom_bond_dict = None
344
+ bonds = connect_via_residue_names(
345
+ atoms, custom_bond_dict=custom_bond_dict
346
+ )
347
+ else:
348
+ bonds = connect_via_residue_names(atoms)
349
+ if "struct_conn" in block:
350
+ bonds = bonds.merge(_parse_inter_residue_bonds(
351
+ model_atom_site, block["struct_conn"]
352
+ ))
353
+ atoms.bonds = bonds
354
+ atoms = _filter_altloc(atoms, model_atom_site, altloc)
355
+
356
+ return atoms
357
+
358
+
359
+ def _get_block(pdbx_component, block_name):
360
+ if isinstance(pdbx_component, PDBxFile):
361
+ # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
362
+ pdbx_component = pdbx_component.cif_file
363
+
364
+ if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
365
+ # Determine block
366
+ if block_name is None:
367
+ return pdbx_component.block
368
+ else:
369
+ return pdbx_component[block_name]
370
+ else:
371
+ return pdbx_component
283
372
 
284
- array.box = _get_box(pdbx_file, data_block)
285
373
 
286
- return array
374
+ def _get_or_fallback(category, key, fallback_key):
375
+ """
376
+ Return column related to key in category if it exists,
377
+ otherwise try to get the column related to fallback key.
378
+ """
379
+ if key not in category:
380
+ warnings.warn(
381
+ f"Attribute '{key}' not found within 'atom_site' category. "
382
+ f"The fallback attribute '{fallback_key}' will be used instead",
383
+ UserWarning
384
+ )
385
+ try:
386
+ return category[fallback_key]
387
+ except KeyError as key_exc:
388
+ raise InvalidFileError(
389
+ f"Fallback attribute '{fallback_key}' not found within "
390
+ "'atom_site' category"
391
+ ) from key_exc
392
+ return category[key]
287
393
 
288
394
 
289
- def _fill_annotations(array, model_dict, extra_fields, use_author_fields):
395
+ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
290
396
  """Fill atom_site annotations in atom array or atom array stack.
291
397
 
292
398
  Parameters
293
399
  ----------
294
400
  array : AtomArray or AtomArrayStack
295
401
  Atom array or stack which will be annotated.
296
- model_dict : dict(str, ndarray)
297
- ``atom_site`` dictionary with values for one model.
402
+ atom_site : CIFCategory or BinaryCIFCategory
403
+ ``atom_site`` category with values for one model.
298
404
  extra_fields : list of str
299
405
  Entry names, that are additionally added as annotation arrays.
300
406
  use_author_fields : bool
@@ -302,121 +408,226 @@ def _fill_annotations(array, model_dict, extra_fields, use_author_fields):
302
408
  instead of ``label_``.
303
409
  """
304
410
 
305
- def get_or_fallback_from_dict(input_dict, key, fallback_key,
306
- dict_name="input"):
307
- """
308
- Return value related to key in input dict if it exists,
309
- otherwise try to get the value related to fallback key."""
310
- if key not in input_dict:
311
- warnings.warn(
312
- f"Attribute '{key}' not found within '{dict_name}' category. "
313
- f"The fallback attribute '{fallback_key}' will be used instead",
314
- UserWarning
315
- )
316
- try:
317
- return input_dict[fallback_key]
318
- except KeyError as key_exc:
319
- raise InvalidFileError(
320
- f"Fallback attribute '{fallback_key}' not found in "
321
- "'{dict_name}' category"
322
- ) from key_exc
323
- return input_dict[key]
324
-
325
- def get_annotation_from_model(
326
- model_dict,
327
- annotation_name,
328
- annotation_fallback=None,
329
- as_type=None,
330
- formatter=None,
331
- ):
332
- """Get and format annotation array from model dictionary."""
333
- array = (
334
- get_or_fallback_from_dict(
335
- model_dict, annotation_name, annotation_fallback,
336
- dict_name="atom_site"
337
- )
338
- if annotation_fallback is not None
339
- else model_dict[annotation_name]
340
- )
341
- if as_type is not None:
342
- array = array.astype(as_type)
343
- return formatter(array) if formatter is not None else array
344
-
345
411
  prefix, alt_prefix = (
346
412
  ("auth", "label") if use_author_fields else ("label", "auth")
347
413
  )
348
414
 
349
- annotation_data = {
350
- "chain_id": (f"{prefix}_asym_id", f"{alt_prefix}_asym_id", "U4", None),
351
- "res_id": (
352
- f"{prefix}_seq_id",
353
- f"{alt_prefix}_seq_id",
354
- None,
355
- lambda annot: np.array(
356
- [-1 if elt in [".", "?"] else int(elt) for elt in annot]
357
- ),
358
- ),
359
- "ins_code": (
360
- "pdbx_PDB_ins_code",
361
- None,
362
- "U1",
363
- lambda annot: np.array(
364
- ["" if elt in [".", "?"] else elt for elt in annot]
365
- ),
366
- ),
367
- "res_name": (f"{prefix}_comp_id", f"{alt_prefix}_comp_id", "U5", None),
368
- "hetero": ("group_PDB", None, None, lambda annot: annot == "HETATM"),
369
- "atom_name": (
370
- f"{prefix}_atom_id",
371
- f"{alt_prefix}_atom_id",
372
- "U6",
373
- None,
374
- ),
375
- "element": ("type_symbol", None, "U2", None),
376
- "atom_id": ("id", None, int, None),
377
- "b_factor": ("B_iso_or_equiv", None, float, None),
378
- "occupancy": ("occupancy", None, float, None),
379
- "charge": (
380
- "pdbx_formal_charge",
381
- None,
382
- None,
383
- lambda annot: np.array(
384
- [
385
- 0 if charge in ["?", "."] else int(charge)
386
- for charge in annot
387
- ],
388
- dtype=int,
389
- ),
390
- ),
391
- }
392
-
393
- mandatory_annotations = [
415
+ array.set_annotation(
394
416
  "chain_id",
417
+ _get_or_fallback(
418
+ atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
419
+ ).as_array("U4")
420
+ )
421
+ array.set_annotation(
395
422
  "res_id",
423
+ _get_or_fallback(
424
+ atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
425
+ ).as_array(int, -1)
426
+ )
427
+ array.set_annotation(
396
428
  "ins_code",
429
+ atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
430
+ )
431
+ array.set_annotation(
397
432
  "res_name",
433
+ _get_or_fallback(
434
+ atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
435
+ ).as_array("U5")
436
+ )
437
+ array.set_annotation(
398
438
  "hetero",
439
+ atom_site["group_PDB"].as_array(str) == "HETATM"
440
+ )
441
+ array.set_annotation(
399
442
  "atom_name",
443
+ _get_or_fallback(
444
+ atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
445
+ ).as_array("U6")
446
+ )
447
+ array.set_annotation(
400
448
  "element",
401
- ]
449
+ atom_site["type_symbol"].as_array("U2")
450
+ )
402
451
 
403
- # Iterate over mandatory annotations and given extra_fields
404
- for annotation_name in mandatory_annotations + extra_fields:
452
+ if "atom_id" in extra_fields:
405
453
  array.set_annotation(
406
- annotation_name,
407
- get_annotation_from_model(
408
- model_dict, *annotation_data[annotation_name]
409
- )
410
- if annotation_name in annotation_data
411
- else get_annotation_from_model(
412
- model_dict, annotation_name, as_type=str
413
- ),
454
+ "atom_id",
455
+ atom_site["id"].as_array(int)
456
+ )
457
+ extra_fields.remove("atom_id")
458
+ if "b_factor" in extra_fields:
459
+ array.set_annotation(
460
+ "b_factor",
461
+ atom_site["B_iso_or_equiv"].as_array(float)
462
+ )
463
+ extra_fields.remove("b_factor")
464
+ if "occupancy" in extra_fields:
465
+ array.set_annotation(
466
+ "occupancy",
467
+ atom_site["occupancy"].as_array(float)
468
+ )
469
+ extra_fields.remove("occupancy")
470
+ if "charge" in extra_fields:
471
+ array.set_annotation(
472
+ "charge",
473
+ atom_site["pdbx_formal_charge"].as_array(int, 0)
474
+ )
475
+ extra_fields.remove("charge")
476
+
477
+ # Handle all remaining custom fields
478
+ for field in extra_fields:
479
+ array.set_annotation(
480
+ field,
481
+ atom_site[field].as_array(str)
482
+ )
483
+
484
+
485
+ def _parse_intra_residue_bonds(chem_comp_bond):
486
+ """
487
+ Create a :func:`connect_via_residue_names()` compatible
488
+ `custom_bond_dict` from the ``chem_comp_bond`` category.
489
+ """
490
+ custom_bond_dict = {}
491
+ for res_name, atom_1, atom_2, order, aromatic_flag in zip(
492
+ chem_comp_bond["comp_id"].as_array(str),
493
+ chem_comp_bond["atom_id_1"].as_array(str),
494
+ chem_comp_bond["atom_id_2"].as_array(str),
495
+ chem_comp_bond["value_order"].as_array(str),
496
+ chem_comp_bond["pdbx_aromatic_flag"].as_array(str)
497
+ ):
498
+ if res_name not in custom_bond_dict:
499
+ custom_bond_dict[res_name] = {}
500
+ bond_type = COMP_BOND_ORDER_TO_TYPE.get(
501
+ (order.upper(), aromatic_flag), BondType.ANY
502
+ )
503
+ custom_bond_dict[res_name][atom_1.item(), atom_2.item()] = bond_type
504
+ return custom_bond_dict
505
+
506
+
507
+ def _parse_inter_residue_bonds(atom_site, struct_conn):
508
+ """
509
+ Create inter-residue bonds by parsing the ``struct_conn`` category.
510
+ The atom indices of each bond are found by matching the bond labels
511
+ to the ``atom_site`` category.
512
+ """
513
+ # Identity symmetry operation
514
+ IDENTITY = "1_555"
515
+ # Columns in 'atom_site' that should be matched by 'struct_conn'
516
+ COLUMNS = [
517
+ "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
518
+ "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
519
+ "pdbx_PDB_ins_code"
520
+ ]
521
+
522
+ covale_mask = np.isin(
523
+ struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
524
+ )
525
+ if "ptnr1_symmetry" in struct_conn:
526
+ covale_mask &= (
527
+ struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
528
+ )
529
+ if "ptnr2_symmetry" in struct_conn:
530
+ covale_mask &= (
531
+ struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
532
+ )
533
+
534
+ atom_indices = [None] * 2
535
+ for i in range(2):
536
+ reference_arrays = []
537
+ query_arrays = []
538
+ for col_name in COLUMNS:
539
+ struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
540
+ if (
541
+ col_name not in atom_site
542
+ or struct_conn_col_name not in struct_conn
543
+ ):
544
+ continue
545
+ # Ensure both arrays have the same dtype to allow comparison
546
+ reference = atom_site[col_name].as_array()
547
+ dtype = reference.dtype
548
+ query = struct_conn[struct_conn_col_name].as_array(dtype)
549
+ if np.issubdtype(reference.dtype, str):
550
+ # The mask value is not necessarily consistent
551
+ # between query and reference
552
+ # -> make it consistent
553
+ reference[reference == "?"] = "."
554
+ query[query == "?"] = "."
555
+ reference_arrays.append(reference)
556
+ query_arrays.append(query[covale_mask])
557
+ # Match the combination of 'label_asym_id', 'label_comp_id', etc.
558
+ # in 'atom_site' and 'struct_conn'
559
+ atom_indices[i] = _find_matches(query_arrays, reference_arrays)
560
+ atoms_indices_1 = atom_indices[0]
561
+ atoms_indices_2 = atom_indices[1]
562
+
563
+ # Some bonds in 'struct_conn' may not be found in 'atom_site'
564
+ # This is okay,
565
+ # as 'atom_site' might already be reduced to a single model
566
+ mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
567
+ atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
568
+ atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
569
+
570
+ # Interpret missing values as ANY bonds
571
+ bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
572
+ # Consecutively apply the same masks as applied to the atom indices
573
+ # Logical combination does not work here,
574
+ # as the second mask was created based on already filtered data
575
+ bond_order = bond_order[covale_mask][mapping_exists_mask]
576
+ bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
577
+
578
+ return BondList(
579
+ atom_site.row_count,
580
+ np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
581
+ )
582
+
583
+
584
+ def _find_matches(query_arrays, reference_arrays):
585
+ """
586
+ For each index in the `query_arrays` find the indices in the
587
+ `reference_arrays` where all query values the reference counterpart.
588
+ If no match is found for a query, the corresponding index is -1.
589
+ """
590
+ match_masks_for_all_columns = np.stack([
591
+ query[:, np.newaxis] == reference[np.newaxis, :]
592
+ for query, reference in zip(query_arrays, reference_arrays)
593
+ ], axis=-1)
594
+ match_masks = np.all(match_masks_for_all_columns, axis=-1)
595
+ query_matches, reference_matches = np.where(match_masks)
596
+
597
+ # Duplicate matches indicate that an atom from the query cannot
598
+ # be uniquely matched to an atom in the reference
599
+ unique_query_matches, counts = np.unique(query_matches, return_counts=True)
600
+ if np.any(counts > 1):
601
+ ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
602
+ raise InvalidFileError(
603
+ f"The covalent bond in the 'struct_conn' category at index "
604
+ f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
605
+ f"the 'atom_site' category"
414
606
  )
415
607
 
608
+ # -1 indicates that no match was found in the reference
609
+ match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
610
+ match_indices[query_matches] = reference_matches
611
+ return match_indices
612
+
613
+
614
+ def _get_struct_conn_col_name(col_name, partner):
615
+ """
616
+ For a column name in ``atom_site`` get the corresponding column name
617
+ in ``struct_conn``.
618
+ """
619
+ if col_name == "label_alt_id":
620
+ return f"pdbx_ptnr{partner}_label_alt_id"
621
+ elif col_name.startswith("pdbx_"):
622
+ # Move 'pdbx_' to front
623
+ return f"pdbx_ptnr{partner}_{col_name[5:]}"
624
+ else:
625
+ return f"ptnr{partner}_{col_name}"
626
+
416
627
 
417
- def _filter_altloc(array, model_dict, altloc):
418
- altloc_ids = model_dict.get("label_alt_id")
419
- occupancy = model_dict.get("occupancy")
628
+ def _filter_altloc(array, atom_site, altloc):
629
+ altloc_ids = atom_site.get("label_alt_id")
630
+ occupancy = atom_site.get("occupancy")
420
631
 
421
632
  # Filter altloc IDs and return
422
633
  if altloc_ids is None:
@@ -425,14 +636,14 @@ def _filter_altloc(array, model_dict, altloc):
425
636
  return array[
426
637
  ...,
427
638
  filter_highest_occupancy_altloc(
428
- array, altloc_ids, occupancy.astype(float)
639
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
429
640
  ),
430
641
  ]
431
642
  # 'first' is also fallback if file has no occupancy information
432
643
  elif altloc == "first":
433
- return array[..., filter_first_altloc(array, altloc_ids)]
644
+ return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
434
645
  elif altloc == "all":
435
- array.set_annotation("altloc_id", altloc_ids)
646
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
436
647
  return array
437
648
  else:
438
649
  raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
@@ -443,122 +654,154 @@ def _get_model_starts(model_array):
443
654
  Get the start index for each model in the arrays of the
444
655
  ``atom_site`` category.
445
656
  """
446
- models, indices = np.unique(model_array, return_index=True)
657
+ _, indices = np.unique(model_array, return_index=True)
447
658
  indices.sort()
448
659
  return indices
449
660
 
450
661
 
451
- def _get_model_dict(atom_site_dict, model_starts, model):
662
+ def _filter_model(atom_site, model_starts, model):
452
663
  """
453
- Reduce the ``atom_site`` dictionary to the values for the given
664
+ Reduce the ``atom_site`` category to the values for the given
454
665
  model.
455
666
  """
667
+ Category = type(atom_site)
668
+ Column = Category.subcomponent_class()
669
+ Data = Column.subcomponent_class()
670
+
456
671
  # Append exclusive stop
457
672
  model_starts = np.append(
458
- model_starts, [len(atom_site_dict["pdbx_PDB_model_num"])]
673
+ model_starts, [atom_site.row_count]
459
674
  )
460
- model_dict = {}
461
675
  # Indexing starts at 0, but model number starts at 1
462
676
  model_index = model - 1
463
- for key in atom_site_dict.keys():
464
- model_dict[key] = atom_site_dict[key][
465
- model_starts[model_index] : model_starts[model_index + 1]
466
- ]
467
- return model_dict
677
+ index = slice(model_starts[model_index], model_starts[model_index + 1])
678
+ return _filter(atom_site, index)
468
679
 
469
680
 
470
- def _get_box(pdbx_file, data_block):
471
- if data_block is None:
472
- cell_dict = pdbx_file.get("cell")
473
- else:
474
- cell_dict = pdbx_file.get((data_block, "cell"))
475
- if cell_dict is None:
681
+ def _get_box(block):
682
+ cell = block.get("cell")
683
+ if cell is None:
476
684
  return None
477
685
  try:
478
686
  len_a, len_b, len_c = [
479
- float(cell_dict[length])
687
+ float(cell[length].as_item())
480
688
  for length in ["length_a", "length_b", "length_c"]
481
689
  ]
690
+ alpha, beta, gamma = [
691
+ np.deg2rad(float(cell[angle].as_item()))
692
+ for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
693
+ ]
482
694
  except ValueError:
483
695
  # 'cell_dict' has no proper unit cell values, e.g. '?'
484
696
  return None
485
- alpha, beta, gamma = [
486
- np.deg2rad(float(cell_dict[angle]))
487
- for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
488
- ]
489
697
  return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
490
698
 
491
699
 
492
- def set_structure(pdbx_file, array, data_block=None):
700
+ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
493
701
  """
494
702
  Set the ``atom_site`` category with atom information from an
495
703
  :class:`AtomArray` or :class:`AtomArrayStack`.
496
704
 
497
705
  This will save the coordinates, the mandatory annotation categories
498
706
  and the optional annotation categories
499
- ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
707
+ ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
500
708
  If the atom array (stack) contains the annotation ``'atom_id'``,
501
709
  these values will be used for atom numbering instead of continuous
502
710
  numbering.
711
+ Furthermore, inter-residue bonds will be written into the
712
+ ``struct_conn`` category.
503
713
 
504
714
  Parameters
505
715
  ----------
506
- pdbx_file : PDBxFile
716
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
507
717
  The file object.
508
718
  array : AtomArray or AtomArrayStack
509
719
  The structure to be written. If a stack is given, each array in
510
720
  the stack will be in a separate model.
511
721
  data_block : str, optional
512
- The name of the data block. Default is the first
513
- (and most times only) data block of the file.
722
+ The name of the data block.
723
+ Default is the first (and most times only) data block of the
724
+ file.
725
+ If the data block object is passed directly to `pdbx_file`,
726
+ this parameter is ignored.
727
+ If the file is empty, a new data block will be created.
728
+ include_bonds : bool, optional
729
+ If set to true and `array` has associated ``bonds``, the
730
+ intra-residue bonds will be written into the ``chem_comp_bond``
731
+ category.
732
+ Inter-residue bonds will be written into the ``struct_conn``
733
+ category independent of this parameter.
734
+
735
+ Notes
736
+ -----
737
+ In some cases, the written inter-residue bonds cannot be read again
738
+ due to ambiguity to which atoms the bond refers.
739
+ This is the case, when two equal residues in the same chain have
740
+ the same (or a masked) `res_id`.
514
741
 
515
742
  Examples
516
743
  --------
517
744
 
518
745
  >>> import os.path
519
- >>> file = PDBxFile()
520
- >>> set_structure(file, atom_array, data_block="structure")
746
+ >>> file = CIFFile()
747
+ >>> set_structure(file, atom_array)
521
748
  >>> file.write(os.path.join(path_to_directory, "structure.cif"))
522
749
 
523
750
  """
751
+ _check_non_empty(array)
752
+
753
+ block = _get_or_create_block(pdbx_file, data_block)
754
+ Category = block.subcomponent_class()
755
+ Column = Category.subcomponent_class()
756
+
524
757
  # Fill PDBx columns from information
525
758
  # in structures' attribute arrays as good as possible
526
- # Use OrderedDict in order to ensure the usually used column order.
527
- atom_site_dict = OrderedDict()
528
- # Save list of annotation categories for checks,
529
- # if an optional category exists
530
- annot_categories = array.get_annotation_categories()
531
- atom_site_dict["group_PDB"] = np.array(
532
- ["ATOM" if e == False else "HETATM" for e in array.hetero]
759
+ atom_site = Category()
760
+ atom_site["group_PDB"] = np.where(
761
+ array.hetero, "HETATM", "ATOM"
533
762
  )
534
- atom_site_dict["type_symbol"] = np.copy(array.element)
535
- atom_site_dict["label_atom_id"] = np.copy(array.atom_name)
536
- atom_site_dict["label_alt_id"] = np.full(array.array_length(), ".")
537
- atom_site_dict["label_comp_id"] = np.copy(array.res_name)
538
- atom_site_dict["label_asym_id"] = np.copy(array.chain_id)
539
- atom_site_dict["label_entity_id"] = _determine_entity_id(array.chain_id)
540
- atom_site_dict["label_seq_id"] = np.array([str(e) for e in array.res_id])
541
- atom_site_dict["pdbx_PDB_ins_code"] = array.ins_code
542
- atom_site_dict["auth_seq_id"] = atom_site_dict["label_seq_id"]
543
- atom_site_dict["auth_comp_id"] = atom_site_dict["label_comp_id"]
544
- atom_site_dict["auth_asym_id"] = atom_site_dict["label_asym_id"]
545
- atom_site_dict["auth_atom_id"] = atom_site_dict["label_atom_id"]
763
+ atom_site["type_symbol"] = np.copy(array.element)
764
+ atom_site["label_atom_id"] = np.copy(array.atom_name)
765
+ atom_site["label_alt_id"] = Column(
766
+ # AtomArrays do not store altloc atoms
767
+ np.full(array.array_length(), "."),
768
+ np.full(array.array_length(), MaskValue.INAPPLICABLE),
769
+ )
770
+ atom_site["label_comp_id"] = np.copy(array.res_name)
771
+ atom_site["label_asym_id"] = np.copy(array.chain_id)
772
+ atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
773
+ atom_site["label_seq_id"] = np.copy(array.res_id)
774
+ atom_site["pdbx_PDB_ins_code"] = Column(
775
+ np.copy(array.ins_code),
776
+ np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
777
+ )
778
+ atom_site["auth_seq_id"] = atom_site["label_seq_id"]
779
+ atom_site["auth_comp_id"] = atom_site["label_comp_id"]
780
+ atom_site["auth_asym_id"] = atom_site["label_asym_id"]
781
+ atom_site["auth_atom_id"] = atom_site["label_atom_id"]
546
782
 
783
+ annot_categories = array.get_annotation_categories()
547
784
  if "atom_id" in annot_categories:
548
- atom_site_dict["id"] = array.atom_id.astype(str)
785
+ atom_site["id"] = np.copy(array.atom_id)
549
786
  if "b_factor" in annot_categories:
550
- atom_site_dict["B_iso_or_equiv"] = np.array(
551
- [f"{b:.2f}" for b in array.b_factor]
552
- )
787
+ atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
553
788
  if "occupancy" in annot_categories:
554
- atom_site_dict["occupancy"] = np.array(
555
- [f"{occ:.2f}" for occ in array.occupancy]
556
- )
789
+ atom_site["occupancy"] = np.copy(array.occupancy)
557
790
  if "charge" in annot_categories:
558
- atom_site_dict["pdbx_formal_charge"] = np.array(
559
- [f"{c:+d}" if c != 0 else "?" for c in array.charge]
791
+ atom_site["pdbx_formal_charge"] = Column(
792
+ np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
793
+ np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
560
794
  )
561
795
 
796
+ if array.bonds is not None:
797
+ struct_conn = _set_inter_residue_bonds(array, atom_site)
798
+ if struct_conn is not None:
799
+ block["struct_conn"] = struct_conn
800
+ if include_bonds:
801
+ chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
802
+ if chem_comp_bond is not None:
803
+ block["chem_comp_bond"] = chem_comp_bond
804
+
562
805
  # In case of a single model handle each coordinate
563
806
  # simply like a flattened array
564
807
  if type(array) == AtomArray or (
@@ -566,42 +809,32 @@ def set_structure(pdbx_file, array, data_block=None):
566
809
  ):
567
810
  # 'ravel' flattens coord without copy
568
811
  # in case of stack with stack_depth = 1
569
- atom_site_dict["Cartn_x"] = np.array(
570
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 0])]
571
- )
572
- atom_site_dict["Cartn_y"] = np.array(
573
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 1])]
574
- )
575
- atom_site_dict["Cartn_z"] = np.array(
576
- [f"{c:.3f}" for c in np.ravel(array.coord[..., 2])]
577
- )
578
- atom_site_dict["pdbx_PDB_model_num"] = np.full(
579
- array.array_length(), "1"
812
+ atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
813
+ atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
814
+ atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
815
+ atom_site["pdbx_PDB_model_num"] = np.ones(
816
+ array.array_length(), dtype=np.int32
580
817
  )
581
818
  # In case of multiple models repeat annotations
582
819
  # and use model specific coordinates
583
- elif type(array) == AtomArrayStack:
584
- for key, value in atom_site_dict.items():
585
- atom_site_dict[key] = np.tile(value, reps=array.stack_depth())
820
+ else:
821
+ atom_site = _repeat(atom_site, array.stack_depth())
586
822
  coord = np.reshape(
587
823
  array.coord, (array.stack_depth() * array.array_length(), 3)
588
824
  )
589
- atom_site_dict["Cartn_x"] = np.array([f"{c:.3f}" for c in coord[:, 0]])
590
- atom_site_dict["Cartn_y"] = np.array([f"{c:.3f}" for c in coord[:, 1]])
591
- atom_site_dict["Cartn_z"] = np.array([f"{c:.3f}" for c in coord[:, 2]])
592
- models = np.repeat(
593
- np.arange(1, array.stack_depth() + 1).astype(str),
825
+ atom_site["Cartn_x"] = np.copy(coord[:, 0])
826
+ atom_site["Cartn_y"] = np.copy(coord[:, 1])
827
+ atom_site["Cartn_z"] = np.copy(coord[:, 2])
828
+ atom_site["pdbx_PDB_model_num"] = np.repeat(
829
+ np.arange(1, array.stack_depth() + 1, dtype=np.int32),
594
830
  repeats=array.array_length(),
595
831
  )
596
- atom_site_dict["pdbx_PDB_model_num"] = models
597
- else:
598
- raise ValueError("Structure must be AtomArray or AtomArrayStack")
599
832
  if not "atom_id" in annot_categories:
600
833
  # Count from 1
601
- atom_site_dict["id"] = np.arange(
602
- 1, len(atom_site_dict["group_PDB"]) + 1
603
- ).astype("U6")
604
- pdbx_file.set_category("atom_site", atom_site_dict, data_block)
834
+ atom_site["id"] = np.arange(
835
+ 1, len(atom_site["group_PDB"]) + 1
836
+ )
837
+ block["atom_site"] = atom_site
605
838
 
606
839
  # Write box into file
607
840
  if array.box is not None:
@@ -612,14 +845,52 @@ def set_structure(pdbx_file, array, data_block=None):
612
845
  else:
613
846
  box = array.box
614
847
  len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
615
- cell_dict = OrderedDict()
616
- cell_dict["length_a"] = "{:6.3f}".format(len_a)
617
- cell_dict["length_b"] = "{:6.3f}".format(len_b)
618
- cell_dict["length_c"] = "{:6.3f}".format(len_c)
619
- cell_dict["angle_alpha"] = "{:5.3f}".format(np.rad2deg(alpha))
620
- cell_dict["angle_beta"] = "{:5.3f}".format(np.rad2deg(beta))
621
- cell_dict["angle_gamma"] = "{:5.3f}".format(np.rad2deg(gamma))
622
- pdbx_file.set_category("cell", cell_dict, data_block)
848
+ cell = Category()
849
+ cell["length_a"] = len_a
850
+ cell["length_b"] = len_b
851
+ cell["length_c"] = len_c
852
+ cell["angle_alpha"] = np.rad2deg(alpha)
853
+ cell["angle_beta"] = np.rad2deg(beta)
854
+ cell["angle_gamma"] = np.rad2deg(gamma)
855
+ block["cell"] = cell
856
+
857
+
858
+ def _check_non_empty(array):
859
+ if isinstance(array, AtomArray):
860
+ if array.array_length() == 0:
861
+ raise BadStructureError("Structure must not be empty")
862
+ elif isinstance(array, AtomArrayStack):
863
+ if array.array_length() == 0 or array.stack_depth() == 0:
864
+ raise BadStructureError("Structure must not be empty")
865
+ else:
866
+ raise ValueError(
867
+ "Structure must be AtomArray or AtomArrayStack, "
868
+ f"but got {type(array).__name__}"
869
+ )
870
+
871
+
872
+ def _get_or_create_block(pdbx_component, block_name):
873
+ if isinstance(pdbx_component, PDBxFile):
874
+ # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
875
+ pdbx_component = pdbx_component.cif_file
876
+
877
+ Block = pdbx_component.subcomponent_class()
878
+
879
+ if isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
880
+ if block_name is None:
881
+ if len(pdbx_component) > 0:
882
+ block_name = next(iter(pdbx_component.keys()))
883
+ else:
884
+ # File is empty -> invent a new block name
885
+ block_name = "structure"
886
+
887
+ if block_name not in pdbx_component:
888
+ block = Block()
889
+ pdbx_component[block_name] = block
890
+ return pdbx_component[block_name]
891
+ else:
892
+ # Already a block
893
+ return pdbx_component
623
894
 
624
895
 
625
896
  def _determine_entity_id(chain_id):
@@ -635,10 +906,155 @@ def _determine_entity_id(chain_id):
635
906
  id_translation[chain_id[i]] = id
636
907
  entity_id[i] = id_translation[chain_id[i]]
637
908
  id += 1
638
- return entity_id.astype(str)
909
+ return entity_id
910
+
911
+
912
+ def _repeat(category, repetitions):
913
+ Category = type(category)
914
+ Column = Category.subcomponent_class()
915
+ Data = Column.subcomponent_class()
916
+
917
+ category_dict = {}
918
+ for key, column in category.items():
919
+ if isinstance(column, BinaryCIFColumn):
920
+ data_encoding = column.data.encoding
921
+ # Optimization: The repeated string array has the same
922
+ # unique values, as the original string array
923
+ # -> Use same unique values (faster due to shorter array)
924
+ if isinstance(data_encoding[0], StringArrayEncoding):
925
+ data_encoding[0].strings = np.unique(column.data.array)
926
+ data = Data(np.tile(column.data.array, repetitions), data_encoding)
927
+ else:
928
+ data = Data(np.tile(column.data.array, repetitions))
929
+ mask = Data(np.tile(column.mask.array, repetitions)) \
930
+ if column.mask is not None else None
931
+ category_dict[key] = Column(data, mask)
932
+ return Category(category_dict)
933
+
934
+
935
+ def _set_intra_residue_bonds(array, atom_site):
936
+ """
937
+ Create the ``chem_comp_bond`` category containing the intra-residue
938
+ bonds.
939
+ ``atom_site`` is only used to infer the right :class:`Category` type
940
+ (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).
941
+ """
942
+ if (array.res_name == "").any():
943
+ raise BadStructureError(
944
+ "Structure contains atoms with empty residue name, "
945
+ "but it is required to write intra-residue bonds"
946
+ )
947
+ if (array.atom_name == "").any():
948
+ raise BadStructureError(
949
+ "Structure contains atoms with empty atom name, "
950
+ "but it is required to write intra-residue bonds"
951
+ )
952
+
953
+ Category = type(atom_site)
954
+ Column = Category.subcomponent_class()
955
+
956
+ bond_array = _filter_bonds(array, "intra")
957
+ if len(bond_array) == 0:
958
+ return None
959
+ value_order = np.zeros(len(bond_array), dtype="U4")
960
+ aromatic_flag = np.zeros(len(bond_array), dtype="U1")
961
+ for i, bond_type in enumerate(bond_array[:, 2]):
962
+ if bond_type == BondType.ANY:
963
+ # ANY bonds will be masked anyway, no need to set the value
964
+ continue
965
+ order, aromatic = COMP_BOND_TYPE_TO_ORDER[bond_type]
966
+ value_order[i] = order
967
+ aromatic_flag[i] = aromatic
968
+ any_mask = bond_array[:, 2] == BondType.ANY
969
+
970
+ chem_comp_bond = Category()
971
+ # Take the residue name from the first atom index, as the residue
972
+ # name is the same for both atoms, since we have only intra bonds
973
+ chem_comp_bond["comp_id"] = array.res_name[bond_array[:, 0]]
974
+ chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]]
975
+ chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]]
976
+ chem_comp_bond["value_order"] = Column(
977
+ value_order,
978
+ np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
979
+ )
980
+ chem_comp_bond["pdbx_aromatic_flag"] = Column(
981
+ aromatic_flag,
982
+ np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
983
+ )
984
+ # BondList does not contain stereo information
985
+ # -> all values are missing
986
+ chem_comp_bond["pdbx_stereo_config"] = Column(
987
+ np.zeros(len(bond_array), dtype="U1"),
988
+ np.full(len(bond_array), MaskValue.MISSING)
989
+ )
990
+ chem_comp_bond["pdbx_ordinal"] = np.arange(
991
+ 1, len(bond_array) + 1, dtype=np.int32
992
+ )
993
+ return chem_comp_bond
994
+
995
+
996
+ def _set_inter_residue_bonds(array, atom_site):
997
+ """
998
+ Create the ``struct_conn`` category containing the inter-residue
999
+ bonds.
1000
+ The involved atoms are identified by annotations from the
1001
+ ``atom_site`` category.
1002
+ """
1003
+ COLUMNS = [
1004
+ "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
1005
+ "pdbx_PDB_ins_code"
1006
+ ]
1007
+
1008
+ Category = type(atom_site)
1009
+ Column = Category.subcomponent_class()
1010
+
1011
+ bond_array = _filter_bonds(array, "inter")
1012
+ if len(bond_array) == 0:
1013
+ return None
1014
+ struct_conn = Category()
1015
+ struct_conn["id"] = np.arange(1, len(bond_array) + 1)
1016
+ struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
1017
+ struct_conn["pdbx_value_order"] = Column(
1018
+ np.array(
1019
+ [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]
1020
+ ),
1021
+ np.where(
1022
+ bond_array[:, 2] == BondType.ANY,
1023
+ MaskValue.MISSING, MaskValue.PRESENT,
1024
+ )
1025
+ )
1026
+ # Write the identifying annotation...
1027
+ for col_name in COLUMNS:
1028
+ annot = atom_site[col_name].as_array()
1029
+ # ...for each bond partner
1030
+ for i in range(2):
1031
+ atom_indices = bond_array[:, i]
1032
+ struct_conn[_get_struct_conn_col_name(col_name, i+1)] \
1033
+ = annot[atom_indices]
1034
+ return struct_conn
1035
+
1036
+
1037
+ def _filter_bonds(array, connection):
1038
+ """
1039
+ Get a bonds array that contains either only intra-residue or
1040
+ only inter-residue bonds.
1041
+ """
1042
+ bond_array = array.bonds.as_array()
1043
+ # To save computation time call 'get_residue_starts_for()' only once
1044
+ # with indices of the first and second atom of each bond
1045
+ residue_starts_1, residue_starts_2 = get_residue_starts_for(
1046
+ array, bond_array[:, :2].flatten()
1047
+ ).reshape(-1, 2).T
1048
+ if connection == "intra":
1049
+ return bond_array[residue_starts_1 == residue_starts_2]
1050
+ elif connection == "inter":
1051
+ return bond_array[residue_starts_1 != residue_starts_2]
1052
+ else:
1053
+ raise ValueError("Invalid 'connection' option")
639
1054
 
640
1055
 
641
- def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
1056
+ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
1057
+ res_name=None):
642
1058
  """
643
1059
  Create an :class:`AtomArray` for a chemical component from the
644
1060
  ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -646,26 +1062,37 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
646
1062
 
647
1063
  Parameters
648
1064
  ----------
1065
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
1066
+ The file object.
649
1067
  data_block : str, optional
650
- The name of the data block. Default is the first
651
- (and most times only) data block of the file.
1068
+ The name of the data block.
1069
+ Default is the first (and most times only) data block of the
1070
+ file.
1071
+ If the data block object is passed directly to `pdbx_file`,
1072
+ this parameter is ignored.
652
1073
  use_ideal_coord : bool, optional
653
1074
  If true, the *ideal* coordinates are read from the file
654
1075
  (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
655
1076
  originating from computations.
656
1077
  If set to false, alternative coordinates are read
657
1078
  (``model_Cartn_<dim>_`` fields).
658
-
1079
+ res_name : str, optional
1080
+ In rare cases the categories may contain rows for multiple
1081
+ components.
1082
+ In this case, the component with the given residue name is
1083
+ read.
1084
+ By default, all rows are read in this case.
1085
+
659
1086
  Returns
660
1087
  -------
661
1088
  array : AtomArray
662
1089
  The parsed chemical component.
663
-
1090
+
664
1091
  Examples
665
1092
  --------
666
1093
 
667
1094
  >>> import os.path
668
- >>> file = PDBxFile.read(
1095
+ >>> file = CIFFile.read(
669
1096
  ... os.path.join(path_to_structures, "molecules", "TYR.cif")
670
1097
  ... )
671
1098
  >>> comp = get_component(file)
@@ -695,26 +1122,31 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
695
1122
  HET 0 TYR HH H -0.123 -0.399 -5.059
696
1123
  HET 0 TYR HXT H -1.333 -0.030 4.784
697
1124
  """
698
- atom_dict = pdbx_file.get_category(
699
- "chem_comp_atom", block=data_block, expect_looped=True
700
- )
701
- if atom_dict is None:
1125
+ block = _get_block(pdbx_file, data_block)
1126
+
1127
+ try:
1128
+ atom_category = block["chem_comp_atom"]
1129
+ except KeyError:
702
1130
  raise InvalidFileError("Missing 'chem_comp_atom' category in file")
703
- bond_dict = pdbx_file.get_category(
704
- "chem_comp_bond", block=data_block, expect_looped=True
705
- )
1131
+ if res_name is not None:
1132
+ atom_category = _filter(
1133
+ atom_category, atom_category["comp_id"].as_array() == res_name
1134
+ )
1135
+ if atom_category.row_count == 0:
1136
+ raise KeyError(
1137
+ f"No rows with residue name '{res_name}' found in "
1138
+ f"'chem_comp_atom' category"
1139
+ )
706
1140
 
707
- array = AtomArray(len(list(atom_dict.values())[0]))
1141
+ array = AtomArray(atom_category.row_count)
708
1142
 
709
1143
  array.hetero[:] = True
710
- array.res_name = atom_dict["comp_id"]
711
- array.atom_name = atom_dict["atom_id"]
712
- array.element = atom_dict["type_symbol"]
1144
+ array.res_name = atom_category["comp_id"].as_array("U5")
1145
+ array.atom_name = atom_category["atom_id"].as_array("U6")
1146
+ array.element = atom_category["type_symbol"].as_array("U2")
713
1147
  array.add_annotation("charge", int)
714
- array.charge = np.array(
715
- [int(c) if c != "?" else 0 for c in atom_dict["charge"]]
716
- )
717
-
1148
+ array.charge = atom_category["charge"].as_array(int, 0)
1149
+
718
1150
  coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
719
1151
  alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
720
1152
  if not use_ideal_coord:
@@ -722,7 +1154,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
722
1154
  coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
723
1155
  try:
724
1156
  for i, field in enumerate(coord_fields):
725
- array.coord[:,i] = atom_dict[field]
1157
+ array.coord[:,i] = atom_category[field].as_array(np.float32)
726
1158
  except KeyError as err:
727
1159
  key = err.args[0]
728
1160
  warnings.warn(
@@ -731,9 +1163,15 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
731
1163
  UserWarning
732
1164
  )
733
1165
  for i, field in enumerate(alt_coord_fields):
734
- array.coord[:,i] = atom_dict[field]
735
-
736
- if bond_dict is None:
1166
+ array.coord[:,i] = atom_category[field].as_array(np.float32)
1167
+
1168
+ try:
1169
+ bond_category = block["chem_comp_bond"]
1170
+ if res_name is not None:
1171
+ bond_category = _filter(
1172
+ bond_category, bond_category["comp_id"].as_array() == res_name
1173
+ )
1174
+ except KeyError:
737
1175
  warnings.warn(
738
1176
  f"Category 'chem_comp_bond' not found. "
739
1177
  f"No bonds will be parsed",
@@ -742,12 +1180,14 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
742
1180
  else:
743
1181
  bonds = BondList(array.array_length())
744
1182
  for atom1, atom2, order, aromatic_flag in zip(
745
- bond_dict["atom_id_1"], bond_dict["atom_id_2"],
746
- bond_dict["value_order"], bond_dict["pdbx_aromatic_flag"]
1183
+ bond_category["atom_id_1"].as_array(str),
1184
+ bond_category["atom_id_2"].as_array(str),
1185
+ bond_category["value_order"].as_array(str),
1186
+ bond_category["pdbx_aromatic_flag"].as_array(str)
747
1187
  ):
748
1188
  atom_i = np.where(array.atom_name == atom1)[0][0]
749
1189
  atom_j = np.where(array.atom_name == atom2)[0][0]
750
- bond_type = BOND_ORDER_TO_BOND_TYPE[order, aromatic_flag]
1190
+ bond_type = COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag]
751
1191
  bonds.add_bond(atom_i, atom_j, bond_type)
752
1192
  array.bonds = bonds
753
1193
 
@@ -766,15 +1206,24 @@ def set_component(pdbx_file, array, data_block=None):
766
1206
 
767
1207
  Parameters
768
1208
  ----------
769
- pdbx_file : PDBxFile
1209
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
770
1210
  The file object.
771
1211
  array : AtomArray
772
1212
  The chemical component to be written.
773
1213
  Must contain only a single residue.
774
1214
  data_block : str, optional
775
- The name of the data block. Default is the first
776
- (and most times only) data block of the file.
1215
+ The name of the data block.
1216
+ Default is the first (and most times only) data block of the
1217
+ file.
1218
+ If the file is empty, a new data block will be created.
1219
+ If the data block object is passed directly to `pdbx_file`,
1220
+ this parameter is ignored.
777
1221
  """
1222
+ _check_non_empty(array)
1223
+
1224
+ block = _get_or_create_block(pdbx_file, data_block)
1225
+ Category = block.subcomponent_class()
1226
+
778
1227
  if get_residue_count(array) > 1:
779
1228
  raise BadStructureError(
780
1229
  "The input atom array must comprise only one residue"
@@ -787,45 +1236,44 @@ def set_component(pdbx_file, array, data_block=None):
787
1236
  else:
788
1237
  charge = np.full(array.array_length(), "?", dtype="U2")
789
1238
 
790
- chem_comp_dict = OrderedDict()
791
- chem_comp_dict["comp_id"] = np.full(array.array_length(), res_name)
792
- chem_comp_dict["atom_id"] = np.copy(array.atom_name)
793
- chem_comp_dict["alt_atom_id"] = chem_comp_dict["atom_id"]
794
- chem_comp_dict["type_symbol"] = np.copy(array.element)
795
- chem_comp_dict["charge"] = charge
796
- chem_comp_dict["model_Cartn_x"] = np.copy(array.coord[:, 0])
797
- chem_comp_dict["model_Cartn_y"] = np.copy(array.coord[:, 1])
798
- chem_comp_dict["model_Cartn_z"] = np.copy(array.coord[:, 2])
799
- chem_comp_dict["pdbx_model_Cartn_x_ideal"] = chem_comp_dict["model_Cartn_x"]
800
- chem_comp_dict["pdbx_model_Cartn_y_ideal"] = chem_comp_dict["model_Cartn_y"]
801
- chem_comp_dict["pdbx_model_Cartn_z_ideal"] = chem_comp_dict["model_Cartn_z"]
802
- chem_comp_dict["pdbx_component_atom_id"] = chem_comp_dict["atom_id"]
803
- chem_comp_dict["pdbx_component_comp_id"] = chem_comp_dict["comp_id"]
804
- chem_comp_dict["pdbx_ordinal"] = np.arange(
1239
+ atom_cat = Category()
1240
+ atom_cat["comp_id"] = np.full(array.array_length(), res_name)
1241
+ atom_cat["atom_id"] = np.copy(array.atom_name)
1242
+ atom_cat["alt_atom_id"] = atom_cat["atom_id"]
1243
+ atom_cat["type_symbol"] = np.copy(array.element)
1244
+ atom_cat["charge"] = charge
1245
+ atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
1246
+ atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
1247
+ atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
1248
+ atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
1249
+ atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
1250
+ atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
1251
+ atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
1252
+ atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
1253
+ atom_cat["pdbx_ordinal"] = np.arange(
805
1254
  1, array.array_length() + 1
806
1255
  ).astype(str)
807
- pdbx_file.set_category("chem_comp_atom", chem_comp_dict, data_block)
1256
+ block["chem_comp_atom"] = atom_cat
808
1257
 
809
- if array.bonds is not None:
1258
+ if array.bonds is not None and array.bonds.get_bond_count() > 0:
810
1259
  bond_array = array.bonds.as_array()
811
1260
  order_flags = []
812
1261
  aromatic_flags = []
813
1262
  for bond_type in bond_array[:,2]:
814
- order_flag, aromatic_flag = BOND_TYPE_TO_BOND_ORDER[bond_type]
1263
+ order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
815
1264
  order_flags.append(order_flag)
816
1265
  aromatic_flags.append(aromatic_flag)
817
1266
 
818
- chem_comp_bond_dict = OrderedDict()
819
- chem_comp_bond_dict["comp_id"] = np.full(len(bond_array), res_name)
820
- chem_comp_bond_dict["atom_id_1"] = array.atom_name[bond_array[:,0]]
821
- chem_comp_bond_dict["atom_id_2"] = array.atom_name[bond_array[:,1]]
822
- chem_comp_bond_dict["value_order"] = np.array(order_flags)
823
- chem_comp_bond_dict["pdbx_aromatic_flag"] = np.array(aromatic_flags)
824
- chem_comp_bond_dict["pdbx_ordinal"] = np.arange(
1267
+ bond_cat = Category()
1268
+ bond_cat["comp_id"] = np.full(len(bond_array), res_name)
1269
+ bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
1270
+ bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
1271
+ bond_cat["value_order"] = np.array(order_flags)
1272
+ bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
1273
+ bond_cat["pdbx_ordinal"] = np.arange(
825
1274
  1, len(bond_array) + 1
826
1275
  ).astype(str)
827
- pdbx_file.set_category("chem_comp_bond", chem_comp_bond_dict, data_block)
828
-
1276
+ block["chem_comp_bond"] = bond_cat
829
1277
 
830
1278
  def list_assemblies(pdbx_file, data_block=None):
831
1279
  """
@@ -838,23 +1286,25 @@ def list_assemblies(pdbx_file, data_block=None):
838
1286
 
839
1287
  Parameters
840
1288
  ----------
841
- pdbx_file : PDBxFile
1289
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
842
1290
  The file object.
843
1291
  data_block : str, optional
844
1292
  The name of the data block.
845
- Defaults to the first (and most times only) data block of the
1293
+ Default is the first (and most times only) data block of the
846
1294
  file.
1295
+ If the data block object is passed directly to `pdbx_file`,
1296
+ this parameter is ignored.
847
1297
 
848
1298
  Returns
849
1299
  -------
850
1300
  assemblies : dict of str -> str
851
1301
  A dictionary that maps an assembly ID to a description of the
852
1302
  corresponding assembly.
853
-
1303
+
854
1304
  Examples
855
1305
  --------
856
1306
  >>> import os.path
857
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1307
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
858
1308
  >>> assembly_ids = list_assemblies(file)
859
1309
  >>> for key, val in assembly_ids.items():
860
1310
  ... print(f"'{key}' : '{val}'")
@@ -865,21 +1315,24 @@ def list_assemblies(pdbx_file, data_block=None):
865
1315
  '5' : 'icosahedral asymmetric unit, std point frame'
866
1316
  '6' : 'crystal asymmetric unit, crystal frame'
867
1317
  """
868
- assembly_category = pdbx_file.get_category(
869
- "pdbx_struct_assembly", data_block, expect_looped=True
870
- )
871
- if assembly_category is None:
1318
+ block = _get_block(pdbx_file, data_block)
1319
+
1320
+ try:
1321
+ assembly_category = block["pdbx_struct_assembly"]
1322
+ except KeyError:
872
1323
  raise InvalidFileError("File has no 'pdbx_struct_assembly' category")
873
1324
  return {
874
1325
  id: details
875
1326
  for id, details in zip(
876
- assembly_category["id"], assembly_category["details"]
1327
+ assembly_category["id"].as_array(str),
1328
+ assembly_category["details"].as_array(str)
877
1329
  )
878
1330
  }
879
1331
 
880
1332
 
881
1333
  def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
882
- altloc="first", extra_fields=None, use_author_fields=True):
1334
+ altloc="first", extra_fields=None, use_author_fields=True,
1335
+ include_bonds=False):
883
1336
  """
884
1337
  Build the given biological assembly.
885
1338
 
@@ -890,7 +1343,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
890
1343
 
891
1344
  Parameters
892
1345
  ----------
893
- pdbx_file : PDBxFile
1346
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
894
1347
  The file object.
895
1348
  assembly_id : str
896
1349
  The assembly to build.
@@ -907,8 +1360,10 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
907
1360
  contains only one model.
908
1361
  data_block : str, optional
909
1362
  The name of the data block.
910
- Defaults to the first (and most times only) data block of the
1363
+ Default is the first (and most times only) data block of the
911
1364
  file.
1365
+ If the data block object is passed directly to `pdbx_file`,
1366
+ this parameter is ignored.
912
1367
  altloc : {'first', 'occupancy', 'all'}
913
1368
  This parameter defines how *altloc* IDs are handled:
914
1369
  - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -940,36 +1395,46 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
940
1395
  If `use_author_fields` is true, the annotation arrays will be
941
1396
  read from the ``auth_xxx`` fields (if applicable),
942
1397
  otherwise from the the ``label_xxx`` fields.
1398
+ include_bonds : bool, optional
1399
+ If set to true, a :class:`BondList` will be created for the
1400
+ resulting :class:`AtomArray` containing the bond information
1401
+ from the file.
1402
+ Bonds, whose order could not be determined from the
1403
+ *Chemical Component Dictionary*
1404
+ (e.g. especially inter-residue bonds),
1405
+ have :attr:`BondType.ANY`, since the PDB format itself does
1406
+ not support bond orders.
943
1407
 
944
1408
  Returns
945
1409
  -------
946
1410
  assembly : AtomArray or AtomArrayStack
947
1411
  The assembly. The return type depends on the `model` parameter.
948
-
1412
+
949
1413
  Examples
950
1414
  --------
951
1415
 
952
1416
  >>> import os.path
953
- >>> file = PDBxFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1417
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
954
1418
  >>> assembly = get_assembly(file, model=1)
955
1419
  """
956
- assembly_gen_category = pdbx_file.get_category(
957
- "pdbx_struct_assembly_gen", data_block, expect_looped=True
958
- )
959
- if assembly_gen_category is None:
1420
+ block = _get_block(pdbx_file, data_block)
1421
+
1422
+ try:
1423
+ assembly_gen_category = block["pdbx_struct_assembly_gen"]
1424
+ except KeyError:
960
1425
  raise InvalidFileError(
961
1426
  "File has no 'pdbx_struct_assembly_gen' category"
962
1427
  )
963
1428
 
964
- struct_oper_category = pdbx_file.get_category(
965
- "pdbx_struct_oper_list", data_block, expect_looped=True
966
- )
967
- if struct_oper_category is None:
1429
+ try:
1430
+ struct_oper_category = block["pdbx_struct_oper_list"]
1431
+ except KeyError:
968
1432
  raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")
969
1433
 
1434
+ assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
970
1435
  if assembly_id is None:
971
- assembly_id = assembly_gen_category["assembly_id"][0]
972
- elif assembly_id not in assembly_gen_category["assembly_id"]:
1436
+ assembly_id = assembly_ids[0]
1437
+ elif assembly_id not in assembly_ids:
973
1438
  raise KeyError(f"File has no Assembly ID '{assembly_id}'")
974
1439
 
975
1440
  ### Calculate all possible transformations
@@ -982,6 +1447,8 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
982
1447
  if "label_asym_id" in extra_fields:
983
1448
  extra_fields_and_asym = extra_fields
984
1449
  else:
1450
+ # The operations apply on asym IDs
1451
+ # -> they need to be included to select the correct atoms
985
1452
  extra_fields_and_asym = extra_fields + ["label_asym_id"]
986
1453
  structure = get_structure(
987
1454
  pdbx_file,
@@ -990,14 +1457,15 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
990
1457
  altloc,
991
1458
  extra_fields_and_asym,
992
1459
  use_author_fields,
1460
+ include_bonds
993
1461
  )
994
1462
 
995
1463
  ### Get transformations and apply them to the affected asym IDs
996
1464
  assembly = None
997
1465
  for id, op_expr, asym_id_expr in zip(
998
- assembly_gen_category["assembly_id"],
999
- assembly_gen_category["oper_expression"],
1000
- assembly_gen_category["asym_id_list"],
1466
+ assembly_gen_category["assembly_id"].as_array(str),
1467
+ assembly_gen_category["oper_expression"].as_array(str),
1468
+ assembly_gen_category["asym_id_list"].as_array(str),
1001
1469
  ):
1002
1470
  # Find the operation expressions for given assembly ID
1003
1471
  # We already asserted that the ID is actually present
@@ -1017,12 +1485,12 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
1017
1485
  assembly = sub_assembly
1018
1486
  else:
1019
1487
  assembly += sub_assembly
1020
-
1488
+
1021
1489
  # Remove 'label_asym_id', if it was not included in the original
1022
1490
  # user-supplied 'extra_fields'
1023
1491
  if "label_asym_id" not in extra_fields:
1024
1492
  assembly.del_annotation("label_asym_id")
1025
-
1493
+
1026
1494
  return assembly
1027
1495
 
1028
1496
 
@@ -1056,19 +1524,20 @@ def _get_transformations(struct_oper):
1056
1524
  translation for each operation ID in ``pdbx_struct_oper_list``.
1057
1525
  """
1058
1526
  transformation_dict = {}
1059
- for index, id in enumerate(struct_oper["id"]):
1527
+ for index, id in enumerate(struct_oper["id"].as_array(str)):
1060
1528
  rotation_matrix = np.array(
1061
1529
  [
1062
1530
  [
1063
- float(struct_oper[f"matrix[{i}][{j}]"][index])
1531
+ struct_oper[f"matrix[{i}][{j}]"].as_array(float)[index]
1064
1532
  for j in (1, 2, 3)
1065
1533
  ]
1066
1534
  for i in (1, 2, 3)
1067
1535
  ]
1068
1536
  )
1069
- translation_vector = np.array(
1070
- [float(struct_oper[f"vector[{i}]"][index]) for i in (1, 2, 3)]
1071
- )
1537
+ translation_vector = np.array([
1538
+ struct_oper[f"vector[{i}]"].as_array(float)[index]
1539
+ for i in (1, 2, 3)
1540
+ ])
1072
1541
  transformation_dict[id] = (rotation_matrix, translation_vector)
1073
1542
  return transformation_dict
1074
1543
 
@@ -1082,25 +1551,26 @@ def _parse_operation_expression(expression):
1082
1551
  # Split groups by parentheses:
1083
1552
  # use the opening parenthesis as delimiter
1084
1553
  # and just remove the closing parenthesis
1554
+ # example: '(X0)(1-10,21-25)' from 1a34
1085
1555
  expressions_per_step = expression.replace(")", "").split("(")
1086
1556
  expressions_per_step = [e for e in expressions_per_step if len(e) > 0]
1087
1557
  # Important: Operations are applied from right to left
1088
1558
  expressions_per_step.reverse()
1089
1559
 
1090
1560
  operations = []
1091
- for expr in expressions_per_step:
1092
- if "-" in expr:
1093
- # Range of operation IDs, they must be integers
1094
- first, last = expr.split("-")
1095
- operations.append(
1096
- [str(id) for id in range(int(first), int(last) + 1)]
1097
- )
1098
- elif "," in expr:
1099
- # List of operation IDs
1100
- operations.append(expr.split(","))
1101
- else:
1102
- # Single operation ID
1103
- operations.append([expr])
1561
+ for one_step_expr in expressions_per_step:
1562
+ one_step_op_ids = []
1563
+ for expr in one_step_expr.split(","):
1564
+ if "-" in expr:
1565
+ # Range of operation IDs, they must be integers
1566
+ first, last = expr.split("-")
1567
+ one_step_op_ids.extend(
1568
+ [str(id) for id in range(int(first), int(last) + 1)]
1569
+ )
1570
+ else:
1571
+ # Single operation ID
1572
+ one_step_op_ids.append(expr)
1573
+ operations.append(one_step_op_ids)
1104
1574
 
1105
1575
  # Cartesian product of operations
1106
1576
  return list(itertools.product(*operations))
@@ -1112,6 +1582,8 @@ def _convert_string_to_sequence(string, stype):
1112
1582
  ``proteinseq_type_list`` or to ``NucleotideSequence`` if `stype` is
1113
1583
  contained in ``_nucleotideseq_type_list``.
1114
1584
  """
1585
+ # sequence may be stored as multiline string
1586
+ string = string.replace("\n", "")
1115
1587
  if stype in _proteinseq_type_list:
1116
1588
  return ProteinSequence(string)
1117
1589
  elif stype in _nucleotideseq_type_list: