biotite 0.39.0__cp310-cp310-macosx_11_0_arm64.whl → 0.40.0__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (104) hide show
  1. biotite/__init__.py +3 -3
  2. biotite/application/dssp/app.py +18 -18
  3. biotite/database/rcsb/download.py +19 -14
  4. biotite/sequence/align/banded.c +258 -237
  5. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  6. biotite/sequence/align/kmeralphabet.c +243 -222
  7. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  8. biotite/sequence/align/kmersimilarity.c +215 -196
  9. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  10. biotite/sequence/align/kmertable.cpp +233 -205
  11. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  12. biotite/sequence/align/localgapped.c +258 -237
  13. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  14. biotite/sequence/align/localungapped.c +235 -214
  15. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  16. biotite/sequence/align/multiple.c +255 -234
  17. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  18. biotite/sequence/align/pairwise.c +274 -253
  19. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  20. biotite/sequence/align/permutation.c +215 -196
  21. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  22. biotite/sequence/align/selector.c +217 -197
  23. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  24. biotite/sequence/align/tracetable.c +215 -195
  25. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  26. biotite/sequence/codec.c +235 -214
  27. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  28. biotite/sequence/phylo/nj.c +215 -196
  29. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  30. biotite/sequence/phylo/tree.c +227 -202
  31. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  32. biotite/sequence/phylo/upgma.c +215 -196
  33. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  34. biotite/structure/basepairs.py +7 -12
  35. biotite/structure/bonds.c +1175 -1226
  36. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  37. biotite/structure/celllist.c +217 -197
  38. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  39. biotite/structure/charges.c +1052 -1101
  40. biotite/structure/charges.cpython-310-darwin.so +0 -0
  41. biotite/structure/filter.py +30 -37
  42. biotite/structure/info/__init__.py +5 -8
  43. biotite/structure/info/atoms.py +25 -67
  44. biotite/structure/info/bonds.py +46 -100
  45. biotite/structure/info/ccd/README.rst +8 -0
  46. biotite/structure/info/ccd/amino_acids.txt +1646 -0
  47. biotite/structure/info/ccd/carbohydrates.txt +1133 -0
  48. biotite/structure/info/ccd/components.bcif +0 -0
  49. biotite/structure/info/ccd/nucleotides.txt +797 -0
  50. biotite/structure/info/ccd.py +95 -0
  51. biotite/structure/info/groups.py +90 -0
  52. biotite/structure/info/masses.py +21 -20
  53. biotite/structure/info/misc.py +11 -22
  54. biotite/structure/info/standardize.py +17 -12
  55. biotite/structure/io/__init__.py +2 -4
  56. biotite/structure/io/ctab.py +1 -1
  57. biotite/structure/io/general.py +37 -43
  58. biotite/structure/io/mmtf/__init__.py +3 -0
  59. biotite/structure/io/mmtf/convertarray.c +219 -198
  60. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  61. biotite/structure/io/mmtf/convertfile.c +217 -197
  62. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  63. biotite/structure/io/mmtf/decode.c +225 -204
  64. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  65. biotite/structure/io/mmtf/encode.c +215 -196
  66. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  67. biotite/structure/io/mmtf/file.py +34 -26
  68. biotite/structure/io/npz/__init__.py +3 -0
  69. biotite/structure/io/npz/file.py +21 -18
  70. biotite/structure/io/pdb/__init__.py +3 -3
  71. biotite/structure/io/pdb/file.py +5 -3
  72. biotite/structure/io/pdb/hybrid36.c +63 -43
  73. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  74. biotite/structure/io/pdbqt/file.py +32 -32
  75. biotite/structure/io/pdbx/__init__.py +13 -6
  76. biotite/structure/io/pdbx/bcif.py +649 -0
  77. biotite/structure/io/pdbx/cif.py +1028 -0
  78. biotite/structure/io/pdbx/component.py +243 -0
  79. biotite/structure/io/pdbx/convert.py +707 -359
  80. biotite/structure/io/pdbx/encoding.c +112813 -0
  81. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  82. biotite/structure/io/pdbx/error.py +14 -0
  83. biotite/structure/io/pdbx/legacy.py +267 -0
  84. biotite/structure/molecules.py +151 -151
  85. biotite/structure/sasa.c +215 -196
  86. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  87. biotite/structure/superimpose.py +158 -115
  88. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
  89. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/RECORD +92 -90
  90. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
  91. biotite/structure/info/amino_acids.json +0 -1556
  92. biotite/structure/info/amino_acids.py +0 -42
  93. biotite/structure/info/carbohydrates.json +0 -1122
  94. biotite/structure/info/carbohydrates.py +0 -39
  95. biotite/structure/info/intra_bonds.msgpack +0 -0
  96. biotite/structure/info/link_types.msgpack +0 -1
  97. biotite/structure/info/nucleotides.json +0 -772
  98. biotite/structure/info/nucleotides.py +0 -39
  99. biotite/structure/info/residue_masses.msgpack +0 -0
  100. biotite/structure/info/residue_names.msgpack +0 -3
  101. biotite/structure/info/residues.msgpack +0 -0
  102. biotite/structure/io/pdbx/file.py +0 -652
  103. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
  104. {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1028 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdbx"
6
+ __author__ = "Patrick Kunzmann"
7
+ __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
8
+
9
+ import itertools
10
+ import shlex
11
+ from collections.abc import MutableMapping, Sequence
12
+ import numpy as np
13
+ from .component import _Component, MaskValue
14
+ from .error import DeserializationError, SerializationError
15
+ from ....file import File, is_open_compatible, is_text
16
+
17
+
18
+ UNICODE_CHAR_SIZE = 4
19
+
20
+
21
+ # Small class without much functionality
22
+ # It exists merely for consistency with BinaryCIFFile
23
class CIFData:
    """
    Thin wrapper around the array holding the values of a
    :class:`CIFColumn`.

    The class offers little functionality on its own; it mirrors the
    component layout used for BinaryCIF files.

    Parameters
    ----------
    array : array_like or int or float or str
        The values to store.
        A single item is wrapped into an array.
    dtype : dtype-like, optional
        If given, the stored array is converted to this *dtype*.

    Attributes
    ----------
    array : ndarray
        The stored data array.

    Notes
    -----
    When a :class:`CIFFile` is written, the data type is automatically
    converted to string.
    Conversely, data read from a :class:`CIFFile` always has a string
    type.

    Examples
    --------

    >>> data = CIFData([1, 2, 3])
    >>> print(data.array)
    [1 2 3]
    >>> print(len(data))
    3
    >>> # A single item is converted into an array
    >>> data = CIFData("apple")
    >>> print(data.array)
    ['apple']
    """

    def __init__(self, array, dtype=None):
        converted = _arrayfy(array)
        # Arrays of arbitrary Python objects are rejected
        if np.issubdtype(converted.dtype, np.object_):
            raise ValueError("Object arrays are not supported")
        self._array = converted if dtype is None else converted.astype(dtype)

    @property
    def array(self):
        """The stored :class:`ndarray`."""
        return self._array

    @staticmethod
    def subcomponent_class():
        """Return ``None``: this component has no subcomponents."""
        return None

    @staticmethod
    def supercomponent_class():
        """Return the class of the enclosing component."""
        return CIFColumn

    def __len__(self):
        return len(self._array)

    def __eq__(self, other):
        # Only instances of the same class can be equal
        return isinstance(other, type(self)) and np.array_equal(
            self._array, other._array
        )
87
+
88
+
89
class CIFColumn:
    """
    This class represents a single column in a :class:`CIFCategory`.

    Parameters
    ----------
    data : CIFData or array_like or int or float or str
        The data to be stored.
        If no :class:`CIFData` is given, the passed argument is
        coerced into such an object (with string data type).
    mask : CIFData or array_like, dtype=int or int
        The mask to be stored.
        If given, the mask indicates whether the `data` is
        inapplicable (``.``) or missing (``?``) in some rows.
        The data presence is indicated by values from the
        :class:`MaskValue` enum.
        If no :class:`CIFData` is given, the passed argument is
        coerced into such an object.
        By default, the mask is derived from ``'.'`` and ``'?'``
        entries in `data`; if there are none, no mask is created.

    Attributes
    ----------
    data : CIFData
        The stored data.
    mask : CIFData
        The mask that indicates whether certain data elements are
        inapplicable or missing.
        If no mask is present, this attribute is ``None``.

    Raises
    ------
    IndexError
        If an explicitly given `mask` has a different length than
        `data`.

    Examples
    --------

    >>> print(CIFColumn([1, 2, 3]).as_array())
    ['1' '2' '3']
    >>> mask = [MaskValue.PRESENT, MaskValue.INAPPLICABLE, MaskValue.MISSING]
    >>> print(CIFColumn([1, 2, 3], mask).as_array())
    ['1' '.' '?']
    >>> print(CIFColumn([1]).as_item())
    1
    >>> print(CIFColumn([1], mask=[MaskValue.MISSING]).as_item())
    ?
    """

    def __init__(self, data, mask=None):
        if not isinstance(data, CIFData):
            data = CIFData(data, str)
        if mask is None:
            # Derive the mask from the CIF placeholders in the data
            mask = np.full(
                len(data), MaskValue.PRESENT, dtype=np.uint8
            )
            mask[data.array == "."] = MaskValue.INAPPLICABLE
            mask[data.array == "?"] = MaskValue.MISSING
            if np.all(mask == MaskValue.PRESENT):
                # No mask required
                mask = None
            else:
                mask = CIFData(mask)
        else:
            if not isinstance(mask, CIFData):
                mask = CIFData(mask, np.uint8)
            if len(mask) != len(data):
                raise IndexError(
                    f"Data has length {len(data)}, "
                    f"but mask has length {len(mask)}"
                )
        self._data = data
        self._mask = mask

    @property
    def data(self):
        return self._data

    @property
    def mask(self):
        return self._mask

    @staticmethod
    def subcomponent_class():
        return CIFData

    @staticmethod
    def supercomponent_class():
        return CIFCategory

    def as_item(self):
        """
        Get the only item in the data of this column.

        If the data is masked as inapplicable or missing, ``'.'`` or
        ``'?'`` is returned, respectively.
        If the data contains more than one item, an exception is raised.

        Returns
        -------
        item : str
            The item in the data.
            If the column has a mask, a present item is converted to
            a string (floats with three decimals).
            If the column has no mask, the item of the data array is
            returned unchanged.
        """
        if self._mask is None:
            return self._data.array.item()
        mask = self._mask.array.item()
        if mask == MaskValue.PRESENT:
            item = self._data.array.item()
            # Limit float precision to 3 decimals
            if isinstance(item, float):
                return f"{item:.3f}"
            else:
                return str(item)
        elif mask == MaskValue.INAPPLICABLE:
            return "."
        elif mask == MaskValue.MISSING:
            return "?"

    def as_array(self, dtype=str, masked_value=None):
        """
        Get the data of this column as an :class:`ndarray`.

        This is a shortcut to get ``CIFColumn.data.array``.
        Furthermore, the mask is applied to the data.

        Parameters
        ----------
        dtype : dtype-like, optional
            The data type the array should be converted to.
            By default, a string type is used.
        masked_value : str, optional
            The value that should be used for masked elements, i.e.
            ``MaskValue.INAPPLICABLE`` or ``MaskValue.MISSING``.
            By default, masked elements are converted to ``'.'`` or
            ``'?'`` depending on the :class:`MaskValue`, if `dtype`
            is a string type.
            For other data types masked elements are ``0`` by default.

        Returns
        -------
        array : ndarray
            The column data in the requested `dtype`.
        """
        if self._mask is None:
            # No masked elements -> plain conversion (no forced copy)
            return self._data.array.astype(dtype, copy=False)

        elif np.issubdtype(dtype, np.str_):
            # Limit float precision to 3 decimals
            if np.issubdtype(self._data.array.dtype, np.floating):
                array = np.array(
                    [f"{e:.3f}" for e in self._data.array], dtype=dtype
                )
            else:
                # Copy, as otherwise original data would be overwritten
                # with mask values
                array = self._data.array.astype(dtype, copy=True)
            if masked_value is None:
                array[self._mask.array == MaskValue.INAPPLICABLE] = "."
                array[self._mask.array == MaskValue.MISSING] = "?"
            else:
                array[self._mask.array == MaskValue.INAPPLICABLE] = masked_value
                array[self._mask.array == MaskValue.MISSING] = masked_value
            return array

        else:
            # Array needs to be converted, but masked values are
            # not necessarily convertible
            # (e.g. '' cannot be converted to int)
            if masked_value is None:
                array = np.zeros(len(self._data), dtype=dtype)
            else:
                array = np.full(len(self._data), masked_value, dtype=dtype)

            # Only convert the elements that are actually present
            present_mask = self._mask.array == MaskValue.PRESENT
            array[present_mask] = (
                self._data.array[present_mask].astype(dtype)
            )
            return array

    def __len__(self):
        return len(self._data)

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        if self._data != other._data:
            return False
        if self._mask != other._mask:
            return False
        return True
266
+
267
+
268
class CIFCategory(_Component, MutableMapping):
    """
    This class represents a category in a :class:`CIFBlock`.

    Columns can be accessed and modified like a dictionary.
    The values are :class:`CIFColumn` objects.

    Parameters
    ----------
    columns : dict, optional
        The columns of the category.
        The keys are the column names and the values are the
        :class:`CIFColumn` objects (or objects that can be coerced into
        a :class:`CIFColumn`).
        By default, an empty category is created.
        Each column must have the same length.
    name : str, optional
        The name of the category.
        This is only used for serialization and is automatically set,
        when the :class:`CIFCategory` is added to a :class:`CIFBlock`.
        It only needs to be set manually, when the category is directly
        serialized.

    Attributes
    ----------
    name : str
        The name of the category.
    row_count : int
        The number of rows in the category, i.e. the length of each
        column.

    Notes
    -----
    When a column containing strings with line breaks is added, these
    strings are written as multiline strings to the CIF file.

    Examples
    --------

    >>> # Add column on creation
    >>> category = CIFCategory({"fruit": ["apple", "banana"]}, name="fruits")
    >>> # Add column later on
    >>> category["taste"] = ["delicious", "tasty"]
    >>> # Add column the formal way
    >>> category["color"] = CIFColumn(CIFData(["red", "yellow"]))
    >>> # Access a column
    >>> print(category["fruit"].as_array())
    ['apple' 'banana']
    >>> print(category.serialize())
    loop_
    _fruits.fruit
    _fruits.taste
    _fruits.color
    apple  delicious red
    banana tasty     yellow
    """

    def __init__(self, columns=None, name=None):
        self._name = name
        if columns is None:
            columns = {}
        else:
            # Coerce everything that is not a column yet
            columns = {
                key: CIFColumn(col) if not isinstance(col, CIFColumn) else col
                for key, col in columns.items()
            }

        # Determined lazily from the columns
        self._row_count = None
        self._columns = columns

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def row_count(self):
        if self._row_count is None:
            # Row count is not determined yet
            # -> check the length of the first column
            self._row_count = len(next(iter(self.values())))
        return self._row_count

    @staticmethod
    def subcomponent_class():
        return CIFColumn

    @staticmethod
    def supercomponent_class():
        return CIFBlock

    @staticmethod
    def deserialize(text, expect_whitespace=True):
        """
        Create a :class:`CIFCategory` from its text representation.

        Parameters
        ----------
        text : str
            The lines of the category, including the ``loop_`` line,
            if the category is looped.
        expect_whitespace : bool, optional
            If true, the values of a looped category are split with
            :func:`shlex.split()`, so quoted values may contain
            whitespace.
            Otherwise a plain whitespace split is used.

        Returns
        -------
        category : CIFCategory
            The deserialized category.

        Raises
        ------
        DeserializationError
            If the text contains no content lines or the category name
            cannot be parsed.
        """
        lines = [
            line.strip() for line in text.splitlines() if not _is_empty(line)
        ]
        if not lines:
            raise DeserializationError("Category contains no content")

        if _is_loop_start(lines[0]):
            is_looped = True
            lines.pop(0)
            if not lines:
                raise DeserializationError("Looped category has no columns")
        else:
            is_looped = False

        category_name = _parse_category_name(lines[0])
        if category_name is None:
            raise DeserializationError(
                "Failed to parse category name"
            )

        lines = _to_single(lines, is_looped)
        if is_looped:
            category_dict = CIFCategory._deserialize_looped(
                lines, expect_whitespace
            )
        else:
            category_dict = CIFCategory._deserialize_single(lines)
        return CIFCategory(category_dict, category_name)

    def serialize(self):
        """
        Convert this category into its text representation.

        Returns
        -------
        text : str
            The serialized category with a terminal line break.
            A category with a single row is written as key-value
            pairs, otherwise as ``loop_`` table.

        Raises
        ------
        SerializationError
            If no category name is set or the columns have different
            lengths.
        ValueError
            If the category has no columns.
        """
        if self._name is None:
            raise SerializationError("Category name is required")
        if not self._columns:
            raise ValueError("At least one column is required")

        # Always derive the row count from the current columns:
        # a cached value may be outdated, if columns were replaced
        row_count = None
        for column_name, column in self.items():
            if row_count is None:
                row_count = len(column)
            elif len(column) != row_count:
                raise SerializationError(
                    f"All columns must have the same length, "
                    f"but '{column_name}' has length {len(column)}, "
                    f"while the first column has row_count {row_count}"
                )
        self._row_count = row_count

        if self._row_count == 1:
            lines = self._serialize_single()
        else:
            lines = self._serialize_looped()
        # Enforce terminal line break
        lines.append("")
        return "\n".join(lines)

    def __getitem__(self, key):
        return self._columns[key]

    def __setitem__(self, key, column):
        if not isinstance(column, CIFColumn):
            column = CIFColumn(column)
        self._columns[key] = column
        # The cached row count may not be valid anymore
        self._row_count = None

    def __delitem__(self, key):
        if len(self._columns) == 1:
            raise ValueError("At least one column must remain")
        del self._columns[key]
        # The cached row count may not be valid anymore
        self._row_count = None

    def __iter__(self):
        return iter(self._columns)

    def __len__(self):
        return len(self._columns)

    def __eq__(self, other):
        # Row count can be omitted here, as it is based on the columns
        if not isinstance(other, type(self)):
            return False
        if set(self.keys()) != set(other.keys()):
            return False
        for col_name in self.keys():
            if self[col_name] != other[col_name]:
                return False
        return True

    @staticmethod
    def _deserialize_single(lines):
        """
        Process a category where each field has a single value.
        """
        category_dict = {}
        for line in lines:
            parts = shlex.split(line)
            # The key has the form '_<category>.<column>'
            column_name = parts[0].split(".")[1]
            column = parts[1]
            category_dict[column_name] = CIFColumn(column)
        return category_dict

    @staticmethod
    def _deserialize_looped(lines, expect_whitespace):
        """
        Process a category where each field has multiple values
        (category is a table).
        """
        category_dict = {}
        column_names = []
        # The leading lines starting with '_' are the column keys
        for key_line in lines:
            if key_line[0] != "_":
                break
            key = key_line.split(".")[1]
            column_names.append(key)
            category_dict[key] = []

        data_lines = lines[len(column_names):]
        # Rows may be split over multiple lines -> do not rely on
        # row-line-alignment at all and simply cycle through columns
        column_names = itertools.cycle(column_names)
        for data_line in data_lines:
            # If whitespace is expected in quote protected values,
            # use standard shlex split
            # Otherwise use the much faster whitespace split
            # and quote removal if applicable,
            # bypassing the slow shlex module
            if expect_whitespace:
                values = shlex.split(data_line)
            else:
                values = [
                    val[1:-1]
                    if (val[0] == '"' and val[-1] == '"')
                    or (val[0] == "'" and val[-1] == "'")
                    else val
                    for val in data_line.split()
                ]
            for val in values:
                column_name = next(column_names)
                category_dict[column_name].append(val)

        return category_dict

    def _serialize_single(self):
        """
        Create the lines for a category with a single row:
        one ``key   value`` line per column.
        """
        keys = ["_" + self._name + "." + name for name in self.keys()]
        max_len = max(len(key) for key in keys)
        # "+3" Because of three whitespace chars after longest key
        req_len = max_len + 3
        return [
            key.ljust(req_len) + _multiline(_quote(column.as_item()))
            for key, column in zip(keys, self.values())
        ]

    def _serialize_looped(self):
        """
        Create the lines for a category with multiple rows:
        a ``loop_`` line, one line per column key and one line per row.
        """
        key_lines = [
            "_" + self._name + "." + key + " "
            for key in self.keys()
        ]

        column_arrays = []
        for column in self.values():
            array = column.as_array(str)
            # Quote before measuring the number of chars,
            # as the quote characters modify the length
            array = np.array(
                [_multiline(_quote(element)) for element in array]
            )
            column_arrays.append(array)

        # Number of characters the longest string in the column needs
        # This can be deduced from the dtype
        # The "+1" is for the small whitespace column
        column_n_chars = [
            array.dtype.itemsize // UNICODE_CHAR_SIZE + 1
            for array in column_arrays
        ]
        # 'rstrip()' removes the trailing justification of the last
        # column; its return value must be used, as strings are immutable
        value_lines = [
            "".join(
                array[i].ljust(n_chars)
                for array, n_chars in zip(column_arrays, column_n_chars)
            ).rstrip()
            for i in range(self._row_count)
        ]

        return ["loop_"] + key_lines + value_lines
542
+
543
+
544
class CIFBlock(_Component, MutableMapping):
    """
    This class represents a block in a :class:`CIFFile`.

    Categories can be accessed and modified like a dictionary.
    The values are :class:`CIFCategory` objects.

    Parameters
    ----------
    categories : dict, optional
        The categories of the block.
        The keys are the category names and the values are the
        :class:`CIFCategory` objects.
        By default, an empty block is created.

    Notes
    -----
    The category names do not include the leading underscore character.
    This character is automatically added when the category is
    serialized.

    Examples
    --------

    >>> # Add category on creation
    >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
    >>> # Add category later on
    >>> block["bar"] = CIFCategory({"another_column": [2, 3]})
    >>> # Access a column
    >>> print(block["bar"]["another_column"].as_array())
    ['2' '3']
    >>> print(block.serialize())
    _foo.some_column   1
    #
    loop_
    _bar.another_column
    2
    3
    #
    """

    def __init__(self, categories=None):
        if categories is None:
            categories = {}
        # Values are either 'CIFCategory' objects or their
        # not yet deserialized text
        self._categories = categories

    @staticmethod
    def subcomponent_class():
        return CIFCategory

    @staticmethod
    def supercomponent_class():
        return CIFFile

    @staticmethod
    def deserialize(text):
        """
        Create a :class:`CIFBlock` from its text representation.

        Only the start lines of the categories are indexed here;
        the categories themselves are kept as text until they are
        accessed.

        Parameters
        ----------
        text : str
            The lines of the block.

        Returns
        -------
        block : CIFBlock
            The deserialized block.

        Raises
        ------
        DeserializationError
            If a ``loop_`` line is not followed by any content.
        """
        lines = text.splitlines()
        current_category_name = None
        category_starts = []
        category_names = []
        for i, line in enumerate(lines):
            if _is_empty(line):
                continue
            is_loop_in_line = _is_loop_start(line)
            category_name_in_line = _parse_category_name(line)
            if is_loop_in_line or (
                category_name_in_line != current_category_name
                and category_name_in_line is not None
            ):
                # Track the new category
                if is_loop_in_line:
                    # In case of lines with "loop_" the category is
                    # in the next line with content
                    j = i + 1
                    while j < len(lines) and _is_empty(lines[j]):
                        j += 1
                    if j == len(lines):
                        raise DeserializationError(
                            "'loop_' is not followed by a category"
                        )
                    category_name_in_line = _parse_category_name(lines[j])
                current_category_name = category_name_in_line
                category_starts.append(i)
                category_names.append(current_category_name)
        return CIFBlock(_create_element_dict(
            lines, category_names, category_starts
        ))

    def serialize(self):
        """
        Convert this block into its text representation.

        Returns
        -------
        text : str
            The serialized categories, each one followed by a ``#``
            line.

        Raises
        ------
        SerializationError
            If a category cannot be serialized.
        """
        text_blocks = []
        for category_name, category in self._categories.items():
            if isinstance(category, str):
                # Category is already stored as lines
                text_blocks.append(category)
            else:
                try:
                    category.name = category_name
                    text_blocks.append(category.serialize())
                except Exception as exc:
                    # 'Exception' instead of a bare 'except', so that
                    # e.g. a keyboard interrupt is not converted
                    raise SerializationError(
                        f"Failed to serialize category '{category_name}'"
                    ) from exc
                # A comment line is set after each category
                text_blocks.append("#\n")
        return "".join(text_blocks)

    def __getitem__(self, key):
        category = self._categories[key]
        if isinstance(category, str):
            # Element is stored in serialized form
            # -> must be deserialized first
            try:
                # Special optimization for "atom_site":
                # Even if the values are quote protected,
                # no whitespace is expected in escaped values
                # Therefore slow shlex.split() call is not necessary
                expect_whitespace = key != "atom_site"
                category = CIFCategory.deserialize(category, expect_whitespace)
            except Exception as exc:
                raise DeserializationError(
                    f"Failed to deserialize category '{key}'"
                ) from exc
            # Update with deserialized object
            self._categories[key] = category
        return category

    def __setitem__(self, key, category):
        if not isinstance(category, CIFCategory):
            raise TypeError(
                f"Expected 'CIFCategory', but got '{type(category).__name__}'"
            )
        category.name = key
        self._categories[key] = category

    def __delitem__(self, key):
        del self._categories[key]

    def __iter__(self):
        return iter(self._categories)

    def __len__(self):
        return len(self._categories)

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        if set(self.keys()) != set(other.keys()):
            return False
        for cat_name in self.keys():
            if self[cat_name] != other[cat_name]:
                return False
        return True
693
+
694
+
695
+ class CIFFile(_Component, File, MutableMapping):
696
+ """
697
+ This class represents a CIF file.
698
+
699
+ The categories of the file can be accessed and modified like a
700
+ dictionary.
701
+ The values are :class:`CIFBlock` objects.
702
+
703
+ To parse or write a structure from/to a :class:`CIFFile` object,
704
+ use the high-level :func:`get_structure()` or
705
+ :func:`set_structure()` function respectively.
706
+
707
+ Notes
708
+ -----
709
+ The content of CIF files are lazily deserialized:
710
+ When reading the file only the line positions of all blocks are
711
+ indexed.
712
+ The time consuming deserialization of a block/category is only
713
+ performed when accessed.
714
+ The deserialized :class:`CIFBlock`/:class:`CIFCategory` objects
715
+ are cached for subsequent accesses.
716
+
717
+ Attributes
718
+ ----------
719
+ block : CIFBlock
720
+ The sole block of the file.
721
+ If the file contains multiple blocks, an exception is raised.
722
+
723
+ Examples
724
+ --------
725
+ Read a CIF file and access its content:
726
+
727
+ >>> import os.path
728
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
729
+ >>> print(file["1L2Y"]["citation_author"]["name"].as_array())
730
+ ['Neidigh, J.W.' 'Fesinmeyer, R.M.' 'Andersen, N.H.']
731
+ >>> # Access the only block in the file
732
+ >>> print(file.block["entity"]["pdbx_description"].as_item())
733
+ TC5b
734
+
735
+ Create a CIF file and write it to disk:
736
+
737
+ >>> category = CIFCategory(
738
+ ... {"some_column": "some_value", "another_column": "another_value"}
739
+ ... )
740
+ >>> block = CIFBlock({"some_category": category, "another_category": category})
741
+ >>> file = CIFFile({"some_block": block, "another_block": block})
742
+ >>> print(file.serialize())
743
+ data_some_block
744
+ #
745
+ _some_category.some_column some_value
746
+ _some_category.another_column another_value
747
+ #
748
+ _another_category.some_column some_value
749
+ _another_category.another_column another_value
750
+ #
751
+ data_another_block
752
+ #
753
+ _some_category.some_column some_value
754
+ _some_category.another_column another_value
755
+ #
756
+ _another_category.some_column some_value
757
+ _another_category.another_column another_value
758
+ #
759
+ >>> file.write(os.path.join(path_to_directory, "some_file.cif"))
760
+ """
761
+
762
+ def __init__(self, blocks=None):
763
+ if blocks is None:
764
+ blocks = {}
765
+ self._blocks = blocks
766
+
767
+ @property
768
+ def lines(self):
769
+ return "\n".join(self.serialize())
770
+
771
+ @property
772
+ def block(self):
773
+ if len(self) != 1:
774
+ raise ValueError("There are multiple blocks in the file")
775
+ return self[next(iter(self))]
776
+
777
+ @staticmethod
778
+ def subcomponent_class():
779
+ return CIFBlock
780
+
781
+ @staticmethod
782
+ def supercomponent_class():
783
+ return None
784
+
785
+ @staticmethod
786
+ def deserialize(text):
787
+ lines = text.splitlines()
788
+ block_starts = []
789
+ block_names = []
790
+ for i, line in enumerate(lines):
791
+ if not _is_empty(line):
792
+ data_block_name = _parse_data_block_name(line)
793
+ if data_block_name is not None:
794
+ block_starts.append(i)
795
+ block_names.append(data_block_name)
796
+ return CIFFile(_create_element_dict(lines, block_names, block_starts))
797
+
798
+ def serialize(self):
799
+ text_blocks = []
800
+ for block_name, block in self._blocks.items():
801
+ text_blocks.append("data_" + block_name + "\n")
802
+ # A comment line is set after the block indicator
803
+ text_blocks.append("#\n")
804
+ if isinstance(block, str):
805
+ # Block is already stored as text
806
+ text_blocks.append(block)
807
+ else:
808
+ try:
809
+ text_blocks.append(block.serialize())
810
+ except:
811
+ raise SerializationError(
812
+ f"Failed to serialize block '{block_name}'"
813
+ )
814
+ # Enforce terminal line break
815
+ text_blocks.append("")
816
+ return "".join(text_blocks)
817
+
818
+ @classmethod
819
+ def read(cls, file):
820
+ """
821
+ Read a CIF file.
822
+
823
+ Parameters
824
+ ----------
825
+ file : file-like object or str
826
+ The file to be read.
827
+ Alternatively a file path can be supplied.
828
+
829
+ Returns
830
+ -------
831
+ file_object : CIFFile
832
+ The parsed file.
833
+ """
834
+ # File name
835
+ if is_open_compatible(file):
836
+ with open(file, "r") as f:
837
+ text = f.read()
838
+ # File object
839
+ else:
840
+ if not is_text(file):
841
+ raise TypeError("A file opened in 'text' mode is required")
842
+ text = file.read()
843
+ return CIFFile.deserialize(text)
844
+
845
def write(self, file):
    """
    Write the contents of this object into a CIF file.

    Parameters
    ----------
    file : file-like object or str
        The file to be written to.
        Alternatively a file path can be supplied.

    Raises
    ------
    TypeError
        If a file object is given, that does not pass the text mode
        check.
    """
    if is_open_compatible(file):
        # Serialize before opening the file:
        # Mode "w" truncates the target, so a failing serialization
        # must not leave an emptied file behind
        text = self.serialize()
        with open(file, "w") as f:
            f.write(text)
    else:
        if not is_text(file):
            raise TypeError("A file opened in 'text' mode is required")
        file.write(self.serialize())
862
+
863
+ def __getitem__(self, key):
864
+ block = self._blocks[key]
865
+ if isinstance(block, str):
866
+ # Element is stored in serialized form
867
+ # -> must be deserialized first
868
+ try:
869
+ block = CIFBlock.deserialize(block)
870
+ except:
871
+ raise DeserializationError(
872
+ f"Failed to deserialize block '{key}'"
873
+ )
874
+ # Update with deserialized object
875
+ self._blocks[key] = block
876
+ return block
877
+
878
def __setitem__(self, key, block):
    """
    Store a block under the given name.

    Parameters
    ----------
    key : str
        The name of the block.
    block : CIFBlock
        The block to be stored.

    Raises
    ------
    TypeError
        If `block` is not a :class:`CIFBlock`.
    """
    if isinstance(block, CIFBlock):
        self._blocks[key] = block
        return
    raise TypeError(
        f"Expected 'CIFBlock', but got '{type(block).__name__}'"
    )
884
+
885
+ def __delitem__(self, key):
886
+ del self._blocks[key]
887
+
888
+ def __iter__(self):
889
+ return iter(self._blocks)
890
+
891
+ def __len__(self):
892
+ return len(self._blocks)
893
+
894
+ def __eq__(self, other):
895
+ if not isinstance(other, type(self)):
896
+ return False
897
+ if set(self.keys()) != set(other.keys()):
898
+ return False
899
+ for block_name in self.keys():
900
+ if self[block_name] != other[block_name]:
901
+ return False
902
+ return True
903
+
904
+
905
+ def _is_empty(line):
906
+ return len(line.strip()) == 0 or line[0] == "#"
907
+
908
+
909
+ def _create_element_dict(lines, element_names, element_starts):
910
+ """
911
+ Create a dict mapping the `element_names` to the corresponding
912
+ `lines`, which are located between ``element_starts[i]`` and
913
+ ``element_starts[i+1]``.
914
+ """
915
+ # Add exclusive stop to indices for easier slicing
916
+ element_starts.append(len(lines))
917
+ # Lazy deserialization
918
+ # -> keep as text for now and deserialize later if needed
919
+ return {
920
+ element_name: "\n".join(lines[element_starts[i] : element_starts[i+1]])
921
+ for i, element_name in enumerate(element_names)
922
+ }
923
+
924
+
925
+ def _parse_data_block_name(line):
926
+ """
927
+ If the line defines a data block, return this name.
928
+ Return ``None`` otherwise.
929
+ """
930
+ if line.startswith("data_"):
931
+ return line[5:]
932
+ else:
933
+ return None
934
+
935
+
936
+ def _parse_category_name(line):
937
+ """
938
+ If the line defines a category, return this name.
939
+ Return ``None`` otherwise.
940
+ """
941
+ if line[0] != "_":
942
+ return None
943
+ else:
944
+ return line[1 : line.find(".")]
945
+
946
+
947
+ def _is_loop_start(line):
948
+ """
949
+ Return whether the line starts a looped category.
950
+ """
951
+ return line.startswith("loop_")
952
+
953
+
954
+ def _to_single(lines, is_looped):
955
+ """
956
+ Convert multiline values into singleline values
957
+ (in terms of 'lines' list elements).
958
+ Linebreaks are preserved.
959
+ """
960
+ processed_lines = [None] * len(lines)
961
+ in_i = 0
962
+ out_i = 0
963
+ while in_i < len(lines):
964
+ if lines[in_i][0] == ";":
965
+ # Multiline value
966
+ multi_line_str = lines[in_i][1:]
967
+ j = in_i + 1
968
+ while lines[j] != ";":
969
+ # Preserve linebreaks
970
+ multi_line_str += "\n" + lines[j]
971
+ j += 1
972
+ if is_looped:
973
+ # Create a line for the multiline string only
974
+ processed_lines[out_i] = shlex.quote(multi_line_str)
975
+ out_i += 1
976
+ else:
977
+ # Append multiline string to previous line
978
+ processed_lines[out_i - 1] += " " + shlex.quote(multi_line_str)
979
+ in_i = j + 1
980
+
981
+ elif not is_looped and lines[in_i][0] in ["'", '"']:
982
+ # Singleline value in the line after the corresponding key
983
+ processed_lines[out_i - 1] += " " + lines[in_i]
984
+ in_i += 1
985
+
986
+ else:
987
+ # Normal singleline value in the same row as the key
988
+ processed_lines[out_i] = lines[in_i]
989
+ in_i += 1
990
+ out_i += 1
991
+
992
+ return [line for line in processed_lines if line is not None]
993
+
994
+
995
+ def _quote(value):
996
+ """
997
+ A less secure but much quicker version of ``shlex.quote()``.
998
+ """
999
+ if len(value) == 0:
1000
+ return "''"
1001
+ elif value[0] == "_":
1002
+ return "'" + value + "'"
1003
+ elif "'" in value:
1004
+ return '"' + value + '"'
1005
+ elif '"' in value:
1006
+ return "'" + value + "'"
1007
+ elif " " in value:
1008
+ return "'" + value + "'"
1009
+ elif "\t" in value:
1010
+ return "'" + value + "'"
1011
+ else:
1012
+ return value
1013
+
1014
+
1015
+ def _multiline(value):
1016
+ """
1017
+ Convert a string containing linebreaks into CIF-compatible
1018
+ multiline string.
1019
+ """
1020
+ if "\n" in value:
1021
+ return "\n;" + value + "\n;\n"
1022
+ return value
1023
+
1024
+
1025
+ def _arrayfy(data):
1026
+ if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):
1027
+ data = [data]
1028
+ return np.asarray(data)