biotite 1.0.0__cp312-cp312-macosx_11_0_arm64.whl → 1.1.0__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (92)
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  17. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  20. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  26. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  27. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  28. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  29. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  37. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +156 -43
  51. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  54. biotite/structure/charges.cpython-312-darwin.so +0 -0
  55. biotite/structure/filter.py +1 -1
  56. biotite/structure/geometry.py +60 -113
  57. biotite/structure/info/__init__.py +1 -0
  58. biotite/structure/info/atoms.py +13 -13
  59. biotite/structure/info/bonds.py +12 -6
  60. biotite/structure/info/ccd.py +125 -32
  61. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  62. biotite/structure/info/groups.py +63 -17
  63. biotite/structure/info/masses.py +9 -6
  64. biotite/structure/info/misc.py +15 -21
  65. biotite/structure/info/standardize.py +3 -2
  66. biotite/structure/io/mol/sdf.py +41 -40
  67. biotite/structure/io/pdb/convert.py +2 -0
  68. biotite/structure/io/pdb/file.py +74 -3
  69. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  70. biotite/structure/io/pdbqt/file.py +32 -32
  71. biotite/structure/io/pdbx/__init__.py +1 -0
  72. biotite/structure/io/pdbx/bcif.py +32 -8
  73. biotite/structure/io/pdbx/cif.py +148 -107
  74. biotite/structure/io/pdbx/component.py +9 -4
  75. biotite/structure/io/pdbx/compress.py +321 -0
  76. biotite/structure/io/pdbx/convert.py +227 -68
  77. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  78. biotite/structure/io/pdbx/encoding.pyx +98 -17
  79. biotite/structure/io/trajfile.py +16 -16
  80. biotite/structure/molecules.py +141 -141
  81. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  82. biotite/structure/segments.py +1 -2
  83. biotite/structure/util.py +73 -1
  84. biotite/version.py +2 -2
  85. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
  86. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
  87. biotite/structure/info/ccd/README.rst +0 -8
  88. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  89. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  90. biotite/structure/info/ccd/nucleotides.txt +0 -798
  91. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  92. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
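
Among the largest additions are the new biotite.structure.alphabet subpackage (items 42-49, which adds the 3Di and Protein Blocks structural alphabets) and the BinaryCIF compression module biotite/structure/io/pdbx/compress.py (item 75), whose full diff is shown below. As a rough usage sketch, not taken from the diff itself: it assumes that compress() is re-exported from biotite.structure.io.pdbx (which the one-line change to that package's __init__.py suggests) and uses an arbitrary example structure.

import biotite.structure.info as info
import biotite.structure.io.pdbx as pdbx

# Build a small BinaryCIF file from any AtomArray
# (info.residue("ATP") is just an arbitrary example structure)
atoms = info.residue("ATP")
pdbx_file = pdbx.BinaryCIFFile()
pdbx.set_structure(pdbx_file, atoms)
# Test the available encodings for each data array and keep the smallest result;
# assumes compress() is exposed in the pdbx subpackage, see the diff below
pdbx_file = pdbx.compress(pdbx_file, float_tolerance=1e-6)
pdbx_file.write("compressed.bcif")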
biotite/structure/io/pdbx/compress.py (new file)
@@ -0,0 +1,321 @@
+ __all__ = ["compress"]
+ __name__ = "biotite.structure.io.pdbx"
+ __author__ = "Patrick Kunzmann"
+
+ import itertools
+ import msgpack
+ import numpy as np
+ import biotite.structure.io.pdbx.bcif as bcif
+ from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
+ from biotite.structure.io.pdbx.encoding import (
+     ByteArrayEncoding,
+     DeltaEncoding,
+     FixedPointEncoding,
+     IntegerPackingEncoding,
+     RunLengthEncoding,
+     StringArrayEncoding,
+ )
+
+
+ def compress(data, float_tolerance=1e-6):
+     """
+     Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
+     different data encodings for each data array and selecting the one that results in
+     the smallest size.
+
+     Parameters
+     ----------
+     data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
+         The data to compress.
+     float_tolerance : float, optional
+         The relative error that is accepted when compressing floating point numbers.
+
+     Returns
+     -------
+     compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
+         The compressed data with the same type as the input data.
+         If no improved compression is found for a :class:`BinaryCIFData` array,
+         the input data is kept.
+         Hence, the return value is not a deep copy of the input data.
+
+     Examples
+     --------
+
+     >>> from io import BytesIO
+     >>> pdbx_file = BinaryCIFFile()
+     >>> set_structure(pdbx_file, atom_array_stack)
+     >>> # Write uncompressed file
+     >>> uncompressed_file = BytesIO()
+     >>> pdbx_file.write(uncompressed_file)
+     >>> _ = uncompressed_file.seek(0)
+     >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
+     927 KB
+     >>> # Write compressed file
+     >>> pdbx_file = compress(pdbx_file)
+     >>> compressed_file = BytesIO()
+     >>> pdbx_file.write(compressed_file)
+     >>> _ = compressed_file.seek(0)
+     >>> print(f"{len(compressed_file.read()) // 1000} KB")
+     111 KB
+     """
+     match type(data):
+         case bcif.BinaryCIFFile:
+             return _compress_file(data, float_tolerance)
+         case bcif.BinaryCIFBlock:
+             return _compress_block(data, float_tolerance)
+         case bcif.BinaryCIFCategory:
+             return _compress_category(data, float_tolerance)
+         case bcif.BinaryCIFColumn:
+             return _compress_column(data, float_tolerance)
+         case bcif.BinaryCIFData:
+             return _compress_data(data, float_tolerance)
+         case _:
+             raise TypeError(f"Unsupported type {type(data).__name__}")
+
+
+ def _compress_file(bcif_file, float_tolerance):
+     compressed_file = bcif.BinaryCIFFile()
+     for block_name, bcif_block in bcif_file.items():
+         compressed_block = _compress_block(bcif_block, float_tolerance)
+         compressed_file[block_name] = compressed_block
+     return compressed_file
+
+
+ def _compress_block(bcif_block, float_tolerance):
+     compressed_block = bcif.BinaryCIFBlock()
+     for category_name, bcif_category in bcif_block.items():
+         compressed_category = _compress_category(bcif_category, float_tolerance)
+         compressed_block[category_name] = compressed_category
+     return compressed_block
+
+
+ def _compress_category(bcif_category, float_tolerance):
+     compressed_category = bcif.BinaryCIFCategory()
+     for column_name, bcif_column in bcif_category.items():
+         compressed_column = _compress_column(bcif_column, float_tolerance)
+         compressed_category[column_name] = compressed_column
+     return compressed_category
+
+
+ def _compress_column(bcif_column, float_tolerance):
+     data = _compress_data(bcif_column.data, float_tolerance)
+     if bcif_column.mask is not None:
+         mask = _compress_data(bcif_column.mask, float_tolerance)
+     else:
+         mask = None
+     return bcif.BinaryCIFColumn(data, mask)
+
+
+ def _compress_data(bcif_data, float_tolerance):
+     array = bcif_data.array
+     if len(array) == 1:
+         # No need to compress a single value -> Use default uncompressed encoding
+         return bcif.BinaryCIFData(array)
+
+     if np.issubdtype(array.dtype, np.str_):
+         # Leave encoding empty for now, as it is explicitly set later
+         encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
+         # Run encode to initialize the data and offset arrays
+         indices = encoding.encode(array)
+         offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
+         encoding.data_encoding, _ = _find_best_integer_compression(indices)
+         encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
+         return bcif.BinaryCIFData(array, [encoding])
+
+     elif np.issubdtype(array.dtype, np.floating):
+         to_integer_encoding = FixedPointEncoding(
+             10 ** _get_decimal_places(array, float_tolerance)
+         )
+         integer_array = to_integer_encoding.encode(array)
+         best_encoding, size_compressed = _find_best_integer_compression(integer_array)
+         if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
+             return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
+         else:
+             # The float array is smaller -> encode it directly as bytes
+             return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
+
+     elif np.issubdtype(array.dtype, np.integer):
+         array = _to_smallest_integer_type(array)
+         encodings, _ = _find_best_integer_compression(array)
+         return bcif.BinaryCIFData(array, encodings)
+
+     else:
+         raise TypeError(f"Unsupported data type {array.dtype}")
+
+
+ def _find_best_integer_compression(array):
+     """
+     Try different data encodings on an integer array and return the one that results in
+     the smallest size.
+     """
+     best_encoding_sequence = None
+     smallest_size = np.inf
+
+     for use_delta in [False, True]:
+         if use_delta:
+             encoding = DeltaEncoding()
+             array_after_delta = encoding.encode(array)
+             encodings_after_delta = [encoding]
+         else:
+             encodings_after_delta = []
+             array_after_delta = array
+         for use_run_length in [False, True]:
+             # Use encoded data from previous step to save time
+             if use_run_length:
+                 encoding = RunLengthEncoding()
+                 array_after_rle = encoding.encode(array_after_delta)
+                 encodings_after_rle = encodings_after_delta + [encoding]
+             else:
+                 encodings_after_rle = encodings_after_delta
+                 array_after_rle = array_after_delta
+             for packed_byte_count in [None, 1, 2]:
+                 if packed_byte_count is not None:
+                     # Quickly check this heuristic
+                     # to avoid computing an exploding packed data array
+                     if (
+                         _estimate_packed_length(array_after_rle, packed_byte_count)
+                         >= array_after_rle.nbytes
+                     ):
+                         # Packing would not reduce the size
+                         continue
+                     encoding = IntegerPackingEncoding(packed_byte_count)
+                     array_after_packing = encoding.encode(array_after_rle)
+                     encodings_after_packing = encodings_after_rle + [encoding]
+                 else:
+                     encodings_after_packing = encodings_after_rle
+                     array_after_packing = array_after_rle
+                 encoding = ByteArrayEncoding()
+                 encoded_array = encoding.encode(array_after_packing)
+                 encodings = encodings_after_packing + [encoding]
+                 # Pack data directly instead of using the BinaryCIFData class
+                 # to avoid the unnecessary re-encoding of the array,
+                 # as it is already available in 'encoded_array'
+                 serialized_encoding = [enc.serialize() for enc in encodings]
+                 serialized_data = {
+                     "data": encoded_array,
+                     "encoding": serialized_encoding,
+                 }
+                 size = _data_size_in_file(serialized_data)
+                 if size < smallest_size:
+                     best_encoding_sequence = encodings
+                     smallest_size = size
+     return best_encoding_sequence, smallest_size
+
+
+ def _estimate_packed_length(array, packed_byte_count):
+     """
+     Estimate the length of an integer array after packing it with a given number of
+     bytes.
+
+     Parameters
+     ----------
+     array : numpy.ndarray
+         The array to pack.
+     packed_byte_count : int
+         The number of bytes used for packing.
+
+     Returns
+     -------
+     length : int
+         The estimated size of the packed array in bytes.
+     """
+     # Use int64 to avoid integer overflow in the following line
+     max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
+     n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
+     return np.sum(n_bytes_per_element, dtype=np.int64)
+
+
+ def _to_smallest_integer_type(array):
+     """
+     Convert an integer array to the smallest possible integer type that is still able
+     to represent all values in the array.
+
+     Parameters
+     ----------
+     array : numpy.ndarray
+         The array to convert.
+
+     Returns
+     -------
+     array : numpy.ndarray
+         The converted array.
+     """
+     if array.min() >= 0:
+         for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
+             if np.all(array <= np.iinfo(dtype).max):
+                 return array.astype(dtype)
+     for dtype in [np.int8, np.int16, np.int32, np.int64]:
+         if np.all(array >= np.iinfo(dtype).min) and np.all(
+             array <= np.iinfo(dtype).max
+         ):
+             return array.astype(dtype)
+     raise ValueError("Array is out of bounds for all integer types")
+
+
+ def _data_size_in_file(data):
+     """
+     Get the size the data would have when written into a *BinaryCIF* file.
+
+     Parameters
+     ----------
+     data : BinaryCIFData or dict
+         The data array whose size is measured.
+         Can be either a :class:`BinaryCIFData` object or already serialized data.
+
+     Returns
+     -------
+     size : int
+         The size of the data array in the file in bytes.
+     """
+     if isinstance(data, bcif.BinaryCIFData):
+         data = data.serialize()
+     bytes_in_file = msgpack.packb(data, use_bin_type=True, default=encode_numpy)
+     return len(bytes_in_file)
+
+
+ def _get_decimal_places(array, tol):
+     """
+     Get the number of decimal places in a floating point array.
+
+     Parameters
+     ----------
+     array : numpy.ndarray
+         The array to analyze.
+     tol : float
+         The relative tolerance allowed when the values are cut off after the returned
+         number of decimal places.
+
+     Returns
+     -------
+     decimals : int
+         The number of decimal places.
+     """
+     # Decimals of NaN or infinite values do not make sense
+     # and 0 would give NaN when rounding on decimals
+     array = array[np.isfinite(array) & (array != 0)]
+     for decimals in itertools.count(start=-_order_magnitude(array)):
+         error = np.abs(np.round(array, decimals) - array)
+         if np.all(error < tol * np.abs(array)):
+             return decimals
+
+
+ def _order_magnitude(array):
+     """
+     Get the order of magnitude of floating point values.
+
+     Parameters
+     ----------
+     array : ndarray, dtype=float
+         The values to analyze.
+
+     Returns
+     -------
+     magnitude : int
+         The order of magnitude, i.e. the maximum exponent a number in the array would
+         have in scientific notation, if only one digit is left of the decimal point.
+
+     """
+     array = array[array != 0]
+     if len(array) == 0:
+         # No non-zero values -> define order of magnitude as 0
+         return 0
+     return int(np.max(np.floor(np.log10(np.abs(array)))).item())
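
For context, _find_best_integer_compression() enumerates every combination of optional delta encoding, optional run-length encoding and optional 1- or 2-byte integer packing, serializes each candidate with msgpack and keeps the chain with the smallest payload; floating point columns additionally get a FixedPointEncoding in front of this search, with the decimal places chosen so that the relative error stays below float_tolerance. The following standalone snippet (illustrative only, the sample array is made up) applies one such candidate chain by hand, using the same encoding classes that compress.py imports:

import numpy as np
from biotite.structure.io.pdbx.encoding import (
    ByteArrayEncoding,
    DeltaEncoding,
    RunLengthEncoding,
)

# One candidate chain evaluated by the search: delta -> run-length -> raw bytes
array = np.array([1, 1, 1, 2, 3, 4, 5, 5, 5, 5], dtype=np.int32)
chain = [DeltaEncoding(), RunLengthEncoding(), ByteArrayEncoding()]
encoded = array
for encoding in chain:
    encoded = encoding.encode(encoded)
# compress() measures each candidate by its serialized msgpack size
# ("data" plus "encoding" metadata) and keeps the smallest one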