biotite 1.0.0__cp312-cp312-macosx_11_0_arm64.whl → 1.1.0__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +34 -0
- biotite/application/muscle/app3.py +2 -15
- biotite/application/muscle/app5.py +2 -2
- biotite/application/util.py +1 -1
- biotite/application/viennarna/rnaplot.py +6 -2
- biotite/database/rcsb/query.py +6 -6
- biotite/database/uniprot/check.py +20 -15
- biotite/database/uniprot/download.py +1 -1
- biotite/database/uniprot/query.py +1 -1
- biotite/sequence/align/alignment.py +16 -3
- biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +5 -5
- biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +17 -0
- biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +52 -42
- biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
- biotite/sequence/align/matrix.py +273 -55
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
- biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
- biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
- biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
- biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
- biotite/sequence/alphabet.py +3 -0
- biotite/sequence/codec.cpython-312-darwin.so +0 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
- biotite/sequence/profile.py +86 -4
- biotite/sequence/seqtypes.py +124 -3
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +4 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +110 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +171 -0
- biotite/structure/alphabet/unkerasify.py +122 -0
- biotite/structure/atoms.py +156 -43
- biotite/structure/bonds.cpython-312-darwin.so +0 -0
- biotite/structure/bonds.pyx +72 -21
- biotite/structure/celllist.cpython-312-darwin.so +0 -0
- biotite/structure/charges.cpython-312-darwin.so +0 -0
- biotite/structure/filter.py +1 -1
- biotite/structure/geometry.py +60 -113
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +13 -13
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -32
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +63 -17
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -21
- biotite/structure/info/standardize.py +3 -2
- biotite/structure/io/mol/sdf.py +41 -40
- biotite/structure/io/pdb/convert.py +2 -0
- biotite/structure/io/pdb/file.py +74 -3
- biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +32 -8
- biotite/structure/io/pdbx/cif.py +148 -107
- biotite/structure/io/pdbx/component.py +9 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +227 -68
- biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +16 -16
- biotite/structure/molecules.py +141 -141
- biotite/structure/sasa.cpython-312-darwin.so +0 -0
- biotite/structure/segments.py +1 -2
- biotite/structure/util.py +73 -1
- biotite/version.py +2 -2
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
__all__ = ["compress"]
|
|
2
|
+
__name__ = "biotite.structure.io.pdbx"
|
|
3
|
+
__author__ = "Patrick Kunzmann"
|
|
4
|
+
|
|
5
|
+
import itertools
|
|
6
|
+
import msgpack
|
|
7
|
+
import numpy as np
|
|
8
|
+
import biotite.structure.io.pdbx.bcif as bcif
|
|
9
|
+
from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
|
|
10
|
+
from biotite.structure.io.pdbx.encoding import (
|
|
11
|
+
ByteArrayEncoding,
|
|
12
|
+
DeltaEncoding,
|
|
13
|
+
FixedPointEncoding,
|
|
14
|
+
IntegerPackingEncoding,
|
|
15
|
+
RunLengthEncoding,
|
|
16
|
+
StringArrayEncoding,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compress(data, float_tolerance=1e-6):
    """
    Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by
    testing different data encodings for each data array and selecting the one
    which results in the smallest size.

    Parameters
    ----------
    data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
        The data to compress.
    float_tolerance : float, optional
        The relative error that is accepted when compressing floating point numbers.

    Returns
    -------
    compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
        The compressed data, having the same type as the input.
        Arrays for which no better compression is found are kept as-is,
        so the result is not necessarily a deep copy of the input.

    Examples
    --------

    >>> from io import BytesIO
    >>> pdbx_file = BinaryCIFFile()
    >>> set_structure(pdbx_file, atom_array_stack)
    >>> # Write uncompressed file
    >>> uncompressed_file = BytesIO()
    >>> pdbx_file.write(uncompressed_file)
    >>> _ = uncompressed_file.seek(0)
    >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
    927 KB
    >>> # Write compressed file
    >>> pdbx_file = compress(pdbx_file)
    >>> compressed_file = BytesIO()
    >>> pdbx_file.write(compressed_file)
    >>> _ = compressed_file.seek(0)
    >>> print(f"{len(compressed_file.read()) // 1000} KB")
    111 KB
    """
    # Dispatch on the exact type of the input,
    # analogous to matching `type(data)` against each class
    handlers = {
        bcif.BinaryCIFFile: _compress_file,
        bcif.BinaryCIFBlock: _compress_block,
        bcif.BinaryCIFCategory: _compress_category,
        bcif.BinaryCIFColumn: _compress_column,
        bcif.BinaryCIFData: _compress_data,
    }
    handler = handlers.get(type(data))
    if handler is None:
        raise TypeError(f"Unsupported type {type(data).__name__}")
    return handler(data, float_tolerance)
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _compress_file(bcif_file, float_tolerance):
    """
    Compress each block of the given :class:`BinaryCIFFile` and assemble
    the results into a new file.
    """
    compressed = bcif.BinaryCIFFile()
    for name, block in bcif_file.items():
        compressed[name] = _compress_block(block, float_tolerance)
    return compressed
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _compress_block(bcif_block, float_tolerance):
    """
    Compress each category of the given :class:`BinaryCIFBlock` and assemble
    the results into a new block.
    """
    compressed = bcif.BinaryCIFBlock()
    for name, category in bcif_block.items():
        compressed[name] = _compress_category(category, float_tolerance)
    return compressed
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _compress_category(bcif_category, float_tolerance):
    """
    Compress each column of the given :class:`BinaryCIFCategory` and assemble
    the results into a new category.
    """
    compressed = bcif.BinaryCIFCategory()
    for name, column in bcif_category.items():
        compressed[name] = _compress_column(column, float_tolerance)
    return compressed
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _compress_column(bcif_column, float_tolerance):
    """
    Compress the data array of the given :class:`BinaryCIFColumn` and,
    if present, its mask array as well.
    """
    mask = bcif_column.mask
    if mask is not None:
        mask = _compress_data(mask, float_tolerance)
    compressed_data = _compress_data(bcif_column.data, float_tolerance)
    return bcif.BinaryCIFColumn(compressed_data, mask)
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _compress_data(bcif_data, float_tolerance):
    """
    Find a size-reducing encoding chain for the array of the given
    :class:`BinaryCIFData` and return a new :class:`BinaryCIFData` using it.
    """
    values = bcif_data.array
    if len(values) == 1:
        # A single value cannot profit from compression
        # -> use the default uncompressed encoding
        return bcif.BinaryCIFData(values)

    dtype = values.dtype
    if np.issubdtype(dtype, np.str_):
        # The best encodings for the index and offset arrays are only known
        # after encoding, hence they start out empty and are set afterwards
        string_encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
        # Encoding initializes the unique strings as a side effect
        index_array = string_encoding.encode(values)
        offset_array = np.cumsum([0] + [len(s) for s in string_encoding.strings])
        string_encoding.data_encoding, _ = _find_best_integer_compression(index_array)
        string_encoding.offset_encoding, _ = _find_best_integer_compression(
            offset_array
        )
        return bcif.BinaryCIFData(values, [string_encoding])

    if np.issubdtype(dtype, np.floating):
        # Floats are compressed by conversion to fixed-point integers first
        fixed_point_encoding = FixedPointEncoding(
            10 ** _get_decimal_places(values, float_tolerance)
        )
        integer_values = fixed_point_encoding.encode(values)
        integer_encodings, compressed_size = _find_best_integer_compression(
            integer_values
        )
        if compressed_size < _data_size_in_file(bcif.BinaryCIFData(values)):
            return bcif.BinaryCIFData(
                values, [fixed_point_encoding] + integer_encodings
            )
        # Fixed-point compression did not pay off
        # -> store the raw float bytes instead
        return bcif.BinaryCIFData(values, [ByteArrayEncoding()])

    if np.issubdtype(dtype, np.integer):
        values = _to_smallest_integer_type(values)
        encodings, _ = _find_best_integer_compression(values)
        return bcif.BinaryCIFData(values, encodings)

    raise TypeError(f"Unsupported data type {values.dtype}")
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _find_best_integer_compression(array):
    """
    Try different data encodings on an integer array and return the one that results in
    the smallest size.

    Parameters
    ----------
    array : numpy.ndarray, dtype=int
        The integer array to find an encoding chain for.

    Returns
    -------
    encodings : list of Encoding
        The encoding sequence (always ending with a :class:`ByteArrayEncoding`)
        that produced the smallest serialized size.
    size : int
        The serialized size in bytes achieved with that sequence.
    """
    best_encoding_sequence = None
    smallest_size = np.inf

    # Evaluate every combination of
    # (delta encoding?, run-length encoding?, integer packing size),
    # reusing the partially encoded array from each outer step
    for use_delta in [False, True]:
        if use_delta:
            encoding = DeltaEncoding()
            array_after_delta = encoding.encode(array)
            encodings_after_delta = [encoding]
        else:
            encodings_after_delta = []
            array_after_delta = array
        for use_run_length in [False, True]:
            # Use encoded data from previous step to save time
            if use_run_length:
                encoding = RunLengthEncoding()
                array_after_rle = encoding.encode(array_after_delta)
                encodings_after_rle = encodings_after_delta + [encoding]
            else:
                encodings_after_rle = encodings_after_delta
                array_after_rle = array_after_delta
            for packed_byte_count in [None, 1, 2]:
                if packed_byte_count is not None:
                    # Quickly check this heuristic
                    # to avoid computing an exploding packed data array
                    if (
                        _estimate_packed_length(array_after_rle, packed_byte_count)
                        >= array_after_rle.nbytes
                    ):
                        # Packing would not reduce the size
                        continue
                    encoding = IntegerPackingEncoding(packed_byte_count)
                    array_after_packing = encoding.encode(array_after_rle)
                    encodings_after_packing = encodings_after_rle + [encoding]
                else:
                    encodings_after_packing = encodings_after_rle
                    array_after_packing = array_after_rle
                # Every chain ends with serialization of the array into bytes
                encoding = ByteArrayEncoding()
                encoded_array = encoding.encode(array_after_packing)
                encodings = encodings_after_packing + [encoding]
                # Pack data directly instead of using the BinaryCIFData class
                # to avoid the unnecessary re-encoding of the array,
                # as it is already available in 'encoded_array'
                serialized_encoding = [enc.serialize() for enc in encodings]
                serialized_data = {
                    "data": encoded_array,
                    "encoding": serialized_encoding,
                }
                size = _data_size_in_file(serialized_data)
                # Strictly smaller: on ties the earlier (simpler) chain wins
                if size < smallest_size:
                    best_encoding_sequence = encodings
                    smallest_size = size
    return best_encoding_sequence, smallest_size
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _estimate_packed_length(array, packed_byte_count):
|
|
206
|
+
"""
|
|
207
|
+
Estimate the length of an integer array after packing it with a given number of
|
|
208
|
+
bytes.
|
|
209
|
+
|
|
210
|
+
Parameters
|
|
211
|
+
----------
|
|
212
|
+
array : numpy.ndarray
|
|
213
|
+
The array to pack.
|
|
214
|
+
packed_byte_count : int
|
|
215
|
+
The number of bytes used for packing.
|
|
216
|
+
|
|
217
|
+
Returns
|
|
218
|
+
-------
|
|
219
|
+
length : int
|
|
220
|
+
The estimated length of the packed array.
|
|
221
|
+
"""
|
|
222
|
+
# Use int64 to avoid integer overflow in the following line
|
|
223
|
+
max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
|
|
224
|
+
n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
|
|
225
|
+
return np.sum(n_bytes_per_element, dtype=np.int64)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _to_smallest_integer_type(array):
|
|
229
|
+
"""
|
|
230
|
+
Convert an integer array to the smallest possible integer type, that is still able
|
|
231
|
+
to represent all values in the array.
|
|
232
|
+
|
|
233
|
+
Parameters
|
|
234
|
+
----------
|
|
235
|
+
array : numpy.ndarray
|
|
236
|
+
The array to convert.
|
|
237
|
+
|
|
238
|
+
Returns
|
|
239
|
+
-------
|
|
240
|
+
array : numpy.ndarray
|
|
241
|
+
The converted array.
|
|
242
|
+
"""
|
|
243
|
+
if array.min() >= 0:
|
|
244
|
+
for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
|
|
245
|
+
if np.all(array <= np.iinfo(dtype).max):
|
|
246
|
+
return array.astype(dtype)
|
|
247
|
+
for dtype in [np.int8, np.int16, np.int32, np.int64]:
|
|
248
|
+
if np.all(array >= np.iinfo(dtype).min) and np.all(
|
|
249
|
+
array <= np.iinfo(dtype).max
|
|
250
|
+
):
|
|
251
|
+
return array.astype(dtype)
|
|
252
|
+
raise ValueError("Array is out of bounds for all integer types")
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _data_size_in_file(data):
    """
    Get the size the given data would have when written into a *BinaryCIF* file.

    Parameters
    ----------
    data : BinaryCIFData or dict
        The data array whose size is measured.
        Can be either a :class:`BinaryCIFData` object or already serialized data.

    Returns
    -------
    size : int
        The size of the data array in the file in bytes.
    """
    serialized = data.serialize() if isinstance(data, bcif.BinaryCIFData) else data
    # The file stores the data in MessagePack format
    return len(msgpack.packb(serialized, use_bin_type=True, default=encode_numpy))
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _get_decimal_places(array, tol):
    """
    Get the number of decimal places required to represent a floating point
    array within the given relative tolerance.

    Parameters
    ----------
    array : numpy.ndarray
        The array to analyze.
    tol : float
        The relative tolerance allowed when the values are cut off after the
        returned number of decimal places.

    Returns
    -------
    decimals : int
        The number of decimal places.
    """
    # Decimals of NaN or infinite values do not make sense
    # and 0 would give NaN when rounding on decimals
    relevant_values = array[np.isfinite(array) & (array != 0)]
    # Start at the leading digit of the largest value
    # and add decimal places until the rounding error is tolerable
    decimals = -_order_magnitude(relevant_values)
    while True:
        rounding_error = np.abs(np.round(relevant_values, decimals) - relevant_values)
        if np.all(rounding_error < tol * np.abs(relevant_values)):
            return decimals
        decimals += 1
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _order_magnitude(array):
|
|
303
|
+
"""
|
|
304
|
+
Get the order of magnitude of floating point values.
|
|
305
|
+
|
|
306
|
+
Parameters
|
|
307
|
+
----------
|
|
308
|
+
array : ndarray, dtype=float
|
|
309
|
+
The value to analyze.
|
|
310
|
+
|
|
311
|
+
Returns
|
|
312
|
+
-------
|
|
313
|
+
magnitude : int
|
|
314
|
+
The order of magnitude, i.e. the maximum exponent a number in the array would
|
|
315
|
+
have in scientific notation, if only one digit is left of the decimal point.
|
|
316
|
+
"""
|
|
317
|
+
array = array[array != 0]
|
|
318
|
+
if len(array) == 0:
|
|
319
|
+
# No non-zero values -> define order of magnitude as 0
|
|
320
|
+
return 0
|
|
321
|
+
return int(np.max(np.floor(np.log10(np.abs(array)))).item())
|