biotite 1.0.1__cp310-cp310-win_amd64.whl → 1.1.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +34 -0
- biotite/application/muscle/app3.py +2 -15
- biotite/application/muscle/app5.py +2 -2
- biotite/application/util.py +1 -1
- biotite/application/viennarna/rnaplot.py +6 -2
- biotite/database/rcsb/query.py +6 -6
- biotite/database/uniprot/check.py +20 -15
- biotite/database/uniprot/download.py +1 -1
- biotite/database/uniprot/query.py +1 -1
- biotite/sequence/align/alignment.py +16 -3
- biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +5 -5
- biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +17 -0
- biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +52 -42
- biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/matrix.py +273 -55
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
- biotite/sequence/alphabet.py +3 -0
- biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
- biotite/sequence/profile.py +86 -4
- biotite/sequence/seqtypes.py +124 -3
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +4 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +110 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +171 -0
- biotite/structure/alphabet/unkerasify.py +122 -0
- biotite/structure/atoms.py +129 -40
- biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +72 -21
- biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
- biotite/structure/charges.cp310-win_amd64.pyd +0 -0
- biotite/structure/geometry.py +60 -113
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +13 -13
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -32
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +63 -17
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -21
- biotite/structure/info/standardize.py +3 -2
- biotite/structure/io/mol/sdf.py +41 -40
- biotite/structure/io/pdb/convert.py +2 -0
- biotite/structure/io/pdb/file.py +74 -3
- biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +32 -8
- biotite/structure/io/pdbx/cif.py +72 -59
- biotite/structure/io/pdbx/component.py +9 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +194 -48
- biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/molecules.py +141 -141
- biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
- biotite/structure/segments.py +1 -2
- biotite/structure/util.py +73 -1
- biotite/version.py +2 -2
- {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/METADATA +3 -1
- {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/RECORD +86 -76
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/structure/io/pdbx/cif.py
CHANGED
|
@@ -7,7 +7,6 @@ __author__ = "Patrick Kunzmann"
|
|
|
7
7
|
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
|
|
8
8
|
|
|
9
9
|
import itertools
|
|
10
|
-
import re
|
|
11
10
|
from collections.abc import MutableMapping, Sequence
|
|
12
11
|
import numpy as np
|
|
13
12
|
from biotite.file import (
|
|
@@ -357,7 +356,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
357
356
|
return CIFBlock
|
|
358
357
|
|
|
359
358
|
@staticmethod
|
|
360
|
-
def deserialize(text
|
|
359
|
+
def deserialize(text):
|
|
361
360
|
lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
|
|
362
361
|
|
|
363
362
|
if _is_loop_start(lines[0]):
|
|
@@ -372,7 +371,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
372
371
|
|
|
373
372
|
lines = _to_single(lines)
|
|
374
373
|
if is_looped:
|
|
375
|
-
category_dict = CIFCategory._deserialize_looped(lines
|
|
374
|
+
category_dict = CIFCategory._deserialize_looped(lines)
|
|
376
375
|
else:
|
|
377
376
|
category_dict = CIFCategory._deserialize_single(lines)
|
|
378
377
|
return CIFCategory(category_dict, category_name)
|
|
@@ -416,6 +415,9 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
416
415
|
raise ValueError("At least one column must remain")
|
|
417
416
|
del self._columns[key]
|
|
418
417
|
|
|
418
|
+
def __contains__(self, key):
|
|
419
|
+
return key in self._columns
|
|
420
|
+
|
|
419
421
|
def __iter__(self):
|
|
420
422
|
return iter(self._columns)
|
|
421
423
|
|
|
@@ -442,7 +444,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
442
444
|
line_i = 0
|
|
443
445
|
while line_i < len(lines):
|
|
444
446
|
line = lines[line_i]
|
|
445
|
-
parts = _split_one_line(line)
|
|
447
|
+
parts = list(_split_one_line(line))
|
|
446
448
|
if len(parts) == 2:
|
|
447
449
|
# Standard case -> name and value in one line
|
|
448
450
|
name_part, value_part = parts
|
|
@@ -450,7 +452,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
450
452
|
elif len(parts) == 1:
|
|
451
453
|
# Value is a multiline value on the next line
|
|
452
454
|
name_part = parts[0]
|
|
453
|
-
parts = _split_one_line(lines[line_i + 1])
|
|
455
|
+
parts = list(_split_one_line(lines[line_i + 1]))
|
|
454
456
|
if len(parts) == 1:
|
|
455
457
|
value_part = parts[0]
|
|
456
458
|
else:
|
|
@@ -464,7 +466,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
464
466
|
return category_dict
|
|
465
467
|
|
|
466
468
|
@staticmethod
|
|
467
|
-
def _deserialize_looped(lines
|
|
469
|
+
def _deserialize_looped(lines):
|
|
468
470
|
"""
|
|
469
471
|
Process a category where each field has multiple values
|
|
470
472
|
(category is a table).
|
|
@@ -487,20 +489,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
487
489
|
# row-line-alignment at all and simply cycle through columns
|
|
488
490
|
column_indices = itertools.cycle(range(len(column_names)))
|
|
489
491
|
for data_line in data_lines:
|
|
490
|
-
|
|
491
|
-
# use regex-based _split_one_line() to split
|
|
492
|
-
# Otherwise use much more faster whitespace split
|
|
493
|
-
# and quote removal if applicable.
|
|
494
|
-
if expect_whitespace:
|
|
495
|
-
values = _split_one_line(data_line)
|
|
496
|
-
else:
|
|
497
|
-
values = data_line.split()
|
|
498
|
-
for k in range(len(values)):
|
|
499
|
-
# Remove quotes
|
|
500
|
-
if (values[k][0] == '"' and values[k][-1] == '"') or (
|
|
501
|
-
values[k][0] == "'" and values[k][-1] == "'"
|
|
502
|
-
):
|
|
503
|
-
values[k] = values[k][1:-1]
|
|
492
|
+
values = _split_one_line(data_line)
|
|
504
493
|
for val in values:
|
|
505
494
|
column_index = next(column_indices)
|
|
506
495
|
column_name = column_names[column_index]
|
|
@@ -569,6 +558,17 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
569
558
|
The keys are the category names and the values are the
|
|
570
559
|
:class:`CIFCategory` objects.
|
|
571
560
|
By default, an empty block is created.
|
|
561
|
+
name : str, optional
|
|
562
|
+
The name of the block.
|
|
563
|
+
This is only used for serialization and is automatically set,
|
|
564
|
+
when the :class:`CIFBlock` is added to a :class:`CIFFile`.
|
|
565
|
+
It only needs to be set manually, when the block is directly
|
|
566
|
+
serialized.
|
|
567
|
+
|
|
568
|
+
Attributes
|
|
569
|
+
----------
|
|
570
|
+
name : str
|
|
571
|
+
The name of the block.
|
|
572
572
|
|
|
573
573
|
Notes
|
|
574
574
|
-----
|
|
@@ -580,13 +580,15 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
580
580
|
--------
|
|
581
581
|
|
|
582
582
|
>>> # Add category on creation
|
|
583
|
-
>>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
|
|
583
|
+
>>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
|
|
584
584
|
>>> # Add category later on
|
|
585
585
|
>>> block["bar"] = CIFCategory({"another_column": [2, 3]})
|
|
586
586
|
>>> # Access a column
|
|
587
587
|
>>> print(block["bar"]["another_column"].as_array())
|
|
588
588
|
['2' '3']
|
|
589
589
|
>>> print(block.serialize())
|
|
590
|
+
data_baz
|
|
591
|
+
#
|
|
590
592
|
_foo.some_column 1
|
|
591
593
|
#
|
|
592
594
|
loop_
|
|
@@ -596,11 +598,20 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
596
598
|
#
|
|
597
599
|
"""
|
|
598
600
|
|
|
599
|
-
def __init__(self, categories=None):
|
|
601
|
+
def __init__(self, categories=None, name=None):
|
|
602
|
+
self._name = name
|
|
600
603
|
if categories is None:
|
|
601
604
|
categories = {}
|
|
602
605
|
self._categories = categories
|
|
603
606
|
|
|
607
|
+
@property
|
|
608
|
+
def name(self):
|
|
609
|
+
return self._name
|
|
610
|
+
|
|
611
|
+
@name.setter
|
|
612
|
+
def name(self, name):
|
|
613
|
+
self._name = name
|
|
614
|
+
|
|
604
615
|
@staticmethod
|
|
605
616
|
def subcomponent_class():
|
|
606
617
|
return CIFCategory
|
|
@@ -634,7 +645,10 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
634
645
|
return CIFBlock(_create_element_dict(lines, category_names, category_starts))
|
|
635
646
|
|
|
636
647
|
def serialize(self):
|
|
637
|
-
|
|
648
|
+
if self._name is None:
|
|
649
|
+
raise SerializationError("Block name is required")
|
|
650
|
+
# The block starts with the block name line followed by a comment line
|
|
651
|
+
text_blocks = ["data_" + self._name + "\n#\n"]
|
|
638
652
|
for category_name, category in self._categories.items():
|
|
639
653
|
if isinstance(category, str):
|
|
640
654
|
# Category is already stored as lines
|
|
@@ -657,15 +671,7 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
657
671
|
# Element is stored in serialized form
|
|
658
672
|
# -> must be deserialized first
|
|
659
673
|
try:
|
|
660
|
-
|
|
661
|
-
# Even if the values are quote protected,
|
|
662
|
-
# no whitespace is expected in escaped values
|
|
663
|
-
# Therefore slow regex-based _split_one_line() call is not necessary
|
|
664
|
-
if key == "atom_site":
|
|
665
|
-
expect_whitespace = False
|
|
666
|
-
else:
|
|
667
|
-
expect_whitespace = True
|
|
668
|
-
category = CIFCategory.deserialize(category, expect_whitespace)
|
|
674
|
+
category = CIFCategory.deserialize(category)
|
|
669
675
|
except Exception:
|
|
670
676
|
raise DeserializationError(f"Failed to deserialize category '{key}'")
|
|
671
677
|
# Update with deserialized object
|
|
@@ -683,6 +689,9 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
683
689
|
def __delitem__(self, key):
|
|
684
690
|
del self._categories[key]
|
|
685
691
|
|
|
692
|
+
def __contains__(self, key):
|
|
693
|
+
return key in self._categories
|
|
694
|
+
|
|
686
695
|
def __iter__(self):
|
|
687
696
|
return iter(self._categories)
|
|
688
697
|
|
|
@@ -806,14 +815,12 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
806
815
|
def serialize(self):
|
|
807
816
|
text_blocks = []
|
|
808
817
|
for block_name, block in self._blocks.items():
|
|
809
|
-
text_blocks.append("data_" + block_name + "\n")
|
|
810
|
-
# A comment line is set after the block indicator
|
|
811
|
-
text_blocks.append("#\n")
|
|
812
818
|
if isinstance(block, str):
|
|
813
819
|
# Block is already stored as text
|
|
814
820
|
text_blocks.append(block)
|
|
815
821
|
else:
|
|
816
822
|
try:
|
|
823
|
+
block.name = block_name
|
|
817
824
|
text_blocks.append(block.serialize())
|
|
818
825
|
except Exception:
|
|
819
826
|
raise SerializationError(
|
|
@@ -884,11 +891,15 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
884
891
|
def __setitem__(self, key, block):
|
|
885
892
|
if not isinstance(block, CIFBlock):
|
|
886
893
|
raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
|
|
894
|
+
block.name = key
|
|
887
895
|
self._blocks[key] = block
|
|
888
896
|
|
|
889
897
|
def __delitem__(self, key):
|
|
890
898
|
del self._blocks[key]
|
|
891
899
|
|
|
900
|
+
def __contains__(self, key):
|
|
901
|
+
return key in self._blocks
|
|
902
|
+
|
|
892
903
|
def __iter__(self):
|
|
893
904
|
return iter(self._blocks)
|
|
894
905
|
|
|
@@ -921,7 +932,7 @@ def _create_element_dict(lines, element_names, element_starts):
|
|
|
921
932
|
# Lazy deserialization
|
|
922
933
|
# -> keep as text for now and deserialize later if needed
|
|
923
934
|
return {
|
|
924
|
-
element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
|
|
935
|
+
element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) + "\n"
|
|
925
936
|
for i, element_name in enumerate(element_names)
|
|
926
937
|
}
|
|
927
938
|
|
|
@@ -1029,29 +1040,31 @@ def _split_one_line(line):
|
|
|
1029
1040
|
"""
|
|
1030
1041
|
# Special case of multiline value, where the line starts with ';'
|
|
1031
1042
|
if line[0] == ";":
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1043
|
+
yield line[1:]
|
|
1044
|
+
elif "'" in line or '"' in line:
|
|
1045
|
+
# Quoted values in the line
|
|
1046
|
+
while line:
|
|
1047
|
+
# Strip leading whitespace(s)
|
|
1048
|
+
stripped_line = line.lstrip()
|
|
1049
|
+
# Split the line on whitespace
|
|
1050
|
+
word, _, line = stripped_line.partition(" ")
|
|
1051
|
+
# Handle the case where the word starts with a quote
|
|
1052
|
+
if word.startswith(("'", '"')):
|
|
1053
|
+
# Set the separator to the quote found
|
|
1054
|
+
separator = word[0]
|
|
1055
|
+
# Handle the case of a quoted word without space
|
|
1056
|
+
if word.endswith(separator) and len(word) > 1:
|
|
1057
|
+
# Yield the word without the opening and closing quotes
|
|
1058
|
+
yield word[1:-1]
|
|
1059
|
+
continue
|
|
1060
|
+
# split the word on the separator
|
|
1061
|
+
word, _, line = stripped_line[1:].partition(separator)
|
|
1062
|
+
|
|
1063
|
+
yield word
|
|
1064
|
+
else:
|
|
1065
|
+
# No quoted values in the line -> simple whitespace split
|
|
1066
|
+
for line in line.split():
|
|
1067
|
+
yield line
|
|
1055
1068
|
|
|
1056
1069
|
|
|
1057
1070
|
def _arrayfy(data):
|
|
@@ -171,10 +171,10 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
|
|
|
171
171
|
Parameters
|
|
172
172
|
----------
|
|
173
173
|
store_key_in: str, optional
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
174
|
+
If given, the key of each element is stored as value in the
|
|
175
|
+
serialized element.
|
|
176
|
+
This is basically the reverse operation of `take_key_from` in
|
|
177
|
+
:meth:`_deserialize_elements()`.
|
|
178
178
|
"""
|
|
179
179
|
serialized_elements = []
|
|
180
180
|
for key, element in self._elements.items():
|
|
@@ -223,6 +223,11 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
|
|
|
223
223
|
def __delitem__(self, key):
|
|
224
224
|
del self._elements[key]
|
|
225
225
|
|
|
226
|
+
# Implement `__contains__()` explicitly,
|
|
227
|
+
# because the mixin method unnecessarily deserializes the value, if available
|
|
228
|
+
def __contains__(self, key):
|
|
229
|
+
return key in self._elements
|
|
230
|
+
|
|
226
231
|
def __iter__(self):
|
|
227
232
|
return iter(self._elements)
|
|
228
233
|
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
__all__ = ["compress"]
|
|
2
|
+
__name__ = "biotite.structure.io.pdbx"
|
|
3
|
+
__author__ = "Patrick Kunzmann"
|
|
4
|
+
|
|
5
|
+
import itertools
|
|
6
|
+
import msgpack
|
|
7
|
+
import numpy as np
|
|
8
|
+
import biotite.structure.io.pdbx.bcif as bcif
|
|
9
|
+
from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
|
|
10
|
+
from biotite.structure.io.pdbx.encoding import (
|
|
11
|
+
ByteArrayEncoding,
|
|
12
|
+
DeltaEncoding,
|
|
13
|
+
FixedPointEncoding,
|
|
14
|
+
IntegerPackingEncoding,
|
|
15
|
+
RunLengthEncoding,
|
|
16
|
+
StringArrayEncoding,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compress(data, float_tolerance=1e-6):
    """
    Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
    different data encodings for each data array and selecting the one that results in
    the smallest size.

    Parameters
    ----------
    data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
        The data to compress.
        Instances of subclasses of these types are accepted as well.
    float_tolerance : float, optional
        The relative error that is accepted when compressing floating point numbers.

    Returns
    -------
    compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
        The compressed data with the same type as the input data.
        If no improved compression is found for a :class:`BinaryCIFData` array,
        the input data is kept.
        Hence, the return value is no deep copy of the input data.

    Raises
    ------
    TypeError
        If `data` is none of the supported *BinaryCIF* components.

    Examples
    --------

    >>> from io import BytesIO
    >>> pdbx_file = BinaryCIFFile()
    >>> set_structure(pdbx_file, atom_array_stack)
    >>> # Write uncompressed file
    >>> uncompressed_file = BytesIO()
    >>> pdbx_file.write(uncompressed_file)
    >>> _ = uncompressed_file.seek(0)
    >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
    927 KB
    >>> # Write compressed file
    >>> pdbx_file = compress(pdbx_file)
    >>> compressed_file = BytesIO()
    >>> pdbx_file.write(compressed_file)
    >>> _ = compressed_file.seek(0)
    >>> print(f"{len(compressed_file.read()) // 1000} KB")
    111 KB
    """
    # Dispatch via isinstance() instead of an exact type comparison,
    # so that subclasses of the BinaryCIF components are handled as well
    if isinstance(data, bcif.BinaryCIFFile):
        return _compress_file(data, float_tolerance)
    if isinstance(data, bcif.BinaryCIFBlock):
        return _compress_block(data, float_tolerance)
    if isinstance(data, bcif.BinaryCIFCategory):
        return _compress_category(data, float_tolerance)
    if isinstance(data, bcif.BinaryCIFColumn):
        return _compress_column(data, float_tolerance)
    if isinstance(data, bcif.BinaryCIFData):
        return _compress_data(data, float_tolerance)
    raise TypeError(f"Unsupported type {type(data).__name__}")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _compress_file(bcif_file, float_tolerance):
    """
    Compress every block of a *BinaryCIF* file.

    Parameters
    ----------
    bcif_file : BinaryCIFFile
        The file whose blocks are compressed.
    float_tolerance : float
        The relative error accepted for floating point values.

    Returns
    -------
    compressed_file : BinaryCIFFile
        A new file holding the compressed blocks under their original names.
    """
    result = bcif.BinaryCIFFile()
    for name, block in bcif_file.items():
        result[name] = _compress_block(block, float_tolerance)
    return result
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _compress_block(bcif_block, float_tolerance):
    """
    Compress every category of a *BinaryCIF* block.

    Parameters
    ----------
    bcif_block : BinaryCIFBlock
        The block whose categories are compressed.
    float_tolerance : float
        The relative error accepted for floating point values.

    Returns
    -------
    compressed_block : BinaryCIFBlock
        A new block holding the compressed categories under their original names.
    """
    result = bcif.BinaryCIFBlock()
    for name, category in bcif_block.items():
        result[name] = _compress_category(category, float_tolerance)
    return result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _compress_category(bcif_category, float_tolerance):
    """
    Compress every column of a *BinaryCIF* category.

    Parameters
    ----------
    bcif_category : BinaryCIFCategory
        The category whose columns are compressed.
    float_tolerance : float
        The relative error accepted for floating point values.

    Returns
    -------
    compressed_category : BinaryCIFCategory
        A new category holding the compressed columns under their original names.
    """
    result = bcif.BinaryCIFCategory()
    for name, column in bcif_category.items():
        result[name] = _compress_column(column, float_tolerance)
    return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _compress_column(bcif_column, float_tolerance):
    """
    Compress the data array and, if present, the mask array of a column.

    Parameters
    ----------
    bcif_column : BinaryCIFColumn
        The column to compress.
    float_tolerance : float
        The relative error accepted for floating point values.

    Returns
    -------
    compressed_column : BinaryCIFColumn
        A new column built from the compressed data and mask.
    """
    compressed_values = _compress_data(bcif_column.data, float_tolerance)
    # A column without mask stays without mask
    compressed_mask = (
        None
        if bcif_column.mask is None
        else _compress_data(bcif_column.mask, float_tolerance)
    )
    return bcif.BinaryCIFColumn(compressed_values, compressed_mask)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _compress_data(bcif_data, float_tolerance):
    """
    Select a compact encoding sequence for a single data array.

    Parameters
    ----------
    bcif_data : BinaryCIFData
        The data array to compress.
    float_tolerance : float
        The relative error that is accepted when floating point values are
        converted into fixed point integers.

    Returns
    -------
    compressed_data : BinaryCIFData
        The same array associated with the selected encoding sequence.

    Raises
    ------
    TypeError
        If the array contains neither strings, floating point numbers nor integers.
    """
    array = bcif_data.array
    if len(array) <= 1:
        # Nothing to gain for an empty array or a single value
        # -> Use default uncompressed encoding
        # (an empty array would also break the min/max based integer type search)
        return bcif.BinaryCIFData(array)

    if np.issubdtype(array.dtype, np.str_):
        # Leave encoding empty for now, as it is explicitly set later
        encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
        # Run encode to initialize the data and offset arrays
        indices = encoding.encode(array)
        offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
        encoding.data_encoding, _ = _find_best_integer_compression(indices)
        encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
        return bcif.BinaryCIFData(array, [encoding])

    elif np.issubdtype(array.dtype, np.floating):
        if not np.isfinite(array).all():
            # NaN and infinity have no integer representation
            # -> fixed point encoding is not applicable, store the plain bytes
            return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
        to_integer_encoding = FixedPointEncoding(
            10 ** _get_decimal_places(array, float_tolerance)
        )
        integer_array = to_integer_encoding.encode(array)
        best_encoding, size_compressed = _find_best_integer_compression(integer_array)
        if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
            return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
        else:
            # The float array is smaller -> encode it directly as bytes
            return bcif.BinaryCIFData(array, [ByteArrayEncoding()])

    elif np.issubdtype(array.dtype, np.integer):
        array = _to_smallest_integer_type(array)
        encodings, _ = _find_best_integer_compression(array)
        return bcif.BinaryCIFData(array, encodings)

    else:
        raise TypeError(f"Unsupported data type {array.dtype}")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _find_best_integer_compression(array):
    """
    Search for the encoding sequence that serializes an integer array most compactly.

    Every combination of optional delta encoding, optional run-length encoding and
    optional integer packing (into 1 or 2 bytes) is evaluated.
    Each candidate sequence is terminated by a byte array encoding.

    Parameters
    ----------
    array : numpy.ndarray
        The integer array to find the encoding sequence for.

    Returns
    -------
    best_encoding_sequence : list
        The encoding objects of the candidate with the smallest serialized size.
        On ties the candidate that was evaluated first is kept.
    smallest_size : int
        The serialized size in bytes obtained with that sequence.
    """
    winner = None
    winner_size = np.inf

    for with_delta in (False, True):
        if with_delta:
            delta = DeltaEncoding()
            delta_values = delta.encode(array)
            delta_chain = [delta]
        else:
            delta_values = array
            delta_chain = []

        for with_run_length in (False, True):
            # The result of the previous stage is reused to save time
            if with_run_length:
                run_length = RunLengthEncoding()
                rle_values = run_length.encode(delta_values)
                rle_chain = delta_chain + [run_length]
            else:
                rle_values = delta_values
                rle_chain = delta_chain

            for byte_count in (None, 1, 2):
                if byte_count is None:
                    packed_values = rle_values
                    packed_chain = rle_chain
                else:
                    # Cheap heuristic that avoids computing
                    # an exploding packed data array
                    estimate = _estimate_packed_length(rle_values, byte_count)
                    if estimate >= rle_values.nbytes:
                        # Packing would not reduce the size
                        continue
                    packing = IntegerPackingEncoding(byte_count)
                    packed_values = packing.encode(rle_values)
                    packed_chain = rle_chain + [packing]

                to_bytes = ByteArrayEncoding()
                raw_bytes = to_bytes.encode(packed_values)
                candidate = packed_chain + [to_bytes]
                # The serialized form is assembled by hand instead of via
                # BinaryCIFData, as the encoded array is already at hand
                # and would otherwise be encoded a second time
                size = _data_size_in_file(
                    {
                        "data": raw_bytes,
                        "encoding": [enc.serialize() for enc in candidate],
                    }
                )
                if size < winner_size:
                    winner = candidate
                    winner_size = size
    return winner, winner_size
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _estimate_packed_length(array, packed_byte_count):
    """
    Estimate the length of an integer array after packing it with a given number of
    bytes.

    The estimate depends only on the magnitude of each value, i.e. a negative value
    is counted like the positive value of the same magnitude.

    Parameters
    ----------
    array : numpy.ndarray
        The array to pack.
    packed_byte_count : int
        The number of bytes used for packing.

    Returns
    -------
    length : int
        The estimated length of the packed array.
    """
    # Use int64 to avoid integer overflow in the following lines
    max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
    quotient = array // max_val_per_element
    # Floor division rounds negative quotients away from zero, which would count
    # e.g. -1 as two packed elements, while 1 is counted as a single one
    # -> move inexact negative quotients one step back towards zero
    quotient = quotient + ((array < 0) & (array % max_val_per_element != 0))
    n_bytes_per_element = packed_byte_count * (np.abs(quotient) + 1)
    return np.sum(n_bytes_per_element, dtype=np.int64)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _to_smallest_integer_type(array):
    """
    Convert an integer array to the smallest possible integer type that is still able
    to represent all values in the array.

    Unsigned types are preferred, if the array contains no negative value.

    Parameters
    ----------
    array : numpy.ndarray
        The array to convert.

    Returns
    -------
    array : numpy.ndarray
        The converted array.
        An empty array is converted to ``uint8``.

    Raises
    ------
    ValueError
        If no integer type is able to represent all values.
    """
    if array.size == 0:
        # min() and max() are undefined for an empty array
        # and every type is able to represent 'no values'
        return array.astype(np.uint8)

    # Compute the value range only once as Python integers,
    # so the comparison with the type limits cannot overflow
    lowest = int(array.min())
    highest = int(array.max())

    if lowest >= 0:
        for dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
            if highest <= np.iinfo(dtype).max:
                return array.astype(dtype)
    for dtype in (np.int8, np.int16, np.int32, np.int64):
        limits = np.iinfo(dtype)
        if limits.min <= lowest and highest <= limits.max:
            return array.astype(dtype)
    raise ValueError("Array is out of bounds for all integer types")
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _data_size_in_file(data):
    """
    Get the size the data would have when written into a *BinaryCIF* file.

    Parameters
    ----------
    data : BinaryCIFData or dict
        The data array whose size is measured.
        Can be either a :class:`BinaryCIFData` object or already serialized data.

    Returns
    -------
    size : int
        The size of the data array in the file in bytes.
    """
    # Already serialized data is measured as it is
    serialized = data.serialize() if isinstance(data, bcif.BinaryCIFData) else data
    return len(msgpack.packb(serialized, use_bin_type=True, default=encode_numpy))
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _get_decimal_places(array, tol):
    """
    Get the number of decimal places in a floating point array.

    Parameters
    ----------
    array : numpy.ndarray
        The array to analyze.
    tol : float
        The relative tolerance allowed when the values are cut off after the returned
        number of decimal places.
        Must be positive.

    Returns
    -------
    decimals : int
        The number of decimal places.
        The value may be negative, if the values can be rounded left of the
        decimal point without exceeding the tolerance.

    Raises
    ------
    ValueError
        If `tol` is not positive or if the tolerance cannot be reached within the
        representable number of decimal places.
    """
    if not tol > 0:
        # The error would have to be smaller than zero
        # -> the search below could never succeed
        raise ValueError("The tolerance must be positive")
    # Decimals of NaN or infinite values do not make sense
    # and 0 would give NaN when rounding on decimals
    array = array[np.isfinite(array) & (array != 0)]
    # Beyond this limit '10**decimals' is not representable as double anymore
    # -> rounding cannot become more accurate, so the search is stopped
    max_decimals = 308
    for decimals in range(-_order_magnitude(array), max_decimals + 1):
        error = np.abs(np.round(array, decimals) - array)
        if np.all(error < tol * np.abs(array)):
            return decimals
    raise ValueError(
        f"The tolerance {tol} cannot be reached with the precision of the array"
    )
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _order_magnitude(array):
    """
    Get the order of magnitude of floating point values.

    Parameters
    ----------
    array : ndarray, dtype=float
        The values to analyze.
        Zeros as well as NaN and infinite values are ignored.

    Returns
    -------
    magnitude : int
        The order of magnitude, i.e. the maximum exponent a number in the array would
        have in scientific notation, if only one digit is left of the decimal point.
        If there is no finite non-zero value, the order of magnitude is defined as 0.
    """
    # The logarithm of 0 is undefined and NaN/infinity cannot be converted
    # into an integer exponent -> ignore these values
    array = array[np.isfinite(array) & (array != 0)]
    if len(array) == 0:
        # No usable values -> define order of magnitude as 0
        return 0
    return int(np.max(np.floor(np.log10(np.abs(array)))).item())
|