biotite 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.2.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/application/application.py +3 -3
- biotite/application/autodock/app.py +1 -1
- biotite/application/blast/webapp.py +1 -1
- biotite/application/clustalo/app.py +1 -1
- biotite/application/dssp/app.py +13 -3
- biotite/application/localapp.py +36 -2
- biotite/application/msaapp.py +10 -10
- biotite/application/muscle/app3.py +5 -18
- biotite/application/muscle/app5.py +5 -5
- biotite/application/sra/app.py +0 -5
- biotite/application/util.py +22 -2
- biotite/application/viennarna/rnaalifold.py +8 -8
- biotite/application/viennarna/rnaplot.py +9 -3
- biotite/application/viennarna/util.py +1 -1
- biotite/application/webapp.py +1 -1
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +191 -0
- biotite/database/entrez/dbnames.py +10 -0
- biotite/database/entrez/download.py +9 -10
- biotite/database/entrez/key.py +1 -1
- biotite/database/entrez/query.py +5 -4
- biotite/database/pubchem/download.py +6 -6
- biotite/database/pubchem/error.py +10 -0
- biotite/database/pubchem/query.py +12 -23
- biotite/database/rcsb/download.py +3 -2
- biotite/database/rcsb/query.py +8 -9
- biotite/database/uniprot/check.py +22 -17
- biotite/database/uniprot/download.py +3 -6
- biotite/database/uniprot/query.py +4 -5
- biotite/file.py +14 -2
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +16 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +198 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1226 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +15 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +71 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/align/__init__.py +0 -4
- biotite/sequence/align/alignment.py +49 -14
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +26 -26
- biotite/sequence/align/cigar.py +2 -2
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +19 -2
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +58 -48
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localgapped.pyx +47 -47
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.pyx +10 -10
- biotite/sequence/align/matrix.py +284 -57
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.pyx +35 -35
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +2 -2
- biotite/sequence/align/statistics.py +1 -1
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/alphabet.py +5 -2
- biotite/sequence/annotation.py +19 -13
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/codon.py +1 -2
- biotite/sequence/graphics/alignment.py +25 -39
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
- biotite/sequence/graphics/colorschemes.py +44 -11
- biotite/sequence/graphics/dendrogram.py +4 -2
- biotite/sequence/graphics/features.py +2 -2
- biotite/sequence/graphics/logo.py +10 -12
- biotite/sequence/io/fasta/convert.py +1 -2
- biotite/sequence/io/fasta/file.py +1 -1
- biotite/sequence/io/fastq/file.py +3 -3
- biotite/sequence/io/genbank/file.py +3 -3
- biotite/sequence/io/genbank/sequence.py +2 -0
- biotite/sequence/io/gff/convert.py +1 -1
- biotite/sequence/io/gff/file.py +1 -2
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/profile.py +105 -29
- biotite/sequence/search.py +0 -1
- biotite/sequence/seqtypes.py +136 -8
- biotite/sequence/sequence.py +1 -2
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +6 -3
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +163 -66
- biotite/structure/basepairs.py +26 -26
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +79 -25
- biotite/structure/box.py +19 -21
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/celllist.pyx +83 -67
- biotite/structure/chains.py +5 -37
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/compare.py +420 -13
- biotite/structure/density.py +1 -1
- biotite/structure/dotbracket.py +27 -28
- biotite/structure/filter.py +8 -8
- biotite/structure/geometry.py +74 -127
- biotite/structure/hbond.py +17 -19
- biotite/structure/info/__init__.py +1 -0
- biotite/structure/info/atoms.py +24 -15
- biotite/structure/info/bonds.py +12 -6
- biotite/structure/info/ccd.py +125 -34
- biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
- biotite/structure/info/groups.py +62 -19
- biotite/structure/info/masses.py +9 -6
- biotite/structure/info/misc.py +15 -22
- biotite/structure/info/radii.py +92 -22
- biotite/structure/info/standardize.py +4 -4
- biotite/structure/integrity.py +4 -6
- biotite/structure/io/general.py +2 -2
- biotite/structure/io/gro/file.py +8 -9
- biotite/structure/io/mol/convert.py +1 -1
- biotite/structure/io/mol/ctab.py +33 -28
- biotite/structure/io/mol/mol.py +1 -1
- biotite/structure/io/mol/sdf.py +80 -53
- biotite/structure/io/pdb/convert.py +4 -3
- biotite/structure/io/pdb/file.py +85 -25
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbqt/file.py +36 -36
- biotite/structure/io/pdbx/__init__.py +1 -0
- biotite/structure/io/pdbx/bcif.py +54 -15
- biotite/structure/io/pdbx/cif.py +92 -66
- biotite/structure/io/pdbx/component.py +15 -4
- biotite/structure/io/pdbx/compress.py +321 -0
- biotite/structure/io/pdbx/convert.py +410 -75
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +98 -17
- biotite/structure/io/trajfile.py +9 -6
- biotite/structure/io/util.py +38 -0
- biotite/structure/mechanics.py +0 -1
- biotite/structure/molecules.py +141 -156
- biotite/structure/pseudoknots.py +7 -13
- biotite/structure/repair.py +2 -4
- biotite/structure/residues.py +13 -24
- biotite/structure/rings.py +335 -0
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/sasa.pyx +2 -1
- biotite/structure/segments.py +69 -11
- biotite/structure/sequence.py +0 -1
- biotite/structure/sse.py +0 -2
- biotite/structure/superimpose.py +74 -62
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +12 -25
- biotite/structure/util.py +76 -4
- biotite/version.py +9 -4
- biotite/visualize.py +111 -1
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
- biotite/structure/info/ccd/README.rst +0 -8
- biotite/structure/info/ccd/amino_acids.txt +0 -1663
- biotite/structure/info/ccd/carbohydrates.txt +0 -1135
- biotite/structure/info/ccd/nucleotides.txt +0 -798
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
- {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/structure/io/pdbx/cif.py
CHANGED
|
@@ -7,7 +7,6 @@ __author__ = "Patrick Kunzmann"
|
|
|
7
7
|
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
|
|
8
8
|
|
|
9
9
|
import itertools
|
|
10
|
-
import re
|
|
11
10
|
from collections.abc import MutableMapping, Sequence
|
|
12
11
|
import numpy as np
|
|
13
12
|
from biotite.file import (
|
|
@@ -150,7 +149,7 @@ class CIFColumn:
|
|
|
150
149
|
mask = CIFData(mask, np.uint8)
|
|
151
150
|
if len(mask) != len(data):
|
|
152
151
|
raise IndexError(
|
|
153
|
-
f"Data has length {len(data)},
|
|
152
|
+
f"Data has length {len(data)}, but mask has length {len(mask)}"
|
|
154
153
|
)
|
|
155
154
|
self._data = data
|
|
156
155
|
self._mask = mask
|
|
@@ -216,6 +215,11 @@ class CIFColumn:
|
|
|
216
215
|
``MaskValue.INAPPLICABLE`` or ``MaskValue.MISSING``.
|
|
217
216
|
By default, masked elements are converted to ``'.'`` or
|
|
218
217
|
``'?'`` depending on the :class:`MaskValue`.
|
|
218
|
+
|
|
219
|
+
Returns
|
|
220
|
+
-------
|
|
221
|
+
array : ndarray
|
|
222
|
+
The column data as array.
|
|
219
223
|
"""
|
|
220
224
|
if self._mask is None:
|
|
221
225
|
return self._data.array.astype(dtype, copy=False)
|
|
@@ -357,7 +361,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
357
361
|
return CIFBlock
|
|
358
362
|
|
|
359
363
|
@staticmethod
|
|
360
|
-
def deserialize(text
|
|
364
|
+
def deserialize(text):
|
|
361
365
|
lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
|
|
362
366
|
|
|
363
367
|
if _is_loop_start(lines[0]):
|
|
@@ -372,7 +376,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
372
376
|
|
|
373
377
|
lines = _to_single(lines)
|
|
374
378
|
if is_looped:
|
|
375
|
-
category_dict = CIFCategory._deserialize_looped(lines
|
|
379
|
+
category_dict = CIFCategory._deserialize_looped(lines)
|
|
376
380
|
else:
|
|
377
381
|
category_dict = CIFCategory._deserialize_single(lines)
|
|
378
382
|
return CIFCategory(category_dict, category_name)
|
|
@@ -416,6 +420,9 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
416
420
|
raise ValueError("At least one column must remain")
|
|
417
421
|
del self._columns[key]
|
|
418
422
|
|
|
423
|
+
def __contains__(self, key):
|
|
424
|
+
return key in self._columns
|
|
425
|
+
|
|
419
426
|
def __iter__(self):
|
|
420
427
|
return iter(self._columns)
|
|
421
428
|
|
|
@@ -442,7 +449,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
442
449
|
line_i = 0
|
|
443
450
|
while line_i < len(lines):
|
|
444
451
|
line = lines[line_i]
|
|
445
|
-
parts = _split_one_line(line)
|
|
452
|
+
parts = list(_split_one_line(line))
|
|
446
453
|
if len(parts) == 2:
|
|
447
454
|
# Standard case -> name and value in one line
|
|
448
455
|
name_part, value_part = parts
|
|
@@ -450,7 +457,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
450
457
|
elif len(parts) == 1:
|
|
451
458
|
# Value is a multiline value on the next line
|
|
452
459
|
name_part = parts[0]
|
|
453
|
-
parts = _split_one_line(lines[line_i + 1])
|
|
460
|
+
parts = list(_split_one_line(lines[line_i + 1]))
|
|
454
461
|
if len(parts) == 1:
|
|
455
462
|
value_part = parts[0]
|
|
456
463
|
else:
|
|
@@ -464,7 +471,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
464
471
|
return category_dict
|
|
465
472
|
|
|
466
473
|
@staticmethod
|
|
467
|
-
def _deserialize_looped(lines
|
|
474
|
+
def _deserialize_looped(lines):
|
|
468
475
|
"""
|
|
469
476
|
Process a category where each field has multiple values
|
|
470
477
|
(category is a table).
|
|
@@ -487,20 +494,7 @@ class CIFCategory(_Component, MutableMapping):
|
|
|
487
494
|
# row-line-alignment at all and simply cycle through columns
|
|
488
495
|
column_indices = itertools.cycle(range(len(column_names)))
|
|
489
496
|
for data_line in data_lines:
|
|
490
|
-
|
|
491
|
-
# use regex-based _split_one_line() to split
|
|
492
|
-
# Otherwise use much more faster whitespace split
|
|
493
|
-
# and quote removal if applicable.
|
|
494
|
-
if expect_whitespace:
|
|
495
|
-
values = _split_one_line(data_line)
|
|
496
|
-
else:
|
|
497
|
-
values = data_line.split()
|
|
498
|
-
for k in range(len(values)):
|
|
499
|
-
# Remove quotes
|
|
500
|
-
if (values[k][0] == '"' and values[k][-1] == '"') or (
|
|
501
|
-
values[k][0] == "'" and values[k][-1] == "'"
|
|
502
|
-
):
|
|
503
|
-
values[k] = values[k][1:-1]
|
|
497
|
+
values = _split_one_line(data_line)
|
|
504
498
|
for val in values:
|
|
505
499
|
column_index = next(column_indices)
|
|
506
500
|
column_name = column_names[column_index]
|
|
@@ -569,6 +563,17 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
569
563
|
The keys are the category names and the values are the
|
|
570
564
|
:class:`CIFCategory` objects.
|
|
571
565
|
By default, an empty block is created.
|
|
566
|
+
name : str, optional
|
|
567
|
+
The name of the block.
|
|
568
|
+
This is only used for serialization and is automatically set,
|
|
569
|
+
when the :class:`CIFBlock` is added to a :class:`CIFFile`.
|
|
570
|
+
It only needs to be set manually, when the block is directly
|
|
571
|
+
serialized.
|
|
572
|
+
|
|
573
|
+
Attributes
|
|
574
|
+
----------
|
|
575
|
+
name : str
|
|
576
|
+
The name of the block.
|
|
572
577
|
|
|
573
578
|
Notes
|
|
574
579
|
-----
|
|
@@ -580,13 +585,15 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
580
585
|
--------
|
|
581
586
|
|
|
582
587
|
>>> # Add category on creation
|
|
583
|
-
>>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
|
|
588
|
+
>>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
|
|
584
589
|
>>> # Add category later on
|
|
585
590
|
>>> block["bar"] = CIFCategory({"another_column": [2, 3]})
|
|
586
591
|
>>> # Access a column
|
|
587
592
|
>>> print(block["bar"]["another_column"].as_array())
|
|
588
593
|
['2' '3']
|
|
589
594
|
>>> print(block.serialize())
|
|
595
|
+
data_baz
|
|
596
|
+
#
|
|
590
597
|
_foo.some_column 1
|
|
591
598
|
#
|
|
592
599
|
loop_
|
|
@@ -596,11 +603,20 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
596
603
|
#
|
|
597
604
|
"""
|
|
598
605
|
|
|
599
|
-
def __init__(self, categories=None):
|
|
606
|
+
def __init__(self, categories=None, name=None):
|
|
607
|
+
self._name = name
|
|
600
608
|
if categories is None:
|
|
601
609
|
categories = {}
|
|
602
610
|
self._categories = categories
|
|
603
611
|
|
|
612
|
+
@property
|
|
613
|
+
def name(self):
|
|
614
|
+
return self._name
|
|
615
|
+
|
|
616
|
+
@name.setter
|
|
617
|
+
def name(self, name):
|
|
618
|
+
self._name = name
|
|
619
|
+
|
|
604
620
|
@staticmethod
|
|
605
621
|
def subcomponent_class():
|
|
606
622
|
return CIFCategory
|
|
@@ -634,7 +650,10 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
634
650
|
return CIFBlock(_create_element_dict(lines, category_names, category_starts))
|
|
635
651
|
|
|
636
652
|
def serialize(self):
|
|
637
|
-
|
|
653
|
+
if self._name is None:
|
|
654
|
+
raise SerializationError("Block name is required")
|
|
655
|
+
# The block starts with the black name line followed by a comment line
|
|
656
|
+
text_blocks = ["data_" + self._name + "\n#\n"]
|
|
638
657
|
for category_name, category in self._categories.items():
|
|
639
658
|
if isinstance(category, str):
|
|
640
659
|
# Category is already stored as lines
|
|
@@ -657,15 +676,7 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
657
676
|
# Element is stored in serialized form
|
|
658
677
|
# -> must be deserialized first
|
|
659
678
|
try:
|
|
660
|
-
|
|
661
|
-
# Even if the values are quote protected,
|
|
662
|
-
# no whitespace is expected in escaped values
|
|
663
|
-
# Therefore slow regex-based _split_one_line() call is not necessary
|
|
664
|
-
if key == "atom_site":
|
|
665
|
-
expect_whitespace = False
|
|
666
|
-
else:
|
|
667
|
-
expect_whitespace = True
|
|
668
|
-
category = CIFCategory.deserialize(category, expect_whitespace)
|
|
679
|
+
category = CIFCategory.deserialize(category)
|
|
669
680
|
except Exception:
|
|
670
681
|
raise DeserializationError(f"Failed to deserialize category '{key}'")
|
|
671
682
|
# Update with deserialized object
|
|
@@ -683,6 +694,9 @@ class CIFBlock(_Component, MutableMapping):
|
|
|
683
694
|
def __delitem__(self, key):
|
|
684
695
|
del self._categories[key]
|
|
685
696
|
|
|
697
|
+
def __contains__(self, key):
|
|
698
|
+
return key in self._categories
|
|
699
|
+
|
|
686
700
|
def __iter__(self):
|
|
687
701
|
return iter(self._categories)
|
|
688
702
|
|
|
@@ -712,6 +726,19 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
712
726
|
use the high-level :func:`get_structure()` or
|
|
713
727
|
:func:`set_structure()` function respectively.
|
|
714
728
|
|
|
729
|
+
Parameters
|
|
730
|
+
----------
|
|
731
|
+
blocks : dict (str -> CIFBlock), optional
|
|
732
|
+
The initial blocks of the file.
|
|
733
|
+
Maps the block names to the corresponding :class:`CIFBlock` objects.
|
|
734
|
+
By default no initial blocks are added.
|
|
735
|
+
|
|
736
|
+
Attributes
|
|
737
|
+
----------
|
|
738
|
+
block : CIFBlock
|
|
739
|
+
The sole block of the file.
|
|
740
|
+
If the file contains multiple blocks, an exception is raised.
|
|
741
|
+
|
|
715
742
|
Notes
|
|
716
743
|
-----
|
|
717
744
|
The content of CIF files are lazily deserialized:
|
|
@@ -722,12 +749,6 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
722
749
|
The deserialized :class:`CIFBlock`/:class:`CIFCategory` objects
|
|
723
750
|
are cached for subsequent accesses.
|
|
724
751
|
|
|
725
|
-
Attributes
|
|
726
|
-
----------
|
|
727
|
-
block : CIFBlock
|
|
728
|
-
The sole block of the file.
|
|
729
|
-
If the file contains multiple blocks, an exception is raised.
|
|
730
|
-
|
|
731
752
|
Examples
|
|
732
753
|
--------
|
|
733
754
|
Read a CIF file and access its content:
|
|
@@ -806,14 +827,12 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
806
827
|
def serialize(self):
|
|
807
828
|
text_blocks = []
|
|
808
829
|
for block_name, block in self._blocks.items():
|
|
809
|
-
text_blocks.append("data_" + block_name + "\n")
|
|
810
|
-
# A comment line is set after the block indicator
|
|
811
|
-
text_blocks.append("#\n")
|
|
812
830
|
if isinstance(block, str):
|
|
813
831
|
# Block is already stored as text
|
|
814
832
|
text_blocks.append(block)
|
|
815
833
|
else:
|
|
816
834
|
try:
|
|
835
|
+
block.name = block_name
|
|
817
836
|
text_blocks.append(block.serialize())
|
|
818
837
|
except Exception:
|
|
819
838
|
raise SerializationError(
|
|
@@ -877,6 +896,7 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
877
896
|
block = CIFBlock.deserialize(block)
|
|
878
897
|
except Exception:
|
|
879
898
|
raise DeserializationError(f"Failed to deserialize block '{key}'")
|
|
899
|
+
block.name = key
|
|
880
900
|
# Update with deserialized object
|
|
881
901
|
self._blocks[key] = block
|
|
882
902
|
return block
|
|
@@ -884,11 +904,15 @@ class CIFFile(_Component, File, MutableMapping):
|
|
|
884
904
|
def __setitem__(self, key, block):
|
|
885
905
|
if not isinstance(block, CIFBlock):
|
|
886
906
|
raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
|
|
907
|
+
block.name = key
|
|
887
908
|
self._blocks[key] = block
|
|
888
909
|
|
|
889
910
|
def __delitem__(self, key):
|
|
890
911
|
del self._blocks[key]
|
|
891
912
|
|
|
913
|
+
def __contains__(self, key):
|
|
914
|
+
return key in self._blocks
|
|
915
|
+
|
|
892
916
|
def __iter__(self):
|
|
893
917
|
return iter(self._blocks)
|
|
894
918
|
|
|
@@ -921,7 +945,7 @@ def _create_element_dict(lines, element_names, element_starts):
|
|
|
921
945
|
# Lazy deserialization
|
|
922
946
|
# -> keep as text for now and deserialize later if needed
|
|
923
947
|
return {
|
|
924
|
-
element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
|
|
948
|
+
element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) + "\n"
|
|
925
949
|
for i, element_name in enumerate(element_names)
|
|
926
950
|
}
|
|
927
951
|
|
|
@@ -1029,29 +1053,31 @@ def _split_one_line(line):
|
|
|
1029
1053
|
"""
|
|
1030
1054
|
# Special case of multiline value, where the line starts with ';'
|
|
1031
1055
|
if line[0] == ";":
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1056
|
+
yield line[1:]
|
|
1057
|
+
elif "'" in line or '"' in line:
|
|
1058
|
+
# Quoted values in the line
|
|
1059
|
+
while line:
|
|
1060
|
+
# Strip leading whitespace(s)
|
|
1061
|
+
stripped_line = line.lstrip()
|
|
1062
|
+
# Split the line on whitespace
|
|
1063
|
+
word, _, line = stripped_line.partition(" ")
|
|
1064
|
+
# Handle the case where the word start with a quote
|
|
1065
|
+
if word.startswith(("'", '"')):
|
|
1066
|
+
# Set the separator to the quote found
|
|
1067
|
+
separator = word[0]
|
|
1068
|
+
# Handle the case of a quoted word without space
|
|
1069
|
+
if word.endswith(separator) and len(word) > 1:
|
|
1070
|
+
# Yield the word without the opening and closing quotes
|
|
1071
|
+
yield word[1:-1]
|
|
1072
|
+
continue
|
|
1073
|
+
# split the word on the separator
|
|
1074
|
+
word, _, line = stripped_line[1:].partition(separator)
|
|
1075
|
+
|
|
1076
|
+
yield word
|
|
1077
|
+
else:
|
|
1078
|
+
# No quoted values in the line -> simple whitespace split
|
|
1079
|
+
for line in line.split():
|
|
1080
|
+
yield line
|
|
1055
1081
|
|
|
1056
1082
|
|
|
1057
1083
|
def _arrayfy(data):
|
|
@@ -120,6 +120,12 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
|
|
|
120
120
|
A component is only deserialized from the serialized data, if it
|
|
121
121
|
is accessed.
|
|
122
122
|
The deserialized component is then cached in the container.
|
|
123
|
+
|
|
124
|
+
Parameters
|
|
125
|
+
----------
|
|
126
|
+
elements : dict, optional
|
|
127
|
+
The initial elements of the container.
|
|
128
|
+
By default no initial elements are added.
|
|
123
129
|
"""
|
|
124
130
|
|
|
125
131
|
def __init__(self, elements=None):
|
|
@@ -171,10 +177,10 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
|
|
|
171
177
|
Parameters
|
|
172
178
|
----------
|
|
173
179
|
store_key_in: str, optional
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
180
|
+
If given, the key of each element is stored as value in the
|
|
181
|
+
serialized element.
|
|
182
|
+
This is basically the reverse operation of `take_key_from` in
|
|
183
|
+
:meth:`_deserialize_elements()`.
|
|
178
184
|
"""
|
|
179
185
|
serialized_elements = []
|
|
180
186
|
for key, element in self._elements.items():
|
|
@@ -223,6 +229,11 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
|
|
|
223
229
|
def __delitem__(self, key):
|
|
224
230
|
del self._elements[key]
|
|
225
231
|
|
|
232
|
+
# Implement `__contains__()` explicitly,
|
|
233
|
+
# because the mixin method unnecessarily deserializes the value, if available
|
|
234
|
+
def __contains__(self, key):
|
|
235
|
+
return key in self._elements
|
|
236
|
+
|
|
226
237
|
def __iter__(self):
|
|
227
238
|
return iter(self._elements)
|
|
228
239
|
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
__all__ = ["compress"]
|
|
2
|
+
__name__ = "biotite.structure.io.pdbx"
|
|
3
|
+
__author__ = "Patrick Kunzmann"
|
|
4
|
+
|
|
5
|
+
import itertools
|
|
6
|
+
import msgpack
|
|
7
|
+
import numpy as np
|
|
8
|
+
import biotite.structure.io.pdbx.bcif as bcif
|
|
9
|
+
from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
|
|
10
|
+
from biotite.structure.io.pdbx.encoding import (
|
|
11
|
+
ByteArrayEncoding,
|
|
12
|
+
DeltaEncoding,
|
|
13
|
+
FixedPointEncoding,
|
|
14
|
+
IntegerPackingEncoding,
|
|
15
|
+
RunLengthEncoding,
|
|
16
|
+
StringArrayEncoding,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compress(data, float_tolerance=1e-6):
|
|
21
|
+
"""
|
|
22
|
+
Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
|
|
23
|
+
different data encodings for each data array and selecting the one, which results in
|
|
24
|
+
the smallest size.
|
|
25
|
+
|
|
26
|
+
Parameters
|
|
27
|
+
----------
|
|
28
|
+
data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
|
|
29
|
+
The data to compress.
|
|
30
|
+
float_tolerance : float, optional
|
|
31
|
+
The relative error that is accepted when compressing floating point numbers.
|
|
32
|
+
|
|
33
|
+
Returns
|
|
34
|
+
-------
|
|
35
|
+
compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
|
|
36
|
+
The compressed data with the same type as the input data.
|
|
37
|
+
If no improved compression is found for a :class:`BinaryCIFData` array,
|
|
38
|
+
the input data is kept.
|
|
39
|
+
Hence, the return value is no deep copy of the input data.
|
|
40
|
+
|
|
41
|
+
Examples
|
|
42
|
+
--------
|
|
43
|
+
|
|
44
|
+
>>> from io import BytesIO
|
|
45
|
+
>>> pdbx_file = BinaryCIFFile()
|
|
46
|
+
>>> set_structure(pdbx_file, atom_array_stack)
|
|
47
|
+
>>> # Write uncompressed file
|
|
48
|
+
>>> uncompressed_file = BytesIO()
|
|
49
|
+
>>> pdbx_file.write(uncompressed_file)
|
|
50
|
+
>>> _ = uncompressed_file.seek(0)
|
|
51
|
+
>>> print(f"{len(uncompressed_file.read()) // 1000} KB")
|
|
52
|
+
927 KB
|
|
53
|
+
>>> # Write compressed file
|
|
54
|
+
>>> pdbx_file = compress(pdbx_file)
|
|
55
|
+
>>> compressed_file = BytesIO()
|
|
56
|
+
>>> pdbx_file.write(compressed_file)
|
|
57
|
+
>>> _ = compressed_file.seek(0)
|
|
58
|
+
>>> print(f"{len(compressed_file.read()) // 1000} KB")
|
|
59
|
+
111 KB
|
|
60
|
+
"""
|
|
61
|
+
match type(data):
|
|
62
|
+
case bcif.BinaryCIFFile:
|
|
63
|
+
return _compress_file(data, float_tolerance)
|
|
64
|
+
case bcif.BinaryCIFBlock:
|
|
65
|
+
return _compress_block(data, float_tolerance)
|
|
66
|
+
case bcif.BinaryCIFCategory:
|
|
67
|
+
return _compress_category(data, float_tolerance)
|
|
68
|
+
case bcif.BinaryCIFColumn:
|
|
69
|
+
return _compress_column(data, float_tolerance)
|
|
70
|
+
case bcif.BinaryCIFData:
|
|
71
|
+
return _compress_data(data, float_tolerance)
|
|
72
|
+
case _:
|
|
73
|
+
raise TypeError(f"Unsupported type {type(data).__name__}")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _compress_file(bcif_file, float_tolerance):
|
|
77
|
+
compressed_file = bcif.BinaryCIFFile()
|
|
78
|
+
for block_name, bcif_block in bcif_file.items():
|
|
79
|
+
compressed_block = _compress_block(bcif_block, float_tolerance)
|
|
80
|
+
compressed_file[block_name] = compressed_block
|
|
81
|
+
return compressed_file
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _compress_block(bcif_block, float_tolerance):
|
|
85
|
+
compressed_block = bcif.BinaryCIFBlock()
|
|
86
|
+
for category_name, bcif_category in bcif_block.items():
|
|
87
|
+
compressed_category = _compress_category(bcif_category, float_tolerance)
|
|
88
|
+
compressed_block[category_name] = compressed_category
|
|
89
|
+
return compressed_block
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _compress_category(bcif_category, float_tolerance):
|
|
93
|
+
compressed_category = bcif.BinaryCIFCategory()
|
|
94
|
+
for column_name, bcif_column in bcif_category.items():
|
|
95
|
+
compressed_column = _compress_column(bcif_column, float_tolerance)
|
|
96
|
+
compressed_category[column_name] = compressed_column
|
|
97
|
+
return compressed_category
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _compress_column(bcif_column, float_tolerance):
|
|
101
|
+
data = _compress_data(bcif_column.data, float_tolerance)
|
|
102
|
+
if bcif_column.mask is not None:
|
|
103
|
+
mask = _compress_data(bcif_column.mask, float_tolerance)
|
|
104
|
+
else:
|
|
105
|
+
mask = None
|
|
106
|
+
return bcif.BinaryCIFColumn(data, mask)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _compress_data(bcif_data, float_tolerance):
|
|
110
|
+
array = bcif_data.array
|
|
111
|
+
if len(array) == 1:
|
|
112
|
+
# No need to compress a single value -> Use default uncompressed encoding
|
|
113
|
+
return bcif.BinaryCIFData(array)
|
|
114
|
+
|
|
115
|
+
if np.issubdtype(array.dtype, np.str_):
|
|
116
|
+
# Leave encoding empty for now, as it is explicitly set later
|
|
117
|
+
encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
|
|
118
|
+
# Run encode to initialize the data and offset arrays
|
|
119
|
+
indices = encoding.encode(array)
|
|
120
|
+
offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
|
|
121
|
+
encoding.data_encoding, _ = _find_best_integer_compression(indices)
|
|
122
|
+
encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
|
|
123
|
+
return bcif.BinaryCIFData(array, [encoding])
|
|
124
|
+
|
|
125
|
+
elif np.issubdtype(array.dtype, np.floating):
|
|
126
|
+
to_integer_encoding = FixedPointEncoding(
|
|
127
|
+
10 ** _get_decimal_places(array, float_tolerance)
|
|
128
|
+
)
|
|
129
|
+
integer_array = to_integer_encoding.encode(array)
|
|
130
|
+
best_encoding, size_compressed = _find_best_integer_compression(integer_array)
|
|
131
|
+
if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
|
|
132
|
+
return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
|
|
133
|
+
else:
|
|
134
|
+
# The float array is smaller -> encode it directly as bytes
|
|
135
|
+
return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
|
|
136
|
+
|
|
137
|
+
elif np.issubdtype(array.dtype, np.integer):
|
|
138
|
+
array = _to_smallest_integer_type(array)
|
|
139
|
+
encodings, _ = _find_best_integer_compression(array)
|
|
140
|
+
return bcif.BinaryCIFData(array, encodings)
|
|
141
|
+
|
|
142
|
+
else:
|
|
143
|
+
raise TypeError(f"Unsupported data type {array.dtype}")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _find_best_integer_compression(array):
|
|
147
|
+
"""
|
|
148
|
+
Try different data encodings on an integer array and return the one that results in
|
|
149
|
+
the smallest size.
|
|
150
|
+
"""
|
|
151
|
+
best_encoding_sequence = None
|
|
152
|
+
smallest_size = np.inf
|
|
153
|
+
|
|
154
|
+
for use_delta in [False, True]:
|
|
155
|
+
if use_delta:
|
|
156
|
+
encoding = DeltaEncoding()
|
|
157
|
+
array_after_delta = encoding.encode(array)
|
|
158
|
+
encodings_after_delta = [encoding]
|
|
159
|
+
else:
|
|
160
|
+
encodings_after_delta = []
|
|
161
|
+
array_after_delta = array
|
|
162
|
+
for use_run_length in [False, True]:
|
|
163
|
+
# Use encoded data from previous step to save time
|
|
164
|
+
if use_run_length:
|
|
165
|
+
encoding = RunLengthEncoding()
|
|
166
|
+
array_after_rle = encoding.encode(array_after_delta)
|
|
167
|
+
encodings_after_rle = encodings_after_delta + [encoding]
|
|
168
|
+
else:
|
|
169
|
+
encodings_after_rle = encodings_after_delta
|
|
170
|
+
array_after_rle = array_after_delta
|
|
171
|
+
for packed_byte_count in [None, 1, 2]:
|
|
172
|
+
if packed_byte_count is not None:
|
|
173
|
+
# Quickly check this heuristic
|
|
174
|
+
# to avoid computing an exploding packed data array
|
|
175
|
+
if (
|
|
176
|
+
_estimate_packed_length(array_after_rle, packed_byte_count)
|
|
177
|
+
>= array_after_rle.nbytes
|
|
178
|
+
):
|
|
179
|
+
# Packing would not reduce the size
|
|
180
|
+
continue
|
|
181
|
+
encoding = IntegerPackingEncoding(packed_byte_count)
|
|
182
|
+
array_after_packing = encoding.encode(array_after_rle)
|
|
183
|
+
encodings_after_packing = encodings_after_rle + [encoding]
|
|
184
|
+
else:
|
|
185
|
+
encodings_after_packing = encodings_after_rle
|
|
186
|
+
array_after_packing = array_after_rle
|
|
187
|
+
encoding = ByteArrayEncoding()
|
|
188
|
+
encoded_array = encoding.encode(array_after_packing)
|
|
189
|
+
encodings = encodings_after_packing + [encoding]
|
|
190
|
+
# Pack data directly instead of using the BinaryCIFData class
|
|
191
|
+
# to avoid the unnecessary re-encoding of the array,
|
|
192
|
+
# as it is already available in 'encoded_array'
|
|
193
|
+
serialized_encoding = [enc.serialize() for enc in encodings]
|
|
194
|
+
serialized_data = {
|
|
195
|
+
"data": encoded_array,
|
|
196
|
+
"encoding": serialized_encoding,
|
|
197
|
+
}
|
|
198
|
+
size = _data_size_in_file(serialized_data)
|
|
199
|
+
if size < smallest_size:
|
|
200
|
+
best_encoding_sequence = encodings
|
|
201
|
+
smallest_size = size
|
|
202
|
+
return best_encoding_sequence, smallest_size
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _estimate_packed_length(array, packed_byte_count):
|
|
206
|
+
"""
|
|
207
|
+
Estimate the length of an integer array after packing it with a given number of
|
|
208
|
+
bytes.
|
|
209
|
+
|
|
210
|
+
Parameters
|
|
211
|
+
----------
|
|
212
|
+
array : numpy.ndarray
|
|
213
|
+
The array to pack.
|
|
214
|
+
packed_byte_count : int
|
|
215
|
+
The number of bytes used for packing.
|
|
216
|
+
|
|
217
|
+
Returns
|
|
218
|
+
-------
|
|
219
|
+
length : int
|
|
220
|
+
The estimated length of the packed array.
|
|
221
|
+
"""
|
|
222
|
+
# Use int64 to avoid integer overflow in the following line
|
|
223
|
+
max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
|
|
224
|
+
n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
|
|
225
|
+
return np.sum(n_bytes_per_element, dtype=np.int64)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _to_smallest_integer_type(array):
|
|
229
|
+
"""
|
|
230
|
+
Convert an integer array to the smallest possible integer type, that is still able
|
|
231
|
+
to represent all values in the array.
|
|
232
|
+
|
|
233
|
+
Parameters
|
|
234
|
+
----------
|
|
235
|
+
array : numpy.ndarray
|
|
236
|
+
The array to convert.
|
|
237
|
+
|
|
238
|
+
Returns
|
|
239
|
+
-------
|
|
240
|
+
array : numpy.ndarray
|
|
241
|
+
The converted array.
|
|
242
|
+
"""
|
|
243
|
+
if array.min() >= 0:
|
|
244
|
+
for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
|
|
245
|
+
if np.all(array <= np.iinfo(dtype).max):
|
|
246
|
+
return array.astype(dtype)
|
|
247
|
+
for dtype in [np.int8, np.int16, np.int32, np.int64]:
|
|
248
|
+
if np.all(array >= np.iinfo(dtype).min) and np.all(
|
|
249
|
+
array <= np.iinfo(dtype).max
|
|
250
|
+
):
|
|
251
|
+
return array.astype(dtype)
|
|
252
|
+
raise ValueError("Array is out of bounds for all integer types")
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _data_size_in_file(data):
    """
    Get the size of the data, it would have when written into a *BinaryCIF* file.

    Parameters
    ----------
    data : BinaryCIFData or dict
        The data array whose size is measured.
        Can be either a :class:`BinaryCIFData` object or already serialized data.

    Returns
    -------
    size : int
        The size of the data array in the file in bytes.
    """
    # Accept both forms: serialize first, if not already serialized
    serialized = data.serialize() if isinstance(data, bcif.BinaryCIFData) else data
    # Pack with MessagePack exactly as the BinaryCIF writer does
    packed = msgpack.packb(serialized, use_bin_type=True, default=encode_numpy)
    return len(packed)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _get_decimal_places(array, tol):
    """
    Get the number of decimal places in a floating point array.

    Parameters
    ----------
    array : numpy.ndarray
        The array to analyze.
    tol : float
        The relative tolerance allowed when the values are cut off after the returned
        number of decimal places.

    Returns
    -------
    decimals : int
        The number of decimal places.
    """
    # NaN/inf values have no meaningful decimals
    # and zeros would break the relative-error criterion below
    values = array[np.isfinite(array) & (array != 0)]
    abs_values = np.abs(values)
    # Start at the smallest number of decimals that could possibly suffice
    decimals = -_order_magnitude(values)
    while True:
        deviation = np.abs(np.round(values, decimals) - values)
        if np.all(deviation < tol * abs_values):
            return decimals
        decimals += 1
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _order_magnitude(array):
|
|
303
|
+
"""
|
|
304
|
+
Get the order of magnitude of floating point values.
|
|
305
|
+
|
|
306
|
+
Parameters
|
|
307
|
+
----------
|
|
308
|
+
array : ndarray, dtype=float
|
|
309
|
+
The value to analyze.
|
|
310
|
+
|
|
311
|
+
Returns
|
|
312
|
+
-------
|
|
313
|
+
magnitude : int
|
|
314
|
+
The order of magnitude, i.e. the maximum exponent a number in the array would
|
|
315
|
+
have in scientific notation, if only one digit is left of the decimal point.
|
|
316
|
+
"""
|
|
317
|
+
array = array[array != 0]
|
|
318
|
+
if len(array) == 0:
|
|
319
|
+
# No non-zero values -> define order of magnitude as 0
|
|
320
|
+
return 0
|
|
321
|
+
return int(np.max(np.floor(np.log10(np.abs(array)))).item())
|