biotite 1.5.0__cp314-cp314-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-314-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-314-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-314-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-314-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-314-darwin.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-314-darwin.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-314-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-314-darwin.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-314-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-314-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-314-darwin.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-314-darwin.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-314-darwin.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-314-darwin.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-314-darwin.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-314-darwin.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-314-darwin.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-314-darwin.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-314-darwin.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-314-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-314-darwin.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,1078 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This module contains data encodings for BinaryCIF files.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__name__ = "biotite.structure.io.pdbx"
|
|
10
|
+
__author__ = "Patrick Kunzmann"
|
|
11
|
+
__all__ = ["ByteArrayEncoding", "FixedPointEncoding",
|
|
12
|
+
"IntervalQuantizationEncoding", "RunLengthEncoding",
|
|
13
|
+
"DeltaEncoding", "IntegerPackingEncoding", "StringArrayEncoding",
|
|
14
|
+
"TypeCode"]
|
|
15
|
+
|
|
16
|
+
cimport cython
|
|
17
|
+
cimport numpy as np
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from abc import ABCMeta, abstractmethod
|
|
21
|
+
from numbers import Integral
|
|
22
|
+
from enum import IntEnum
|
|
23
|
+
import re
|
|
24
|
+
import numpy as np
|
|
25
|
+
from .component import _Component
|
|
26
|
+
from ....file import InvalidFileError
|
|
27
|
+
|
|
28
|
+
ctypedef np.int8_t int8
|
|
29
|
+
ctypedef np.int16_t int16
|
|
30
|
+
ctypedef np.int32_t int32
|
|
31
|
+
ctypedef np.uint8_t uint8
|
|
32
|
+
ctypedef np.uint16_t uint16
|
|
33
|
+
ctypedef np.uint32_t uint32
|
|
34
|
+
ctypedef np.float32_t float32
|
|
35
|
+
ctypedef np.float64_t float64
|
|
36
|
+
|
|
37
|
+
ctypedef fused Integer:
|
|
38
|
+
uint8
|
|
39
|
+
uint16
|
|
40
|
+
uint32
|
|
41
|
+
int8
|
|
42
|
+
int16
|
|
43
|
+
int32
|
|
44
|
+
|
|
45
|
+
# Used to create cartesian product of type combinations
|
|
46
|
+
# in run-length encoding
|
|
47
|
+
ctypedef fused OutputInteger:
|
|
48
|
+
uint8
|
|
49
|
+
uint16
|
|
50
|
+
uint32
|
|
51
|
+
int8
|
|
52
|
+
int16
|
|
53
|
+
int32
|
|
54
|
+
|
|
55
|
+
ctypedef fused Float:
|
|
56
|
+
float32
|
|
57
|
+
float64
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
CAMEL_CASE_PATTERN = re.compile(r"(?<!^)(?=[A-Z])")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class TypeCode(IntEnum):
    """
    Integer codes denoting the data types used in *BinaryCIF*.
    """
    INT8 = 1
    INT16 = 2
    INT32 = 3
    UINT8 = 4
    UINT16 = 5
    UINT32 = 6
    FLOAT32 = 32
    FLOAT64 = 33

    @staticmethod
    def from_dtype(dtype):
        """
        Convert a *NumPy* dtype to a *BinaryCIF* type code.

        Parameters
        ----------
        dtype : dtype or int or TypeCode
            The data type to be converted.
            If already a type code, it is simply returned.

        Returns
        -------
        type_code : TypeCode
            The corresponding type code.
        """
        if isinstance(dtype, Integral):
            # The given value is already a type code
            return TypeCode(dtype)

        dtype = np.dtype(dtype)
        # Map dtypes without a BinaryCIF counterpart
        # to the closest supported dtype
        if np.issubdtype(dtype, np.integer):
            # 64-bit integers are not part of the format
            if dtype == np.int64:
                narrowed_dtype = np.int32
            elif dtype == np.uint64:
                narrowed_dtype = np.uint32
            else:
                narrowed_dtype = dtype
        elif np.issubdtype(dtype, np.floating):
            if dtype == np.float16:
                narrowed_dtype = np.float32
            # float128 exists only on some architectures
            elif hasattr(np, "float128") and dtype == np.float128:
                narrowed_dtype = np.float64
            else:
                narrowed_dtype = dtype
        else:
            raise ValueError(
                f"dtype '{dtype}' is not supported by BinaryCIF"
            )
        return _DTYPE_TO_TYPE_CODE[
            np.dtype(narrowed_dtype).newbyteorder("<").str
        ]

    def to_dtype(self):
        """
        Convert this type code to a *NumPy* dtype.

        Returns
        -------
        dtype : dtype
            The corresponding data type.
        """
        return _TYPE_CODE_TO_DTYPE[self]


# Maps BCIF type codes to little-endian NumPy dtype strings
# (single-byte types carry no byte order, hence the '|' prefix)
_TYPE_CODE_TO_DTYPE = {
    TypeCode.INT8: "|i1",
    TypeCode.INT16: "<i2",
    TypeCode.INT32: "<i4",
    TypeCode.UINT8: "|u1",
    TypeCode.UINT16: "<u2",
    TypeCode.UINT32: "<u4",
    TypeCode.FLOAT32: "<f4",
    TypeCode.FLOAT64: "<f8",
}
# Inverse mapping for encoding
_DTYPE_TO_TYPE_CODE = {dtype: code for code, dtype in _TYPE_CODE_TO_DTYPE.items()}
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Encoding(_Component, metaclass=ABCMeta):
    """
    Abstract base class for *BinaryCIF* data encodings.

    Notes
    -----
    The encoding classes do not omit bound checks for decoding,
    since the file content may be invalid/malicious.
    """

    @classmethod
    def deserialize(cls, content):
        """
        Create an encoding from its serialized *BinaryCIF* form.

        Parameters
        ----------
        content : dict
            The encoding parameters with camel case keys, as read from
            the file.

        Returns
        -------
        encoding : Encoding
            The deserialized encoding.

        Raises
        ------
        InvalidFileError
            If the parameters do not fit this encoding class.
        """
        params = {
            _camel_to_snake_case(param): value
            for param, value in content.items()
        }
        # 'kind' is no parameter, but indicates the class itself;
        # guard the pop, as the file content may be malformed
        try:
            params.pop("kind")
        except KeyError:
            raise InvalidFileError(
                f"Missing encoding kind for {cls.__name__}"
            ) from None
        try:
            encoding = cls(**params)
        except TypeError as e:
            # Missing as well as unexpected parameters both surface as
            # 'TypeError' from the dataclass constructor
            raise InvalidFileError(
                f"Invalid encoding parameters for {cls.__name__}"
            ) from e
        except ValueError as e:
            # '__post_init__()' rejects unsuitable parameter values
            raise InvalidFileError(
                f"Invalid encoding parameter values for {cls.__name__}"
            ) from e
        return encoding

    def serialize(self):
        """
        Convert this encoding into its serialized *BinaryCIF* form.

        Returns
        -------
        serialized : dict
            The encoding parameters with camel case keys.

        Raises
        ------
        ValueError
            If a parameter that is lazily determined during encoding is
            still unset.
        """
        for param in self.__annotations__:
            if getattr(self, param) is None:
                raise ValueError(
                    f"'{param}' must be explicitly given or needs to be "
                    "determined from first encoding pass, before it is "
                    "serialized"
                )

        serialized = {
            _snake_to_camel_case(param): getattr(self, param)
            for param in self.__annotations__
        }
        serialized["kind"] = _encoding_classes_kinds[type(self).__name__]
        return serialized

    @abstractmethod
    def encode(self, data):
        """
        Apply this encoding to the given data.

        Parameters
        ----------
        data : ndarray
            The data to be encoded.

        Returns
        -------
        encoded_data : ndarray or bytes
            The encoded data.
        """
        raise NotImplementedError()

    @abstractmethod
    def decode(self, data):
        """
        Apply the inverse of this encoding to the given data.

        Parameters
        ----------
        data : ndarray or bytes
            The data to be decoded.

        Returns
        -------
        decoded_data : ndarray
            The decoded data.

        Warnings
        --------
        When overriding this method, do not omit bound checks with
        ``@cython.boundscheck(False)`` or ``@cython.wraparound(False)``,
        since the file content may be invalid/malicious.
        """
        raise NotImplementedError()

    def __str__(self):
        # Restore original behavior, as `__str__()` implementation of `_Component`
        # may require serialization, which is not possible for some encodings prior
        # to the first encoding pass
        return object.__str__(self)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
@dataclass
class ByteArrayEncoding(Encoding):
    r"""
    Encoding that converts an array into its raw bytes.

    Parameters
    ----------
    type : dytpe or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    type : TypeCode

    Examples
    --------

    >>> data = np.arange(3)
    >>> print(data)
    [0 1 2]
    >>> print(ByteArrayEncoding().encode(data))
    b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
    """
    type: ... = None

    def __post_init__(self):
        # Normalize an eventually given dtype to a `TypeCode`
        if self.type is None:
            return
        self.type = TypeCode.from_dtype(self.type)

    def encode(self, data):
        # Lazily take the type from the first encoded array
        if self.type is None:
            self.type = TypeCode.from_dtype(data.dtype)
        target_dtype = self.type.to_dtype()
        return _safe_cast(data, target_dtype).tobytes()

    def decode(self, data):
        # Here `data` is given as raw bytes
        return np.frombuffer(data, dtype=self.type.to_dtype())
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@dataclass
class FixedPointEncoding(Encoding):
    """
    Lossy encoding that multiplies floating point values with a given
    factor and subsequently rounds them to the nearest integer.

    Parameters
    ----------
    factor : float
        The factor by which the data is multiplied before rounding.
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be a float type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    factor : float
    src_type : TypeCode

    Examples
    --------

    >>> data = np.array([9.87, 6.543])
    >>> print(data)
    [9.870 6.543]
    >>> print(FixedPointEncoding(factor=100).encode(data))
    [987 654]
    """
    factor: ...
    src_type: ... = None

    def __post_init__(self):
        if self.src_type is not None:
            self.src_type = self._as_float_type(self.src_type)

    @staticmethod
    def _as_float_type(type_spec):
        # Shared validation for '__post_init__()' and 'encode()':
        # convert to a type code and ensure it is a float type
        type_code = TypeCode.from_dtype(type_spec)
        if type_code not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
            raise ValueError(
                "Only floating point types are supported"
            )
        return type_code

    def encode(self, data):
        # If not given in constructor, it is determined from the data
        if self.src_type is None:
            self.src_type = self._as_float_type(data.dtype)

        # Round to avoid wrong values due to floating point inaccuracies
        scaled_data = np.round(data * self.factor)
        return _safe_cast(scaled_data, np.int32, allow_decimal_loss=True)

    def decode(self, data):
        return (data / self.factor).astype(
            dtype=self.src_type.to_dtype(), copy=False
        )
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
@dataclass
|
|
348
|
+
class IntervalQuantizationEncoding(Encoding):
|
|
349
|
+
"""
|
|
350
|
+
Lossy encoding that sorts floating point values into bins.
|
|
351
|
+
Each bin is represented by an integer
|
|
352
|
+
|
|
353
|
+
Parameters
|
|
354
|
+
----------
|
|
355
|
+
min, max : float
|
|
356
|
+
The minimum and maximum value the bins comprise.
|
|
357
|
+
num_steps : int
|
|
358
|
+
The number of bins.
|
|
359
|
+
src_type : dtype or TypeCode, optional
|
|
360
|
+
The data type of the array to be encoded.
|
|
361
|
+
Either a NumPy dtype or a *BinaryCIF* type code is accepted.
|
|
362
|
+
The dtype must be a float type.
|
|
363
|
+
If omitted, the data type is taken from the data the
|
|
364
|
+
first time :meth:`encode()` is called.
|
|
365
|
+
|
|
366
|
+
Attributes
|
|
367
|
+
----------
|
|
368
|
+
min, max : float
|
|
369
|
+
num_steps : int
|
|
370
|
+
src_type : TypeCode
|
|
371
|
+
|
|
372
|
+
Examples
|
|
373
|
+
--------
|
|
374
|
+
|
|
375
|
+
>>> data = np.linspace(11, 12, 6)
|
|
376
|
+
>>> print(data)
|
|
377
|
+
[11.0 11.2 11.4 11.6 11.8 12.0]
|
|
378
|
+
>>> # Use 0.5 as step size
|
|
379
|
+
>>> encoding = IntervalQuantizationEncoding(min=10, max=20, num_steps=21)
|
|
380
|
+
>>> # The encoding is lossy, as different values are mapped to the same bin
|
|
381
|
+
>>> encoded = encoding.encode(data)
|
|
382
|
+
>>> print(encoded)
|
|
383
|
+
[2 3 3 4 4 4]
|
|
384
|
+
>>> decoded = encoding.decode(encoded)
|
|
385
|
+
>>> print(decoded)
|
|
386
|
+
[11.0 11.5 11.5 12.0 12.0 12.0]
|
|
387
|
+
"""
|
|
388
|
+
min: ...
|
|
389
|
+
max: ...
|
|
390
|
+
num_steps: ...
|
|
391
|
+
src_type: ... = None
|
|
392
|
+
|
|
393
|
+
def __post_init__(self):
|
|
394
|
+
if self.src_type is not None:
|
|
395
|
+
self.src_type = TypeCode.from_dtype(self.src_type)
|
|
396
|
+
|
|
397
|
+
def encode(self, data):
|
|
398
|
+
# If not given in constructor, it is determined from the data
|
|
399
|
+
if self.src_type is None:
|
|
400
|
+
self.src_type = TypeCode.from_dtype(data.dtype)
|
|
401
|
+
|
|
402
|
+
steps = np.linspace(
|
|
403
|
+
self.min, self.max, self.num_steps, dtype=data.dtype
|
|
404
|
+
)
|
|
405
|
+
indices = np.searchsorted(steps, data, side="left")
|
|
406
|
+
return _safe_cast(indices, np.int32)
|
|
407
|
+
|
|
408
|
+
def decode(self, data):
    """
    Map bin indices back to (approximate) values of the source type.
    """
    # Keep the original operation order (multiply, then divide) to preserve
    # the exact floating point results
    values = data * (self.max - self.min) / (self.num_steps - 1)
    values = values.astype(self.src_type.to_dtype(), copy=False)
    # Shift from bin space back into the original value range
    values += self.min
    return values
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
@dataclass
class RunLengthEncoding(Encoding):
    """
    Encoding that compresses runs of equal values into pairs of
    (value, run length).

    Parameters
    ----------
    src_size : int, optional
        The size of the array to be encoded.
        If omitted, the size is determined from the data the
        first time :meth:`encode()` is called.
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be an integer type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    src_size : int
    src_type : TypeCode

    Examples
    --------

    >>> data = np.array([1, 1, 1, 5, 3, 3])
    >>> print(data)
    [1 1 1 5 3 3]
    >>> encoded = RunLengthEncoding().encode(data)
    >>> print(encoded)
    [1 3 5 1 3 2]
    >>> # Emphasize the pairs
    >>> print(encoded.reshape(-1, 2))
    [[1 3]
     [5 1]
     [3 2]]
    """
    # Both attributes are lazily initialized on the first `encode()` call,
    # if not given in the constructor
    src_size: ... = None
    src_type: ... = None

    def __post_init__(self):
        # Normalize a user-supplied dtype into a `TypeCode`
        if self.src_type is not None:
            self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        # If not given in constructor, it is determined from the data
        if self.src_type is None:
            self.src_type = TypeCode.from_dtype(data.dtype)
        if self.src_size is None:
            self.src_size = data.shape[0]
        elif self.src_size != data.shape[0]:
            raise IndexError(
                "Given source size does not match actual data size"
            )
        return self._encode(_safe_cast(data, self.src_type.to_dtype()))

    def decode(self, data):
        # The empty array only carries dtype information, so that the
        # statically typed `_decode()` specializes on the output type
        return self._decode(
            data, np.empty(0, dtype=self.src_type.to_dtype())
        )

    def _encode(self, const Integer[:] data):
        # Pessimistic allocation of output array
        # -> Run length is 1 for every element
        cdef int32[:] output = np.zeros(data.shape[0] * 2, dtype=np.int32)
        cdef int i=0, j=0
        # NOTE(review): `data[0]` assumes a non-empty input array
        # -> confirm callers never pass an empty array
        cdef int val = data[0]
        cdef int run_length = 0
        cdef int curr_val
        for i in range(data.shape[0]):
            curr_val = data[i]
            if curr_val == val:
                # Same value as before -> current run continues
                run_length += 1
            else:
                # New element -> Write element with run-length
                output[j] = val
                output[j+1] = run_length
                j += 2
                val = curr_val
                run_length = 1
        # Write last element
        output[j] = val
        output[j+1] = run_length
        j += 2
        # Trim to correct size
        return np.asarray(output)[:j]

    def _decode(self, const Integer[:] data, OutputInteger[:] output_type):
        """
        Expand (value, run length) pairs back into a flat array.

        `output_type` is merely a typed placeholder to allow for static
        typing of output.
        """
        # Data must consist of (value, run length) pairs
        # -> an even number of elements is required
        if data.shape[0] % 2 != 0:
            raise ValueError("Invalid run-length encoded data")

        cdef int length = 0
        cdef int i, j
        cdef int value, repeat

        if self.src_size is None:
            # Determine length of output array by summing run lengths
            for i in range(1, data.shape[0], 2):
                length += data[i]
        else:
            length = self.src_size

        cdef OutputInteger[:] output = np.zeros(
            length, dtype=np.asarray(output_type).dtype
        )
        # Fill output array by expanding each run
        j = 0
        for i in range(0, data.shape[0], 2):
            value = data[i]
            repeat = data[i+1]
            output[j : j+repeat] = value
            j += repeat
        return np.asarray(output)
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
@dataclass
class DeltaEncoding(Encoding):
    """
    Encoding that encodes an array of integers into an array of
    consecutive differences.

    Parameters
    ----------
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be an integer type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.
    origin : int, optional
        The starting value from which the differences are calculated.
        If omitted, the value is taken from the first array element the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    src_type : TypeCode
    origin : int

    Examples
    --------

    >>> data = np.array([1, 1, 2, 3, 5, 8])
    >>> encoding = DeltaEncoding()
    >>> print(encoding.encode(data))
    [0 0 1 1 2 3]
    >>> print(encoding.origin)
    1
    """
    src_type: ... = None
    origin: ... = None

    def __post_init__(self):
        # Accept both NumPy dtypes and BinaryCIF type codes
        if self.src_type is not None:
            self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        """Encode `data` into consecutive differences relative to `origin`."""
        # Lazily fill in parameters that were not given in the constructor
        if self.src_type is None:
            self.src_type = TypeCode.from_dtype(data.dtype)
        if self.origin is None:
            self.origin = data[0]

        # `np.diff` keeps the input dtype, so differences of an unsigned
        # input dtype could underflow for negative deltas
        # -> work in int64 space to avoid this
        shifted = data.astype(np.int64, copy=False)
        shifted = shifted - self.origin
        return _safe_cast(np.diff(shifted, prepend=0), np.int32)

    def decode(self, data):
        """Restore the original values by accumulating the differences."""
        restored = np.cumsum(data, dtype=self.src_type.to_dtype())
        restored += self.origin
        return restored
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
@dataclass
class IntegerPackingEncoding(Encoding):
    """
    Encoding that compresses an array of 32-bit integers into an array
    of smaller sized integers.

    If a value does not fit into the smaller integer type,
    the integer is represented by a sum of consecutive elements
    in the compressed array.

    Parameters
    ----------
    byte_count : int
        The number of bytes the packed integers should occupy.
        Supported values are 1 and 2 for 8-bit and 16-bit integers,
        respectively.
    src_size : int, optional
        The size of the array to be encoded.
        If omitted, the size is determined from the data the
        first time :meth:`encode()` is called.
    is_unsigned : bool, optional
        Whether the values should be packed into signed or unsigned
        integers.
        If omitted, the first time :meth:`encode()` is called determines
        whether the values fit into unsigned integers.

    Attributes
    ----------
    byte_count : int
    src_size : int
    is_unsigned : bool

    Examples
    --------

    >>> data = np.array([1, 2, -3, 128])
    >>> print(data)
    [  1   2  -3 128]
    >>> print(IntegerPackingEncoding(byte_count=1).encode(data))
    [  1   2  -3 127   1]
    """
    byte_count: ...
    # Lazily initialized on the first `encode()` call, if not given
    src_size: ... = None
    is_unsigned: ... = None

    def encode(self, data):
        if self.src_size is None:
            self.src_size = len(data)
        elif self.src_size != len(data):
            raise IndexError(
                "Given source size does not match actual data size"
            )
        if self.is_unsigned is None:
            # Only positive values -> use unsigned integers
            self.is_unsigned = data.min().item() >= 0

        data = _safe_cast(data, np.int32)
        # The empty array only carries dtype information, so that the
        # statically typed `_encode()` specializes on the packed type
        return self._encode(
            data, np.empty(0, dtype=self._determine_packed_dtype())
        )

    def decode(self, const Integer[:] data):
        cdef int i, j
        cdef int min_val, max_val
        cdef int packed_val, unpacked_val
        bounds = self._get_bounds(data)
        min_val = bounds[0]
        max_val = bounds[1]
        # `min_val == 0` means the packed type is unsigned:
        # there 0 is a regular value, not a run-continuation marker
        # -> Set the lower bound to a value that unsigned data never contains
        if min_val == 0:
            min_val = -1

        # `src_size` is the exact number of unpacked elements
        cdef int32[:] output = np.zeros(self.src_size, dtype=np.int32)
        j = 0
        unpacked_val = 0
        for i in range(data.shape[0]):
            packed_val = data[i]
            if packed_val == max_val or packed_val == min_val:
                # Boundary value -> the original integer continues
                # in the next packed element
                unpacked_val += packed_val
            else:
                # Non-boundary value terminates the current integer
                unpacked_val += packed_val
                output[j] = unpacked_val
                unpacked_val = 0
                j += 1
        # Output was allocated with the exact source size -> return directly
        return np.asarray(output)

    def _determine_packed_dtype(self):
        # Select the NumPy dtype matching `byte_count` and signedness
        if self.byte_count == 1:
            if self.is_unsigned:
                return np.uint8
            else:
                return np.int8
        elif self.byte_count == 2:
            if self.is_unsigned:
                return np.uint16
            else:
                return np.int16
        else:
            raise ValueError("Unsupported byte count")

    @cython.cdivision(True)
    def _encode(self, const Integer[:] data, OutputInteger[:] output_type):
        """
        Pack the given integers into the smaller `output_type`.

        `output_type` is merely a typed placeholder to allow for static
        typing of output.
        """
        cdef int i=0, j=0

        packed_type = np.asarray(output_type).dtype
        cdef int min_val = np.iinfo(packed_type).min
        cdef int max_val = np.iinfo(packed_type).max

        # Get length of output array
        # by summing up required length of each element
        cdef int number
        cdef long length = 0
        for i in range(data.shape[0]):
            number = data[i]
            if number < 0:
                # `min_val == 0` indicates an unsigned packed type
                if min_val == 0:
                    raise ValueError(
                        "Cannot pack negative numbers into unsigned type"
                    )
                # The required packed length for an element is the
                # number of times min_val/max_val need to be repeated
                length += number // min_val + 1
            elif number > 0:
                length += number // max_val + 1
            else:
                # number = 0
                length += 1

        # Fill output
        cdef OutputInteger[:] output = np.zeros(length, dtype=packed_type)
        cdef int remainder
        j = 0
        for i in range(data.shape[0]):
            remainder = data[i]
            if remainder < 0:
                if min_val == 0:
                    raise ValueError(
                        "Cannot pack negative numbers into unsigned type"
                    )
                # Emit the boundary value until the rest fits into the
                # packed type
                while remainder <= min_val:
                    remainder -= min_val
                    output[j] = min_val
                    j += 1
            elif remainder > 0:
                while remainder >= max_val:
                    remainder -= max_val
                    output[j] = max_val
                    j += 1
            # The terminating non-boundary value
            output[j] = remainder
            j += 1
        return np.asarray(output)

    @staticmethod
    def _get_bounds(const Integer[:] data):
        # Return (min, max) of the integer type the memoryview
        # specialized on
        if Integer is int8:
            info = np.iinfo(np.int8)
        elif Integer is int16:
            info = np.iinfo(np.int16)
        elif Integer is int32:
            info = np.iinfo(np.int32)
        elif Integer is uint8:
            info = np.iinfo(np.uint8)
        elif Integer is uint16:
            info = np.iinfo(np.uint16)
        elif Integer is uint32:
            info = np.iinfo(np.uint32)
        else:
            raise ValueError("Unsupported integer type")
        return info.min, info.max
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
@dataclass
class StringArrayEncoding(Encoding):
    """
    Encoding that compresses an array of strings into an array of
    indices that point to the unique strings in that array.

    The unique strings themselves are stored as part of the
    :class:`StringArrayEncoding` as concatenated string.
    The start index of each unique string in the concatenated string
    is stored in an *offset* array.

    Parameters
    ----------
    strings : ndarray, optional
        The unique strings that are used for encoding.
        If omitted, the unique strings are determined from the data the
        first time :meth:`encode()` is called.
    data_encoding : list of Encoding, optional
        The encodings that are applied to the index array.
        If omitted, the array is directly encoded into bytes without
        further compression.
    offset_encoding : list of Encoding, optional
        The encodings that are applied to the offset array.
        If omitted, the array is directly encoded into bytes without
        further compression.

    Attributes
    ----------
    strings : ndarray
    data_encoding : list of Encoding
    offset_encoding : list of Encoding

    Examples
    --------

    >>> data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
    >>> print(data)
    ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
    >>> # By default the indices would directly be encoded into bytes
    >>> # However, the indices should be printed here -> data_encoding=[]
    >>> encoding = StringArrayEncoding(data_encoding=[])
    >>> encoded = encoding.encode(data)
    >>> print(encoding.strings)
    ['apple' 'banana' 'cherry']
    >>> print(encoded)
    [0 1 2 0 1 0]
    """

    strings: ... = None
    data_encoding: ... = None
    offset_encoding: ... = None

    # Explicit `__init__` instead of the dataclass-generated one,
    # as the mutable default encodings must be created per instance
    def __init__(self, strings=None, data_encoding=None, offset_encoding=None):
        self.strings = strings
        if data_encoding is None:
            data_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.data_encoding = data_encoding
        if offset_encoding is None:
            offset_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.offset_encoding = offset_encoding

    @staticmethod
    def deserialize(content):
        # Recreate the nested encodings for the index and offset arrays
        data_encoding = [
            deserialize_encoding(e) for e in content["dataEncoding"]
        ]
        offset_encoding = [
            deserialize_encoding(e) for e in content["offsetEncoding"]
        ]
        cdef str concatenated_strings = content["stringData"]
        cdef np.ndarray offsets = decode_stepwise(
            content["offsets"], offset_encoding
        )

        # Split the concatenated string back into the unique strings
        strings = np.array([
            concatenated_strings[offsets[i]:offsets[i+1]]
            # The final offset is the exclusive stop index
            for i in range(len(offsets)-1)
        ], dtype="U")

        return StringArrayEncoding(strings, data_encoding, offset_encoding)

    def serialize(self):
        if self.strings is None:
            raise ValueError(
                "'strings' must be explicitly given or needs to be "
                "determined from first encoding pass, before it is serialized"
            )

        string_data = "".join(self.strings)
        # Cumulative string lengths yield the start index of each string
        offsets = np.cumsum([0] + [len(s) for s in self.strings])

        return {
            "kind": "StringArray",
            "dataEncoding": [e.serialize() for e in self.data_encoding],
            "stringData": string_data,
            "offsets": encode_stepwise(offsets, self.offset_encoding),
            "offsetEncoding": [e.serialize() for e in self.offset_encoding],
        }

    def encode(self, data):
        if not np.issubdtype(data.dtype, np.str_):
            raise TypeError("Data must be of string type")

        if self.strings is None:
            # 'unique()' already sorts the strings, but this is not necessarily
            # desired, as this makes efficient encoding of the indices more difficult
            # -> Bring into the original order
            _, unique_indices = np.unique(data, return_index=True)
            self.strings = data[np.sort(unique_indices)]
            check_present = False
        else:
            check_present = True

        if len(self.strings) > 0:
            # Map each element to its index in `strings` via binary search
            # on a sorted copy
            string_order = _safe_cast(np.argsort(self.strings), np.int32)
            sorted_strings = self.strings[string_order]
            sorted_indices = np.searchsorted(sorted_strings, data)
            indices = string_order[sorted_indices]
            # `"" not in self.strings` can be quite costly and is only necessary,
            # if the `strings` were given by the user, as otherwise we always
            # include an empty string explicitly when we compute them in this function
            # -> Only run if `check_present` is True
            if check_present and "" not in self.strings:
                # Represent empty strings as -1
                indices[data == ""] = -1
        else:
            # There are no strings -> The indices can only ever be -1 to indicate
            # missing values
            # The check if this is correct is done below
            indices = np.full(data.shape[0], -1, dtype=np.int32)

        # Verify that every non-missing element was actually found in `strings`
        valid_indices_mask = indices != -1
        if check_present and not np.all(
            self.strings[indices[valid_indices_mask]] == data[valid_indices_mask]
        ):
            raise ValueError("Data contains strings not present in 'strings'")
        return encode_stepwise(indices, self.data_encoding)

    def decode(self, data):
        indices = decode_stepwise(data, self.data_encoding)
        # Initialize with empty strings
        strings = np.zeros(indices.shape[0], dtype=self.strings.dtype)
        # `-1` indices indicate missing values
        valid_indices_mask = indices != -1
        strings[valid_indices_mask] = self.strings[indices[valid_indices_mask]]
        return strings

    def __eq__(self, other):
        # Custom equality, as the dataclass-generated one cannot compare
        # the `strings` ndarray element-wise
        if not isinstance(other, type(self)):
            return False
        if not np.array_equal(self.strings, other.strings):
            return False
        if self.data_encoding != other.data_encoding:
            return False
        if self.offset_encoding != other.offset_encoding:
            return False
        return True
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
# Maps the "kind" field of a serialized *BinaryCIF* encoding to the
# corresponding `Encoding` subclass (used by `deserialize_encoding()`)
_encoding_classes = {
    "ByteArray": ByteArrayEncoding,
    "FixedPoint": FixedPointEncoding,
    "IntervalQuantization": IntervalQuantizationEncoding,
    "RunLength": RunLengthEncoding,
    "Delta": DeltaEncoding,
    "IntegerPacking": IntegerPackingEncoding,
    "StringArray": StringArrayEncoding,
}
|
|
945
|
+
# Inverse mapping: `Encoding` subclass name -> "kind" string used in
# serialized *BinaryCIF* encodings
_encoding_classes_kinds = {
    "ByteArrayEncoding": "ByteArray",
    "FixedPointEncoding": "FixedPoint",
    "IntervalQuantizationEncoding": "IntervalQuantization",
    "RunLengthEncoding": "RunLength",
    "DeltaEncoding": "Delta",
    "IntegerPackingEncoding": "IntegerPacking",
    "StringArrayEncoding": "StringArray",
}
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
def deserialize_encoding(content):
    """
    Create an :class:`Encoding` by deserializing the given *BinaryCIF* content.

    Parameters
    ----------
    content : dict
        The encoding represented as *BinaryCIF* dictionary.

    Returns
    -------
    encoding : Encoding
        The deserialized encoding.

    Raises
    ------
    ValueError
        If the "kind" field does not name a known encoding.
    """
    kind = content["kind"]
    if kind not in _encoding_classes:
        raise ValueError(f"Unknown encoding kind '{kind}'")
    return _encoding_classes[kind].deserialize(content)
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
def create_uncompressed_encoding(array):
    """
    Create a simple encoding for the given array that does not compress the data.

    Parameters
    ----------
    array : ndarray
        The array to create the encoding for.

    Returns
    -------
    encoding : list of Encoding
        The encoding for the data.
    """
    # String arrays require the index-based string encoding;
    # every other dtype can be serialized directly into bytes
    is_string_array = np.issubdtype(array.dtype, np.str_)
    return [StringArrayEncoding() if is_string_array else ByteArrayEncoding()]
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
def encode_stepwise(data, encoding):
    """
    Apply a list of encodings stepwise to the given data.

    Parameters
    ----------
    data : ndarray
        The data to be encoded.
    encoding : list of Encoding
        The encodings to be applied, in the given order.

    Returns
    -------
    encoded_data : ndarray or bytes
        The encoded data.
    """
    # Use a distinct loop variable instead of shadowing the `encoding`
    # parameter (consistent with `decode_stepwise()`)
    for enc in encoding:
        data = enc.encode(data)
    return data
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
def decode_stepwise(data, encoding):
    """
    Apply a list of encodings stepwise to the given data.

    The decoding steps are applied in reverse order, as the last applied
    encoding must be undone first.

    Parameters
    ----------
    data : ndarray or bytes
        The data to be decoded.
    encoding : list of Encoding
        The encodings to be applied.

    Returns
    -------
    decoded_data : ndarray
        The decoded data.
    """
    for encoding_step in reversed(encoding):
        data = encoding_step.decode(data)
        # `ByteArrayEncoding` may create the array directly from the raw
        # buffer, which yields a read-only array
        if not data.flags.writeable:
            # Copy the underlying buffer to obtain a writable array
            data = data.copy()
    return data
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def _camel_to_snake_case(attribute_name):
    # Convert a camelCase name (as used in the BinaryCIF format) into
    # snake_case (as used for the dataclass attributes).
    # `CAMEL_CASE_PATTERN` is defined elsewhere in this file; presumably it
    # matches each upper case letter, so an underscore is inserted before it
    # — confirm against its definition.
    return CAMEL_CASE_PATTERN.sub("_", attribute_name).lower()
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def _snake_to_camel_case(attribute_name):
    # Convert a snake_case attribute name into the camelCase form
    # used in the BinaryCIF format
    words = attribute_name.split("_")
    camel = "".join(word.capitalize() for word in words)
    # The very first letter stays lower case
    return camel[0].lower() + camel[1:]
|
|
1055
|
+
|
|
1056
|
+
|
|
1057
|
+
def _safe_cast(array, dtype, allow_decimal_loss=False):
    # Cast `array` to `dtype`, raising a `ValueError` instead of silently
    # losing information:
    # - float -> int casts require `allow_decimal_loss=True`
    # - non-finite values and out-of-range integers always raise
    source_dtype = array.dtype
    target_dtype = np.dtype(dtype)

    # Nothing to do if the dtype already matches
    if target_dtype == source_dtype:
        return array

    if np.issubdtype(target_dtype, np.integer):
        if np.issubdtype(source_dtype, np.floating):
            if not allow_decimal_loss:
                raise ValueError("Cannot cast floating point to integer")
            if not np.isfinite(array).all():
                raise ValueError("Data contains non-finite values")
        elif not np.issubdtype(source_dtype, np.integer):
            # Neither float, nor integer -> cannot cast
            raise ValueError(f"Cannot cast '{source_dtype}' to integer")
        # Guard against integer overflow/underflow in the target type
        limits = np.iinfo(target_dtype)
        if np.max(array) > limits.max or np.min(array) < limits.min:
            raise ValueError("Values do not fit into the given dtype")

    return array.astype(target_dtype)
|