numcodecs 0.13.1__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numcodecs might be problematic. Click here for more details.

Files changed (74) hide show
  1. numcodecs/__init__.py +143 -0
  2. numcodecs/_shuffle.cpython-312-darwin.so +0 -0
  3. numcodecs/abc.py +126 -0
  4. numcodecs/astype.py +76 -0
  5. numcodecs/base64.py +27 -0
  6. numcodecs/bitround.py +79 -0
  7. numcodecs/blosc.cpython-312-darwin.so +0 -0
  8. numcodecs/bz2.py +45 -0
  9. numcodecs/categorize.py +101 -0
  10. numcodecs/checksum32.py +94 -0
  11. numcodecs/compat.py +208 -0
  12. numcodecs/compat_ext.cpython-312-darwin.so +0 -0
  13. numcodecs/delta.py +97 -0
  14. numcodecs/fixedscaleoffset.py +132 -0
  15. numcodecs/fletcher32.cpython-312-darwin.so +0 -0
  16. numcodecs/gzip.py +52 -0
  17. numcodecs/jenkins.cpython-312-darwin.so +0 -0
  18. numcodecs/json.py +107 -0
  19. numcodecs/lz4.cpython-312-darwin.so +0 -0
  20. numcodecs/lzma.py +69 -0
  21. numcodecs/msgpacks.py +86 -0
  22. numcodecs/ndarray_like.py +65 -0
  23. numcodecs/packbits.py +85 -0
  24. numcodecs/pcodec.py +89 -0
  25. numcodecs/pickles.py +55 -0
  26. numcodecs/quantize.py +100 -0
  27. numcodecs/registry.py +72 -0
  28. numcodecs/shuffle.py +61 -0
  29. numcodecs/tests/__init__.py +3 -0
  30. numcodecs/tests/common.py +354 -0
  31. numcodecs/tests/package_with_entrypoint/__init__.py +11 -0
  32. numcodecs/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt +2 -0
  33. numcodecs/tests/test_astype.py +74 -0
  34. numcodecs/tests/test_base64.py +81 -0
  35. numcodecs/tests/test_bitround.py +81 -0
  36. numcodecs/tests/test_blosc.py +277 -0
  37. numcodecs/tests/test_bz2.py +66 -0
  38. numcodecs/tests/test_categorize.py +87 -0
  39. numcodecs/tests/test_checksum32.py +58 -0
  40. numcodecs/tests/test_compat.py +108 -0
  41. numcodecs/tests/test_delta.py +60 -0
  42. numcodecs/tests/test_entrypoints.py +24 -0
  43. numcodecs/tests/test_entrypoints_backport.py +35 -0
  44. numcodecs/tests/test_fixedscaleoffset.py +69 -0
  45. numcodecs/tests/test_fletcher32.py +56 -0
  46. numcodecs/tests/test_gzip.py +110 -0
  47. numcodecs/tests/test_jenkins.py +150 -0
  48. numcodecs/tests/test_json.py +85 -0
  49. numcodecs/tests/test_lz4.py +83 -0
  50. numcodecs/tests/test_lzma.py +90 -0
  51. numcodecs/tests/test_msgpacks.py +123 -0
  52. numcodecs/tests/test_ndarray_like.py +48 -0
  53. numcodecs/tests/test_packbits.py +39 -0
  54. numcodecs/tests/test_pcodec.py +80 -0
  55. numcodecs/tests/test_pickles.py +61 -0
  56. numcodecs/tests/test_quantize.py +76 -0
  57. numcodecs/tests/test_registry.py +40 -0
  58. numcodecs/tests/test_shuffle.py +168 -0
  59. numcodecs/tests/test_vlen_array.py +97 -0
  60. numcodecs/tests/test_vlen_bytes.py +93 -0
  61. numcodecs/tests/test_vlen_utf8.py +91 -0
  62. numcodecs/tests/test_zfpy.py +98 -0
  63. numcodecs/tests/test_zlib.py +94 -0
  64. numcodecs/tests/test_zstd.py +92 -0
  65. numcodecs/version.py +16 -0
  66. numcodecs/vlen.cpython-312-darwin.so +0 -0
  67. numcodecs/zfpy.py +111 -0
  68. numcodecs/zlib.py +42 -0
  69. numcodecs/zstd.cpython-312-darwin.so +0 -0
  70. numcodecs-0.13.1.dist-info/LICENSE.txt +21 -0
  71. numcodecs-0.13.1.dist-info/METADATA +64 -0
  72. numcodecs-0.13.1.dist-info/RECORD +74 -0
  73. numcodecs-0.13.1.dist-info/WHEEL +5 -0
  74. numcodecs-0.13.1.dist-info/top_level.txt +1 -0
numcodecs/__init__.py ADDED
@@ -0,0 +1,143 @@
1
# ruff: noqa: E402,F401
"""Numcodecs is a Python package providing buffer compression and
transformation codecs for use in data storage and communication
applications. These include:

* Compression codecs, e.g., Zlib, BZ2, LZMA, ZFPY and Blosc.
* Pre-compression filters, e.g., Delta, Quantize, FixedScaleOffset,
  PackBits, Categorize.
* Integrity checks, e.g., CRC32, Adler32.

All codecs implement the same API, allowing codecs to be organized into
pipelines in a variety of ways.

If you have a question, find a bug, would like to make a suggestion or
contribute code, please `raise an issue on GitHub
<https://github.com/zarr-developers/numcodecs/issues>`_.

"""

import atexit
import multiprocessing
from contextlib import suppress

from numcodecs.registry import get_codec, register_codec
from numcodecs.version import version as __version__
from numcodecs.zlib import Zlib

register_codec(Zlib)

from numcodecs.gzip import GZip

register_codec(GZip)

from numcodecs.bz2 import BZ2

register_codec(BZ2)

# Every section wrapped in `suppress(ImportError)` is skipped as a whole when
# its import fails, leaving that codec unregistered instead of breaking
# `import numcodecs`.
with suppress(ImportError):
    from numcodecs.lzma import LZMA

    register_codec(LZMA)

with suppress(ImportError):
    from numcodecs import blosc
    from numcodecs.blosc import Blosc

    register_codec(Blosc)
    # initialize blosc
    try:
        ncores = multiprocessing.cpu_count()
    except (NotImplementedError, OSError):  # pragma: no cover
        # `multiprocessing.cpu_count()` signals an undeterminable CPU count
        # with NotImplementedError; catching only OSError would let that
        # escape `suppress(ImportError)` and abort the package import.
        ncores = 1
    blosc.init()
    # cap the blosc thread pool at 8 threads
    blosc.set_nthreads(min(8, ncores))
    # release blosc resources at interpreter shutdown
    atexit.register(blosc.destroy)

with suppress(ImportError):
    from numcodecs import zstd
    from numcodecs.zstd import Zstd

    register_codec(Zstd)

with suppress(ImportError):
    from numcodecs import lz4
    from numcodecs.lz4 import LZ4

    register_codec(LZ4)

with suppress(ImportError):
    from numcodecs.zfpy import ZFPY

    register_codec(ZFPY)

from numcodecs.astype import AsType

register_codec(AsType)

from numcodecs.delta import Delta

register_codec(Delta)

from numcodecs.quantize import Quantize

register_codec(Quantize)

from numcodecs.fixedscaleoffset import FixedScaleOffset

register_codec(FixedScaleOffset)

from numcodecs.packbits import PackBits

register_codec(PackBits)

from numcodecs.categorize import Categorize

register_codec(Categorize)

from numcodecs.pickles import Pickle

register_codec(Pickle)

from numcodecs.base64 import Base64

register_codec(Base64)

from numcodecs.shuffle import Shuffle

register_codec(Shuffle)

from numcodecs.bitround import BitRound

register_codec(BitRound)

with suppress(ImportError):
    from numcodecs.msgpacks import MsgPack

    register_codec(MsgPack)

from numcodecs.checksum32 import CRC32, Adler32, JenkinsLookup3

register_codec(CRC32)
register_codec(Adler32)
register_codec(JenkinsLookup3)

from numcodecs.json import JSON

register_codec(JSON)

with suppress(ImportError):
    from numcodecs import vlen
    from numcodecs.vlen import VLenArray, VLenBytes, VLenUTF8

    register_codec(VLenUTF8)
    register_codec(VLenBytes)
    register_codec(VLenArray)

from numcodecs.fletcher32 import Fletcher32

register_codec(Fletcher32)

from numcodecs.pcodec import PCodec

register_codec(PCodec)
numcodecs/_shuffle.cpython-312-darwin.so ADDED
Binary file
numcodecs/abc.py ADDED
@@ -0,0 +1,126 @@
1
+ """This module defines the :class:`Codec` base class, a common interface for
2
+ all codec classes.
3
+
4
+ Codec classes must implement :func:`Codec.encode` and :func:`Codec.decode`
5
+ methods. Inputs to and outputs from these methods may be any Python object
6
+ exporting a contiguous buffer via the new-style Python buffer protocol.
7
+
8
+ Codec classes must implement a :func:`Codec.get_config` method,
9
+ which must return a dictionary holding all configuration parameters
10
+ required to enable encoding and decoding of data. The expectation is that
11
+ these configuration parameters will be stored or communicated separately
12
+ from encoded data, and thus the codecs do not need to store all encoding
13
+ parameters within the encoded data. For broad compatibility,
14
+ the configuration object must contain only JSON-serializable values. The
15
+ configuration object must also contain an 'id' field storing the codec
16
+ identifier (see below).
17
+
18
+ Codec classes must implement a :func:`Codec.from_config` class method,
19
+ which will return an instance of the class initialized from a configuration
20
+ object.
21
+
22
+ Finally, codec classes must set a `codec_id` class-level attribute. This
23
+ must be a string. Two different codec classes may set the same value for the
24
+ `codec_id` attribute if and only if they are fully compatible, meaning that
25
+ (1) configuration parameters are the same, and (2) given the same
26
+ configuration, one class could correctly decode data encoded by the
27
+ other and vice versa.
28
+
29
+ """
30
+
31
+ from abc import ABC, abstractmethod
32
+
33
+
34
class Codec(ABC):
    """Abstract base class shared by all codecs."""

    # Sub-classes replace this with their own identifier string.
    codec_id = None
    """Codec identifier."""

    @abstractmethod
    def encode(self, buf):  # pragma: no cover
        """Encode the data held in `buf`.

        Parameters
        ----------
        buf : buffer-like
            Data to be encoded. May be any object supporting the new-style
            buffer protocol.

        Returns
        -------
        enc : buffer-like
            Encoded data. May be any object supporting the new-style buffer
            protocol.
        """

    @abstractmethod
    def decode(self, buf, out=None):  # pragma: no cover
        """Decode the data held in `buf`.

        Parameters
        ----------
        buf : buffer-like
            Encoded data. May be any object supporting the new-style buffer
            protocol.
        out : buffer-like, optional
            Writeable buffer to store decoded data. N.B. if provided, this
            buffer must be exactly the right size to store the decoded data.

        Returns
        -------
        dec : buffer-like
            Decoded data. May be any object supporting the new-style
            buffer protocol.
        """

    def get_config(self):
        """Return the configuration of this codec as a dictionary.

        The dictionary carries an ``'id'`` entry holding `codec_id` plus one
        entry per public instance attribute. All values must be compatible
        with JSON encoding; sub-classes whose attributes are not should
        override this method.
        """
        # By default every instance attribute without a leading underscore
        # is treated as a configuration parameter.
        public = {
            name: getattr(self, name)
            for name in self.__dict__
            if not name.startswith('_')
        }
        return {'id': self.codec_id, **public}

    @classmethod
    def from_config(cls, config):
        """Build a codec instance from a configuration dictionary.

        The ``'id'`` entry is assumed to have been removed from `config`
        already. The remaining entries are handed to the constructor as
        keyword arguments; override in a sub-class if the values need
        decoding first.
        """
        return cls(**config)

    def __eq__(self, other):
        # Two codecs are equal when their configurations are equal. Objects
        # lacking `get_config` are never equal to a codec. Override in a
        # sub-class if a different comparison is needed.
        try:
            same = self.get_config() == other.get_config()
        except AttributeError:
            return False
        return same

    def __repr__(self):
        # Assumes every public instance attribute is also a valid keyword
        # argument of the constructor; override in a sub-class otherwise.
        public_names = [name for name in sorted(self.__dict__) if not name.startswith('_')]
        arguments = ', '.join(f'{name}={getattr(self, name)!r}' for name in public_names)
        return f'{type(self).__name__}({arguments})'
numcodecs/astype.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray, ndarray_copy
5
+
6
+
7
class AsType(Codec):
    """Filter to convert data between different types.

    Parameters
    ----------
    encode_dtype : dtype
        Data type to use for encoded data.
    decode_dtype : dtype
        Data type to use for decoded data.

    Notes
    -----
    If `encode_dtype` is of lower precision than `decode_dtype`, please be
    aware that data loss can occur by writing data to disk using this filter.
    No checks are made to ensure the casting will work in that direction, so
    data corruption may occur.

    Examples
    --------
    >>> import numcodecs
    >>> import numpy as np
    >>> x = np.arange(100, 120, 2, dtype=np.int8)
    >>> x
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
    >>> f = numcodecs.AsType(encode_dtype=x.dtype, decode_dtype=np.int16)
    >>> y = f.decode(x)
    >>> y
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int16)
    >>> z = f.encode(y)
    >>> z
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)

    """

    codec_id = 'astype'

    def __init__(self, encode_dtype, decode_dtype):
        # Normalise both arguments to numpy dtype objects.
        self.encode_dtype = np.dtype(encode_dtype)
        self.decode_dtype = np.dtype(decode_dtype)

    def encode(self, buf):
        """Interpret `buf` as `decode_dtype` and cast it to `encode_dtype`."""
        decoded_view = ensure_ndarray(buf).view(self.decode_dtype)
        # `astype` produces a converted copy
        return decoded_view.astype(self.encode_dtype)

    def decode(self, buf, out=None):
        """Interpret `buf` as `encode_dtype` and cast it to `decode_dtype`.

        The converted array is passed through `ndarray_copy` together with
        `out`, and the result of that call is returned.
        """
        encoded_view = ensure_ndarray(buf).view(self.encode_dtype)
        converted = encoded_view.astype(self.decode_dtype)
        return ndarray_copy(converted, out)

    def get_config(self):
        """Return the configuration, with both dtypes given as dtype strings."""
        return dict(
            id=self.codec_id,
            encode_dtype=self.encode_dtype.str,
            decode_dtype=self.decode_dtype.str,
        )

    def __repr__(self):
        cls_name = type(self).__name__
        enc = self.encode_dtype.str
        dec = self.decode_dtype.str
        return f'{cls_name}(encode_dtype={enc!r}, decode_dtype={dec!r})'
numcodecs/base64.py ADDED
@@ -0,0 +1,27 @@
1
+ import base64 as _base64
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_contiguous_ndarray, ndarray_copy
5
+
6
+
7
class Base64(Codec):
    """Codec providing base64 encoding via the Python standard library."""

    codec_id = "base64"

    def encode(self, buf):
        """Return the standard base64 encoding of `buf`."""
        contiguous = ensure_contiguous_ndarray(buf)
        return _base64.standard_b64encode(contiguous)

    def decode(self, buf, out=None):
        """Decode standard base64 data in `buf`.

        The decoded bytes are passed through `ndarray_copy` together with
        `out` (normalised first when given), and the result of that call is
        returned.
        """
        source = ensure_contiguous_ndarray(buf)
        target = None if out is None else ensure_contiguous_ndarray(out)
        decoded = _base64.standard_b64decode(source)
        return ndarray_copy(decoded, target)
numcodecs/bitround.py ADDED
@@ -0,0 +1,79 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray_like, ndarray_copy
5
+
6
# The size in bits of the mantissa/significand for the various floating types
# You cannot keep more bits of data than you have available
# https://en.wikipedia.org/wiki/IEEE_754
max_bits = {
    "float16": 10,
    "float32": 23,
    "float64": 52,
}


class BitRound(Codec):
    """Floating-point bit rounding codec

    Drops a specified number of bits from the floating point mantissa,
    leaving an array more amenable to compression. The number of bits to keep should
    be determined by an information analysis of the data to be compressed.
    The approach is based on the paper by Klöwer et al. 2021
    (https://www.nature.com/articles/s43588-021-00156-2). See
    https://github.com/zarr-developers/numcodecs/issues/298 for discussion
    and the original implementation in Julia referred to at
    https://github.com/milankl/BitInformation.jl

    Parameters
    ----------

    keepbits: int
        The number of bits of the mantissa to keep. The range allowed
        depends on the dtype input data. If keepbits is
        equal to the maximum allowed for the data type, this is equivalent
        to no transform.

    Raises
    ------
    ValueError
        If `keepbits` is negative.
    """

    codec_id = 'bitround'

    def __init__(self, keepbits: int):
        if keepbits < 0:
            raise ValueError("keepbits must be zero or positive")
        self.keepbits = keepbits

    def encode(self, buf):
        """Create int array by rounding floating-point data

        The itemsize will be preserved, but the output should be much more
        compressible. The rounding is applied to a copy, so the data passed
        in is left untouched. When `keepbits` equals the mantissa width of
        the input dtype no rounding is needed and the input array is returned
        as is.

        Raises
        ------
        TypeError
            If the input is not a 16, 32 or 64 bit floating-point array.
        ValueError
            If `keepbits` exceeds the mantissa width of the input dtype.
        """
        a = ensure_ndarray_like(buf)
        if a.dtype.kind != "f" or a.dtype.itemsize > 8:
            raise TypeError("Only float arrays (16-64bit) can be bit-rounded")
        # Look the width up by `dtype.name`: unlike `str(dtype)` it carries no
        # byte-order prefix, so non-native-endian input (e.g. '>f4') resolves
        # to 'float32' instead of raising KeyError.
        bits = max_bits[a.dtype.name]
        if self.keepbits == bits:
            return a
        if self.keepbits > bits:
            raise ValueError("Keepbits too large for given dtype")
        # cast float to int type of same width (preserve endianness)
        a_int_dtype = np.dtype(a.dtype.str.replace("f", "i"))
        all_set = np.array(-1, dtype=a_int_dtype)
        # Work on a copy of the integer view: the in-place operations below
        # would otherwise overwrite the caller's array (and fail outright on
        # read-only input).
        b = a.view(a_int_dtype).copy()
        maskbits = bits - self.keepbits
        # mask with the `maskbits` lowest bits cleared
        mask = (all_set >> maskbits) << maskbits
        half_quantum1 = (1 << (maskbits - 1)) - 1
        # add just under half a quantum plus the lowest kept bit, then
        # truncate: rounds to nearest with ties going to even
        b += ((b >> maskbits) & 1) + half_quantum1
        b &= mask
        return b

    def decode(self, buf, out=None):
        """Remake floats from ints

        As with ``encode``, preserves itemsize.
        """
        buf = ensure_ndarray_like(buf)
        # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided)
        dt = np.dtype(buf.dtype.str.replace("i", "f"))
        data = buf.view(dt)
        return ndarray_copy(data, out)
numcodecs/blosc.cpython-312-darwin.so ADDED
Binary file
numcodecs/bz2.py ADDED
@@ -0,0 +1,45 @@
1
+ import bz2 as _bz2
2
+
3
+ from numcodecs.abc import Codec
4
+ from numcodecs.compat import ensure_contiguous_ndarray, ndarray_copy
5
+
6
+
7
class BZ2(Codec):
    """Codec providing compression using bzip2 via the Python standard library.

    Parameters
    ----------
    level : int
        Compression level.

    """

    codec_id = 'bz2'

    def __init__(self, level=1):
        self.level = level

    def encode(self, buf):
        """Compress `buf` with bzip2 at the configured level."""
        contiguous = ensure_contiguous_ndarray(buf)
        return _bz2.compress(contiguous, self.level)

    # noinspection PyMethodMayBeStatic
    def decode(self, buf, out=None):
        """Decompress bzip2 data in `buf`.

        The decompressed bytes are passed through `ndarray_copy` together
        with `out` (normalised first when given), and the result of that call
        is returned.
        """
        source = ensure_contiguous_ndarray(buf)
        target = ensure_contiguous_ndarray(out) if out is not None else None

        # N.B., bz2 cannot handle ndarray directly because of truth testing
        # issues, so hand it a memoryview instead
        payload = _bz2.decompress(memoryview(source))

        # The standard library bz2 module cannot decompress straight into a
        # buffer, hence the copy into `out` when one is given.
        return ndarray_copy(payload, target)
numcodecs/categorize.py ADDED
@@ -0,0 +1,101 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray, ensure_text, ndarray_copy
5
+
6
+
7
class Categorize(Codec):
    """Filter encoding categorical string data as integers.

    Parameters
    ----------
    labels : sequence of strings
        Category labels.
    dtype : dtype
        Data type to use for decoded data.
    astype : dtype, optional
        Data type to use for encoded data.

    Examples
    --------
    >>> import numcodecs
    >>> import numpy as np
    >>> x = np.array(['male', 'female', 'female', 'male', 'unexpected'], dtype=object)
    >>> x
    array(['male', 'female', 'female', 'male', 'unexpected'],
          dtype=object)
    >>> codec = numcodecs.Categorize(labels=['female', 'male'], dtype=object)
    >>> y = codec.encode(x)
    >>> y
    array([2, 1, 1, 2, 0], dtype=uint8)
    >>> z = codec.decode(y)
    >>> z
    array(['male', 'female', 'female', 'male', ''],
          dtype=object)

    """

    codec_id = 'categorize'

    def __init__(self, labels, dtype, astype='u1'):
        self.dtype = np.dtype(dtype)
        if self.dtype.kind not in 'UO':
            raise TypeError("only unicode ('U') and object ('O') dtypes are supported")
        self.labels = [ensure_text(label) for label in labels]
        self.astype = np.dtype(astype)
        if self.astype == np.dtype(object):
            raise TypeError('encoding as object array not supported')

    def encode(self, buf):
        """Map each value in `buf` to the 1-based position of its label.

        Values that match none of the labels are encoded as 0. The result is
        a flat array of dtype `astype`.
        """
        # normalise input
        if self.dtype == np.dtype(object):
            values = np.asarray(buf, dtype=object)
        else:
            values = ensure_ndarray(buf).view(self.dtype)

        # work on a flat array to keep the implementation simple
        values = values.reshape(-1, order='A')

        # zero is reserved for values not present in the labels
        codes = np.zeros_like(values, dtype=self.astype)
        for code, label in enumerate(self.labels, start=1):
            codes[values == label] = code

        return codes

    def decode(self, buf, out=None):
        """Map integer codes in `buf` back to their labels.

        Codes without a corresponding label decode to the empty string. The
        decoded array is passed through `ndarray_copy` together with `out`,
        and the result of that call is returned.
        """
        # normalise input and flatten
        codes = ensure_ndarray(buf).view(self.astype)
        codes = codes.reshape(-1, order='A')

        # start from an all-empty-string array, then fill in known labels
        values = np.full_like(codes, fill_value='', dtype=self.dtype)
        for code, label in enumerate(self.labels, start=1):
            values[codes == code] = label

        return ndarray_copy(values, out)

    def get_config(self):
        """Return the configuration, with both dtypes given as dtype strings."""
        return {
            'id': self.codec_id,
            'labels': self.labels,
            'dtype': self.dtype.str,
            'astype': self.astype.str,
        }

    def __repr__(self):
        # show at most the first three labels to keep the repr short
        shown = repr(self.labels[:3])
        if len(self.labels) > 3:
            shown = shown[:-1] + ', ...]'
        return (
            f'{type(self).__name__}(dtype={self.dtype.str!r}, '
            f'astype={self.astype.str!r}, labels={shown})'
        )
numcodecs/checksum32.py ADDED
@@ -0,0 +1,94 @@
1
+ import struct
2
+ import zlib
3
+
4
+ import numpy as np
5
+
6
+ from .abc import Codec
7
+ from .compat import ensure_contiguous_ndarray, ndarray_copy
8
+ from .jenkins import jenkins_lookup3
9
+
10
+
11
class Checksum32(Codec):
    """Base class for codecs that prepend a 32-bit checksum to the data.

    The encoded form is the checksum as four little-endian bytes followed by
    the unchanged data bytes. Sub-classes supply the checksum function via
    the `checksum` class attribute.
    """

    # override in sub-class
    checksum = None

    def encode(self, buf):
        """Return the bytes of `buf` preceded by their 4-byte checksum."""
        payload = ensure_contiguous_ndarray(buf).view('u1')
        # keep only the low 32 bits of whatever the checksum function returns
        digest = self.checksum(payload) & 0xFFFFFFFF
        framed = np.empty(payload.nbytes + 4, dtype='u1')
        header, body = framed[:4], framed[4:]
        header.view('<u4')[0] = digest
        ndarray_copy(payload, body)
        return framed

    def decode(self, buf, out=None):
        """Verify the leading checksum of `buf` and return the data after it.

        Raises
        ------
        RuntimeError
            If the stored checksum differs from the one computed over the
            data bytes.
        """
        framed = ensure_contiguous_ndarray(buf).view('u1')
        header, body = framed[:4], framed[4:]
        stored = header.view('<u4')[0]
        computed = self.checksum(body) & 0xFFFFFFFF
        if stored != computed:
            raise RuntimeError('checksum failed')
        return ndarray_copy(body, out)
30
+
31
+
32
class CRC32(Checksum32):
    """Checksum codec that uses ``zlib.crc32`` as its checksum function."""

    codec_id = 'crc32'
    checksum = zlib.crc32
35
+
36
+
37
class Adler32(Checksum32):
    """Checksum codec that uses ``zlib.adler32`` as its checksum function."""

    codec_id = 'adler32'
    checksum = zlib.adler32
40
+
41
+
42
class JenkinsLookup3(Checksum32):
    """Bob Jenkins' lookup3 checksum with 32-bit output

    This is the HDF5 implementation.
    https://github.com/HDFGroup/hdf5/blob/577c192518598c7e2945683655feffcdbdf5a91b/src/H5checksum.c#L378-L472

    With this codec, the checksum is concatenated on the end of the data
    bytes when encoded. At decode time, the checksum is performed on
    the data portion and compared with the four-byte checksum, raising
    RuntimeError if inconsistent.

    Attributes:
        initval: initial seed passed to the hash algorithm, default: 0
        prefix: bytes prepended to the buffer before evaluating the hash, default: None
    """

    checksum = jenkins_lookup3
    codec_id = "jenkins_lookup3"

    def __init__(self, initval: int = 0, prefix=None):
        self.initval = initval
        # a given prefix is kept as a uint8 array so it can be stacked in
        # front of the data before hashing
        self.prefix = None if prefix is None else np.frombuffer(prefix, dtype='uint8')

    def encode(self, buf):
        """Return the data bytes followed by their 4-byte lookup3 checksum"""
        data = ensure_contiguous_ndarray(buf).ravel().view('uint8')
        # the prefix takes part in the hash but is not written to the output
        hashed = data if self.prefix is None else np.hstack((self.prefix, data))
        digest = jenkins_lookup3(hashed, self.initval)
        return data.tobytes() + struct.pack("<I", digest)

    def decode(self, buf, out=None):
        """Verify the trailing lookup3 checksum and return the data without it

        When `out` is given the data bytes are written into it and `out` is
        returned; otherwise a memoryview of the data bytes is returned.
        """
        raw = ensure_contiguous_ndarray(buf).view('uint8')
        payload = raw[:-4]
        hashed = payload if self.prefix is None else np.hstack((self.prefix, payload))
        computed = jenkins_lookup3(hashed, self.initval)
        # the last four bytes hold the little-endian checksum
        stored = raw[-4:].view("<u4")[0]
        if computed != stored:
            raise RuntimeError(
                f"The Bob Jenkin's lookup3 checksum of the data ({computed}) did not"
                f" match the expected checksum ({stored}).\n"
                "This could be a sign that the data has been corrupted."
            )
        if out is None:
            return memoryview(payload)
        out.view("uint8")[:] = payload
        return out