numcodecs 0.16.4__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. numcodecs/__init__.py +146 -0
  2. numcodecs/_shuffle.cpython-313-darwin.so +0 -0
  3. numcodecs/abc.py +126 -0
  4. numcodecs/astype.py +72 -0
  5. numcodecs/base64.py +26 -0
  6. numcodecs/bitround.py +80 -0
  7. numcodecs/blosc.cpython-313-darwin.so +0 -0
  8. numcodecs/bz2.py +45 -0
  9. numcodecs/categorize.py +98 -0
  10. numcodecs/checksum32.py +189 -0
  11. numcodecs/compat.py +206 -0
  12. numcodecs/compat_ext.cpython-313-darwin.so +0 -0
  13. numcodecs/delta.py +94 -0
  14. numcodecs/errors.py +26 -0
  15. numcodecs/fixedscaleoffset.py +130 -0
  16. numcodecs/fletcher32.cpython-313-darwin.so +0 -0
  17. numcodecs/gzip.py +50 -0
  18. numcodecs/jenkins.cpython-313-darwin.so +0 -0
  19. numcodecs/json.py +107 -0
  20. numcodecs/lz4.cpython-313-darwin.so +0 -0
  21. numcodecs/lzma.py +71 -0
  22. numcodecs/msgpacks.py +86 -0
  23. numcodecs/ndarray_like.py +65 -0
  24. numcodecs/packbits.py +82 -0
  25. numcodecs/pcodec.py +119 -0
  26. numcodecs/pickles.py +55 -0
  27. numcodecs/quantize.py +98 -0
  28. numcodecs/registry.py +74 -0
  29. numcodecs/shuffle.py +61 -0
  30. numcodecs/tests/__init__.py +3 -0
  31. numcodecs/tests/common.py +275 -0
  32. numcodecs/tests/package_with_entrypoint/__init__.py +11 -0
  33. numcodecs/tests/package_with_entrypoint-0.1.dist-info/entry_points.txt +2 -0
  34. numcodecs/tests/test_astype.py +74 -0
  35. numcodecs/tests/test_base64.py +81 -0
  36. numcodecs/tests/test_bitround.py +81 -0
  37. numcodecs/tests/test_blosc.py +290 -0
  38. numcodecs/tests/test_bz2.py +66 -0
  39. numcodecs/tests/test_categorize.py +87 -0
  40. numcodecs/tests/test_checksum32.py +199 -0
  41. numcodecs/tests/test_compat.py +111 -0
  42. numcodecs/tests/test_delta.py +61 -0
  43. numcodecs/tests/test_entrypoints.py +24 -0
  44. numcodecs/tests/test_entrypoints_backport.py +36 -0
  45. numcodecs/tests/test_fixedscaleoffset.py +77 -0
  46. numcodecs/tests/test_fletcher32.py +56 -0
  47. numcodecs/tests/test_gzip.py +110 -0
  48. numcodecs/tests/test_jenkins.py +150 -0
  49. numcodecs/tests/test_json.py +85 -0
  50. numcodecs/tests/test_lz4.py +83 -0
  51. numcodecs/tests/test_lzma.py +94 -0
  52. numcodecs/tests/test_msgpacks.py +126 -0
  53. numcodecs/tests/test_ndarray_like.py +48 -0
  54. numcodecs/tests/test_packbits.py +39 -0
  55. numcodecs/tests/test_pcodec.py +90 -0
  56. numcodecs/tests/test_pickles.py +61 -0
  57. numcodecs/tests/test_pyzstd.py +76 -0
  58. numcodecs/tests/test_quantize.py +76 -0
  59. numcodecs/tests/test_registry.py +43 -0
  60. numcodecs/tests/test_shuffle.py +166 -0
  61. numcodecs/tests/test_vlen_array.py +97 -0
  62. numcodecs/tests/test_vlen_bytes.py +93 -0
  63. numcodecs/tests/test_vlen_utf8.py +91 -0
  64. numcodecs/tests/test_zarr3.py +48 -0
  65. numcodecs/tests/test_zarr3_import.py +13 -0
  66. numcodecs/tests/test_zfpy.py +104 -0
  67. numcodecs/tests/test_zlib.py +94 -0
  68. numcodecs/tests/test_zstd.py +189 -0
  69. numcodecs/version.py +34 -0
  70. numcodecs/vlen.cpython-313-darwin.so +0 -0
  71. numcodecs/zarr3.py +67 -0
  72. numcodecs/zfpy.py +112 -0
  73. numcodecs/zlib.py +42 -0
  74. numcodecs/zstd.cpython-313-darwin.so +0 -0
  75. numcodecs-0.16.4.dist-info/METADATA +67 -0
  76. numcodecs-0.16.4.dist-info/RECORD +87 -0
  77. numcodecs-0.16.4.dist-info/WHEEL +6 -0
  78. numcodecs-0.16.4.dist-info/licenses/LICENSE.txt +21 -0
  79. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSE.txt +31 -0
  80. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/BITSHUFFLE.txt +21 -0
  81. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/FASTLZ.txt +20 -0
  82. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/LZ4.txt +25 -0
  83. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/SNAPPY.txt +28 -0
  84. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/STDINT.txt +29 -0
  85. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/ZLIB-NG.txt +17 -0
  86. numcodecs-0.16.4.dist-info/licenses/c-blosc/LICENSES/ZLIB.txt +22 -0
  87. numcodecs-0.16.4.dist-info/top_level.txt +1 -0
numcodecs/__init__.py ADDED
@@ -0,0 +1,146 @@
1
# ruff: noqa: E402
"""Numcodecs is a Python package providing buffer compression and
transformation codecs for use in data storage and communication
applications. These include:

* Compression codecs, e.g., Zlib, BZ2, LZMA, ZFPY and Blosc.
* Pre-compression filters, e.g., Delta, Quantize, FixedScaleOffset,
  PackBits, Categorize.
* Integrity checks, e.g., CRC32, Adler32.

All codecs implement the same API, allowing codecs to be organized into
pipelines in a variety of ways.

If you have a question, find a bug, would like to make a suggestion or
contribute code, please `raise an issue on GitHub
<https://github.com/zarr-developers/numcodecs/issues>`_.

"""

import atexit
import multiprocessing
from contextlib import suppress

from numcodecs.registry import get_codec as get_codec
from numcodecs.registry import register_codec
from numcodecs.version import version as __version__  # noqa: F401
from numcodecs.zlib import Zlib

# Each codec is registered immediately after its import, which is why the
# imports are interleaved with statements (hence the E402 suppression above).
register_codec(Zlib)

from numcodecs.gzip import GZip

register_codec(GZip)

from numcodecs.bz2 import BZ2

register_codec(BZ2)

from numcodecs.lzma import LZMA

register_codec(LZMA)

from numcodecs import blosc
from numcodecs.blosc import Blosc

register_codec(Blosc)
# initialize blosc
try:
    ncores = multiprocessing.cpu_count()
except (NotImplementedError, OSError):  # pragma: no cover
    # multiprocessing.cpu_count() raises NotImplementedError when the CPU
    # count cannot be determined; OSError is kept for backward compatibility.
    ncores = 1
blosc._init()
# use at most 8 threads, fewer when the machine has fewer cores
blosc.set_nthreads(min(8, ncores))
# run the blosc teardown hook when the interpreter exits
atexit.register(blosc._destroy)

from numcodecs import zstd as zstd
from numcodecs.zstd import Zstd

register_codec(Zstd)

from numcodecs import lz4 as lz4
from numcodecs.lz4 import LZ4

register_codec(LZ4)

from numcodecs.astype import AsType

register_codec(AsType)

from numcodecs.delta import Delta

register_codec(Delta)

from numcodecs.quantize import Quantize

register_codec(Quantize)

from numcodecs.fixedscaleoffset import FixedScaleOffset

register_codec(FixedScaleOffset)

from numcodecs.packbits import PackBits

register_codec(PackBits)

from numcodecs.categorize import Categorize

register_codec(Categorize)

from numcodecs.pickles import Pickle

register_codec(Pickle)

from numcodecs.base64 import Base64

register_codec(Base64)

from numcodecs.shuffle import Shuffle

register_codec(Shuffle)

from numcodecs.bitround import BitRound

register_codec(BitRound)

from numcodecs.checksum32 import CRC32, Adler32, JenkinsLookup3

register_codec(CRC32)
register_codec(Adler32)
register_codec(JenkinsLookup3)

from numcodecs.json import JSON

register_codec(JSON)

from numcodecs import vlen as vlen
from numcodecs.vlen import VLenArray, VLenBytes, VLenUTF8

register_codec(VLenUTF8)
register_codec(VLenBytes)
register_codec(VLenArray)

from numcodecs.fletcher32 import Fletcher32

register_codec(Fletcher32)

# Optional dependencies: each codec below is skipped silently when its
# import raises ImportError.
with suppress(ImportError):
    from numcodecs.zfpy import ZFPY

    register_codec(ZFPY)

with suppress(ImportError):
    from numcodecs.msgpacks import MsgPack

    register_codec(MsgPack)

with suppress(ImportError):
    from numcodecs.checksum32 import CRC32C

    register_codec(CRC32C)

with suppress(ImportError):
    from numcodecs.pcodec import PCodec

    register_codec(PCodec)
Binary file
numcodecs/abc.py ADDED
@@ -0,0 +1,126 @@
1
+ """This module defines the :class:`Codec` base class, a common interface for
2
+ all codec classes.
3
+
4
+ Codec classes must implement :func:`Codec.encode` and :func:`Codec.decode`
5
+ methods. Inputs to and outputs from these methods may be any Python object
6
+ exporting a contiguous buffer via the new-style Python protocol.
7
+
8
+ Codec classes must implement a :func:`Codec.get_config` method,
9
+ which must return a dictionary holding all configuration parameters
10
+ required to enable encoding and decoding of data. The expectation is that
11
+ these configuration parameters will be stored or communicated separately
12
+ from encoded data, and thus the codecs do not need to store all encoding
13
+ parameters within the encoded data. For broad compatibility,
14
+ the configuration object must contain only JSON-serializable values. The
15
+ configuration object must also contain an 'id' field storing the codec
16
+ identifier (see below).
17
+
18
+ Codec classes must implement a :func:`Codec.from_config` class method,
19
+ which will return an instance of the class initialized from a configuration
20
+ object.
21
+
22
+ Finally, codec classes must set a `codec_id` class-level attribute. This
23
+ must be a string. Two different codec classes may set the same value for the
24
+ `codec_id` attribute if and only if they are fully compatible, meaning that
25
+ (1) configuration parameters are the same, and (2) given the same
26
+ configuration, one class could correctly decode data encoded by the
27
+ other and vice versa.
28
+
29
+ """
30
+
31
+ from abc import ABC, abstractmethod
32
+
33
+
34
+ class Codec(ABC):
35
+ """Codec abstract base class."""
36
+
37
+ # override in sub-class
38
+ codec_id: str | None = None
39
+ """Codec identifier."""
40
+
41
+ @abstractmethod
42
+ def encode(self, buf): # pragma: no cover
43
+ """Encode data in `buf`.
44
+
45
+ Parameters
46
+ ----------
47
+ buf : buffer-like
48
+ Data to be encoded. May be any object supporting the new-style
49
+ buffer protocol.
50
+
51
+ Returns
52
+ -------
53
+ enc : buffer-like
54
+ Encoded data. May be any object supporting the new-style buffer
55
+ protocol.
56
+ """
57
+
58
+ @abstractmethod
59
+ def decode(self, buf, out=None): # pragma: no cover
60
+ """Decode data in `buf`.
61
+
62
+ Parameters
63
+ ----------
64
+ buf : buffer-like
65
+ Encoded data. May be any object supporting the new-style buffer
66
+ protocol.
67
+ out : buffer-like, optional
68
+ Writeable buffer to store decoded data. N.B. if provided, this buffer must
69
+ be exactly the right size to store the decoded data.
70
+
71
+ Returns
72
+ -------
73
+ dec : buffer-like
74
+ Decoded data. May be any object supporting the new-style
75
+ buffer protocol.
76
+ """
77
+
78
+ def get_config(self):
79
+ """Return a dictionary holding configuration parameters for this
80
+ codec. Must include an 'id' field with the codec identifier. All
81
+ values must be compatible with JSON encoding."""
82
+
83
+ # override in sub-class if need special encoding of config values
84
+
85
+ # setup config object
86
+ config = {'id': self.codec_id}
87
+
88
+ # by default, assume all non-private members are configuration
89
+ # parameters - override this in sub-class if not the case
90
+ for k in self.__dict__:
91
+ if not k.startswith('_'):
92
+ config[k] = getattr(self, k)
93
+
94
+ return config
95
+
96
+ @classmethod
97
+ def from_config(cls, config):
98
+ """Instantiate codec from a configuration object."""
99
+ # N.B., assume at this point the 'id' field has been removed from
100
+ # the config object
101
+
102
+ # override in sub-class if need special decoding of config values
103
+
104
+ # by default, assume constructor accepts configuration parameters as
105
+ # keyword arguments without any special decoding
106
+ return cls(**config)
107
+
108
+ def __eq__(self, other):
109
+ # override in sub-class if need special equality comparison
110
+ try:
111
+ return self.get_config() == other.get_config()
112
+ except AttributeError:
113
+ return False
114
+
115
+ def __repr__(self):
116
+ # override in sub-class if need special representation
117
+
118
+ # by default, assume all non-private members are configuration
119
+ # parameters and valid keyword arguments to constructor function
120
+
121
+ r = f'{type(self).__name__}('
122
+ params = [
123
+ f'{k}={getattr(self, k)!r}' for k in sorted(self.__dict__) if not k.startswith('_')
124
+ ]
125
+ r += ', '.join(params) + ')'
126
+ return r
numcodecs/astype.py ADDED
@@ -0,0 +1,72 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray, ndarray_copy
5
+
6
+
7
class AsType(Codec):
    """Filter that casts data between two dtypes.

    Parameters
    ----------
    encode_dtype : dtype
        Data type of the encoded data.
    decode_dtype : dtype
        Data type of the decoded data.

    Notes
    -----
    When `encode_dtype` has lower precision than `decode_dtype`, storing data
    through this filter can lose information. The cast is not checked in that
    direction, so values that do not fit will be corrupted.

    Examples
    --------
    >>> import numcodecs
    >>> import numpy as np
    >>> x = np.arange(100, 120, 2, dtype=np.int8)
    >>> x
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
    >>> f = numcodecs.AsType(encode_dtype=x.dtype, decode_dtype=np.int16)
    >>> y = f.decode(x)
    >>> y
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int16)
    >>> z = f.encode(y)
    >>> z
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)

    """

    codec_id = 'astype'

    def __init__(self, encode_dtype, decode_dtype):
        # store both as numpy dtype objects, whatever form they arrived in
        self.encode_dtype = np.dtype(encode_dtype)
        self.decode_dtype = np.dtype(decode_dtype)

    def encode(self, buf):
        # interpret the incoming buffer as decoded data ...
        decoded = ensure_ndarray(buf)
        decoded = decoded.view(self.decode_dtype)
        # ... and return a converted copy in the encoded dtype
        return decoded.astype(self.encode_dtype)

    def decode(self, buf, out=None):
        # interpret the incoming buffer as encoded data
        encoded = ensure_ndarray(buf)
        encoded = encoded.view(self.encode_dtype)
        # convert (copying) to the decoded dtype, then hand over to `out`
        return ndarray_copy(encoded.astype(self.decode_dtype), out)

    def get_config(self):
        # dtypes are stored via their `.str` form so the config stays JSON-friendly
        return dict(
            id=self.codec_id,
            encode_dtype=self.encode_dtype.str,
            decode_dtype=self.decode_dtype.str,
        )

    def __repr__(self):
        cls_name = type(self).__name__
        enc = self.encode_dtype.str
        dec = self.decode_dtype.str
        return f'{cls_name}(encode_dtype={enc!r}, decode_dtype={dec!r})'
numcodecs/base64.py ADDED
@@ -0,0 +1,26 @@
1
+ import base64 as _base64
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_contiguous_ndarray, ndarray_copy
5
+
6
+
7
class Base64(Codec):
    """Codec providing base64 encoding via the Python standard library."""

    codec_id = "base64"

    def encode(self, buf):
        # normalise the input, then base64-encode it
        return _base64.standard_b64encode(ensure_contiguous_ndarray(buf))

    def decode(self, buf, out=None):
        # normalise the input and, when supplied, the destination
        encoded = ensure_contiguous_ndarray(buf)
        target = None if out is None else ensure_contiguous_ndarray(out)
        # decode, then copy the result into the destination (if any)
        return ndarray_copy(_base64.standard_b64decode(encoded), target)
numcodecs/bitround.py ADDED
@@ -0,0 +1,80 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray_like, ndarray_copy
5
+
6
# The size in bits of the mantissa/significand for the various floating types
# You cannot keep more bits of data than you have available
# https://en.wikipedia.org/wiki/IEEE_754
# Keys are numpy dtype *names*, which do not depend on byte order.
max_bits = {
    "float16": 10,
    "float32": 23,
    "float64": 52,
}


class BitRound(Codec):
    """Floating-point bit rounding codec

    Drops a specified number of bits from the floating point mantissa,
    leaving an array more amenable to compression. The number of bits to keep should
    be determined by an information analysis of the data to be compressed.
    The approach is based on the paper by Klöwer et al. 2021
    (https://www.nature.com/articles/s43588-021-00156-2). See
    https://github.com/zarr-developers/numcodecs/issues/298 for discussion
    and the original implementation in Julia referred to at
    https://github.com/milankl/BitInformation.jl

    Parameters
    ----------

    keepbits: int
        The number of bits of the mantissa to keep. The range allowed
        depends on the dtype input data. If keepbits is
        equal to the maximum allowed for the data type, this is equivalent
        to no transform.
    """

    codec_id = 'bitround'

    def __init__(self, keepbits: int):
        if keepbits < 0:
            raise ValueError("keepbits must be zero or positive")
        self.keepbits = keepbits

    def encode(self, buf):
        """Create int array by rounding floating-point data

        The itemsize will be preserved, but the output should be much more
        compressible.

        Raises
        ------
        TypeError
            If the input is not a 16-, 32- or 64-bit float array.
        ValueError
            If ``keepbits`` exceeds the mantissa size of the input dtype.
        """
        a = ensure_ndarray_like(buf)
        if a.dtype.kind != "f" or a.dtype.itemsize > 8:
            raise TypeError("Only float arrays (16-64bit) can be bit-rounded")
        # Look up by dtype *name*: str(dtype) yields e.g. '>f4' for
        # non-native byte order, which is not a key of max_bits.
        bits = max_bits[a.dtype.name]
        if self.keepbits == bits:
            # nothing to drop: the input is returned untouched
            return a
        if self.keepbits > bits:
            raise ValueError("Keepbits too large for given dtype")
        # cast float to int type of same width (preserve endianness)
        a_int_dtype = np.dtype(a.dtype.str.replace("f", "i"))
        all_set = np.array(-1, dtype=a_int_dtype)
        # work on an integer view of a copy so the caller's data is not modified
        b = a.copy()
        b = b.view(a_int_dtype)
        maskbits = bits - self.keepbits
        # mask with the `maskbits` lowest bits cleared
        mask = (all_set >> maskbits) << maskbits
        half_quantum1 = (1 << (maskbits - 1)) - 1
        # add just under half a quantum, plus the lowest kept bit, before
        # the dropped bits are cleared
        b += ((b >> maskbits) & 1) + half_quantum1
        b &= mask
        return b

    def decode(self, buf, out=None):
        """Remake floats from ints

        As with ``encode``, preserves itemsize.
        """
        buf = ensure_ndarray_like(buf)
        # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided)
        dt = np.dtype(buf.dtype.str.replace("i", "f"))
        data = buf.view(dt)
        return ndarray_copy(data, out)
Binary file
numcodecs/bz2.py ADDED
@@ -0,0 +1,45 @@
1
+ import bz2 as _bz2
2
+
3
+ from numcodecs.abc import Codec
4
+ from numcodecs.compat import ensure_contiguous_ndarray, ndarray_copy
5
+
6
+
7
class BZ2(Codec):
    """Codec providing bzip2 compression through the Python standard library.

    Parameters
    ----------
    level : int
        Compression level.

    """

    codec_id = 'bz2'

    def __init__(self, level=1):
        self.level = level

    def encode(self, buf):
        # normalise the input and compress it in a single step
        return _bz2.compress(ensure_contiguous_ndarray(buf), self.level)

    # noinspection PyMethodMayBeStatic
    def decode(self, buf, out=None):
        # normalise the input and, when supplied, the destination
        source = ensure_contiguous_ndarray(buf)
        target = ensure_contiguous_ndarray(out) if out is not None else None

        # N.B., bz2 cannot handle ndarray directly because of truth testing
        # issues, so the data is passed as a memoryview
        decompressed = _bz2.decompress(memoryview(source))

        # the standard library bz2 module cannot decompress straight into a
        # buffer, so the result is copied into the destination if one is given
        return ndarray_copy(decompressed, target)
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+
3
+ from .abc import Codec
4
+ from .compat import ensure_ndarray, ensure_text, ndarray_copy
5
+
6
+
7
class Categorize(Codec):
    """Filter that encodes categorical string data as integer codes.

    Parameters
    ----------
    labels : sequence of strings
        Category labels.
    dtype : dtype
        Data type of the decoded data.
    astype : dtype, optional
        Data type of the encoded data.

    Examples
    --------
    >>> import numcodecs
    >>> import numpy as np
    >>> x = np.array(['male', 'female', 'female', 'male', 'unexpected'], dtype=object)
    >>> x
    array(['male', 'female', 'female', 'male', 'unexpected'],
          dtype=object)
    >>> codec = numcodecs.Categorize(labels=['female', 'male'], dtype=object)
    >>> y = codec.encode(x)
    >>> y
    array([2, 1, 1, 2, 0], dtype=uint8)
    >>> z = codec.decode(y)
    >>> z
    array(['male', 'female', 'female', 'male', ''],
          dtype=object)

    """

    codec_id = 'categorize'

    def __init__(self, labels, dtype, astype='u1'):
        self.dtype = np.dtype(dtype)
        if self.dtype.kind not in 'UO':
            raise TypeError("only unicode ('U') and object ('O') dtypes are supported")
        self.labels = [ensure_text(label) for label in labels]
        self.astype = np.dtype(astype)
        if self.astype == np.dtype(object):
            raise TypeError('encoding as object array not supported')

    def encode(self, buf):
        # normalise the input according to the decoded dtype
        if self.dtype == np.dtype(object):
            values = np.asarray(buf, dtype=object)
        else:
            values = ensure_ndarray(buf).view(self.dtype)

        # work on a flat array to keep the implementation simple
        values = values.reshape(-1, order='A')

        # code 0 stays in place for any value missing from `labels`;
        # label number k (counting from 1) is written as code k
        codes = np.zeros_like(values, dtype=self.astype)
        for code, label in enumerate(self.labels, start=1):
            codes[values == label] = code

        return codes

    def decode(self, buf, out=None):
        # normalise the input and flatten it
        codes = ensure_ndarray(buf).view(self.astype)
        codes = codes.reshape(-1, order='A')

        # every slot starts out as the empty string and is overwritten
        # wherever the code matches a known label
        values = np.full_like(codes, fill_value='', dtype=self.dtype)
        for code, label in enumerate(self.labels, start=1):
            values[codes == code] = label

        # copy into `out` when one was supplied
        return ndarray_copy(values, out)

    def get_config(self):
        # dtypes are stored via their `.str` form
        return dict(
            id=self.codec_id,
            labels=self.labels,
            dtype=self.dtype.str,
            astype=self.astype.str,
        )

    def __repr__(self):
        # show at most the first three labels so the repr stays short
        preview = repr(self.labels[:3])
        if len(self.labels) > 3:
            preview = preview[:-1] + ', ...]'
        cls_name = type(self).__name__
        return f'{cls_name}(dtype={self.dtype.str!r}, astype={self.astype.str!r}, labels={preview})'