PyPI - numcodecs-combinators - Versions diffs - 0.2.7__tar.gz → 0.2.9__tar.gz - Mend

numcodecs-combinators 0.2.7.tar.gz → 0.2.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/.github/workflows/ci.yml RENAMED Viewed

@@ -6,7 +6,10 @@ on:
       - main
   pull_request:
     branches:
-      - '*'
+      - "*"
+env:
+  CLICOLOR: 1
 jobs:
   check:
@@ -21,6 +24,16 @@ jobs:
         with:
           args: check
+  spelling:
+    name: Spellcheck
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+    - name: Spellcheck repo
+      uses: crate-ci/typos@v1.32.0
   fmt:
     name: Formatting
     runs-on: ubuntu-latest
@@ -50,7 +63,7 @@ jobs:
         run: uv sync --all-extras --dev && uv pip install .
       - name: Run tests
-        run: uv run pytest
+        run: uv run pytest -v -W error
   mypy:
     name: Typecheck

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numcodecs-combinators
-Version: 0.2.7
+Version: 0.2.9
 Summary: Combinator codecs for the `numcodecs` buffer compression API
 License: Copyright (c) 2024, Juniper Tyree
@@ -402,6 +402,7 @@ The following combinators, implementing the `CodecCombinatorMixin` are provided:
 - `CodecStack`: a stack of codecs
 - `FramedCodecStack`: a stack of codecs that is framed with array data type and shape information
+- `PickBestCodec`: pick the best codec to encode the data
 [`numcodecs`]: https://numcodecs.readthedocs.io/en/stable/

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/README.md RENAMED Viewed

@@ -12,6 +12,7 @@ The following combinators, implementing the `CodecCombinatorMixin` are provided:
 - `CodecStack`: a stack of codecs
 - `FramedCodecStack`: a stack of codecs that is framed with array data type and shape information
+- `PickBestCodec`: pick the best codec to encode the data
 [`numcodecs`]: https://numcodecs.readthedocs.io/en/stable/

numcodecs_combinators-0.2.9/_typos.toml ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [default.extend-identifiers]
2	+ _MaybeChunkedNdArray = "_MaybeChunkedNdArray"

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/docs/index.md RENAMED Viewed

@@ -12,6 +12,7 @@ The following combinators, implementing the [`CodecCombinatorMixin`][numcodecs_c
 - [`CodecStack`][numcodecs_combinators.stack.CodecStack]: a stack of codecs
 - [`FramedCodecStack`][numcodecs_combinators.framed.FramedCodecStack]: a stack of codecs that is framed with array data type and shape information
+- [`PickBestCodec`][numcodecs_combinators.best.PickBestCodec]: pick the best codec to encode the data
 ## Funding

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "numcodecs-combinators"
-version = "0.2.7"
+version = "0.2.9"
 description = "Combinator codecs for the `numcodecs` buffer compression API"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -21,8 +21,9 @@ optional-dependencies.xarray = [ "xarray>=2024.06", "dask>=2024.6" ]
 dev = ["mypy~=1.14", "pytest~=8.3"]
 [project.entry-points."numcodecs.codecs"]
-"combinators.stack" = "numcodecs_combinators.stack:CodecStack"
+"combinators.best" = "numcodecs_combinators.best:PickBestCodec"
 "combinators.framed" = "numcodecs_combinators.framed:FramedCodecStack"
+"combinators.stack" = "numcodecs_combinators.stack:CodecStack"
 [tool.setuptools.packages.find]
 where = ["src"]

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/src/numcodecs_combinators/__init__.py RENAMED Viewed

@@ -8,6 +8,8 @@ provided:
 - [`CodecStack`][numcodecs_combinators.stack.CodecStack]: a stack of codecs
 - [`FramedCodecStack`][numcodecs_combinators.framed.FramedCodecStack]: a stack
   of codecs that is framed with array data type and shape information
+- [`PickBestCodec`][numcodecs_combinators.best.PickBestCodec]: pick the best
+  codec to encode the data
 """
 __all__ = ["map_codec"]
@@ -18,6 +20,7 @@ from typing import Callable
 from numcodecs.abc import Codec
 from . import abc as abc
+from . import best as best
 from . import framed as framed
 from . import stack as stack

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/src/numcodecs_combinators/abc.py RENAMED Viewed

@@ -15,6 +15,8 @@ class CodecCombinatorMixin(ABC):
     Mixin class for combinators over [`Codec`][numcodecs.abc.Codec]s.
     """
+    __slots__ = ()
     @abstractmethod
     def map(self, mapper: Callable[[Codec], Codec]) -> Codec:
         """

numcodecs_combinators-0.2.9/src/numcodecs_combinators/best.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+This module defines the [`PickBestCodec`][numcodecs_combinators.best.PickBestCodec] class, which picks the codec that encoded the data best.
+"""
+__all__ = ["PickBestCodec"]
+from io import BytesIO
+from typing import Callable, Optional
+import numcodecs
+import numcodecs.compat
+import numcodecs.registry
+import numpy as np
+import varint
+from numcodecs.abc import Codec
+from typing_extensions import Buffer, Self  # MSPV 3.12
+from .abc import CodecCombinatorMixin
+class PickBestCodec(Codec, CodecCombinatorMixin, tuple[Codec]):
+    """
+    A codec that tries encoding with all combined codecs and then picks the one with the fewest bytes.
+    The inner codecs must all encode to 1D byte arrays. To use a codec not
+    encoding to bytes with this combinator, you can wrap it using the
+    [`FramedCodecStack(codec)`][numcodecs_combinators.framed.FramedCodecStack]
+    combinator.
+    This combinator uses the ULEB128 variable length integer encoding to encode
+    the index of the codec that was chosen to encode and uses this index as a
+    header before the encoded bytes. The header index is only included if this
+    combinator wraps at least two codecs. If this combinator wraps zero codecs,
+    it passes the original data through unchanged.
+    """
+    __slots__ = ()
+    codec_id: str = "combinators.best"  # type: ignore
+    def __init__(self, *args: dict | Codec):
+        pass
+    def __new__(cls, *args: dict | Codec) -> Self:
+        return super(PickBestCodec, cls).__new__(
+            cls,
+            tuple(
+                codec
+                if isinstance(codec, Codec)
+                else numcodecs.registry.get_codec(codec)
+                for codec in args
+            ),
+        )
+    def encode(self, buf: Buffer) -> bytes:
+        """Encode the data in `buf`.
+        Parameters
+        ----------
+        buf : Buffer
+            Data to be encoded. May be any object supporting the new-style
+            buffer protocol.
+        Returns
+        -------
+        enc : bytes
+            Encoded data as a bytestring.
+        """
+        if len(self) == 0:
+            return buf
+        data = numcodecs.compat.ensure_ndarray(buf)
+        best_size = np.inf
+        best_index = None
+        best_encoded = None
+        for i, codec in enumerate(self):
+            encoded = numcodecs.compat.ensure_ndarray(codec.encode(np.copy(data)))
+            assert encoded.dtype == np.dtype("uint8"), (
+                f"codec best[{i}] must encode to bytes"
+            )
+            assert encoded.ndim <= 1, f"codec best[{i}] must encode to 1D bytes"
+            if encoded.nbytes < best_size:
+                best_size = encoded.nbytes
+                best_index = i
+                best_encoded = encoded
+        encoded_index = varint.encode(best_index)
+        encoded_bytes = numcodecs.compat.ensure_bytes(best_encoded)
+        if len(self) == 1:
+            return encoded_bytes
+        return encoded_index + encoded_bytes
+    def decode(self, buf: Buffer, out: Optional[Buffer] = None) -> Buffer:
+        """Decode the data in `buf`.
+        Parameters
+        ----------
+        buf : Buffer
+            Encoded data. Must be an object representing a bytestring, e.g.
+            [`bytes`][bytes] or a 1D array of [`np.uint8`][numpy.uint8]s etc.
+        out : Buffer, optional
+            Writeable buffer to store decoded data. N.B. if provided, this buffer must
+            be exactly the right size to store the decoded data.
+        Returns
+        -------
+        dec : Buffer
+            Decoded data. May be any object supporting the new-style
+            buffer protocol.
+        """
+        if len(self) == 0:
+            return numcodecs.compat.ndarray_copy(buf, out)
+        b = numcodecs.compat.ensure_bytes(buf)
+        b_io = BytesIO(b)
+        if len(self) == 1:
+            best_index = 0
+        else:
+            best_index = varint.decode_stream(b_io)
+        return self[best_index].decode(b_io.read(), out=out)
+    def get_config(self) -> dict:
+        """
+        Returns the configuration of the best codec combinator.
+        [`numcodecs.registry.get_codec(config)`][numcodecs.registry.get_codec]
+        can be used to reconstruct this combinator from the returned config.
+        Returns
+        -------
+        config : dict
+            Configuration of the best codec combinator.
+        """
+        return dict(
+            id=type(self).codec_id,
+            codecs=tuple(codec.get_config() for codec in self),
+        )
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Instantiate the best codec combinator from a configuration [`dict`][dict].
+        Parameters
+        ----------
+        config : dict
+            Configuration of the best codec combinator.
+        Returns
+        -------
+        best : PickBestCodec
+            Instantiated best codec combinator.
+        """
+        return cls(*config["codecs"])
+    def __repr__(self) -> str:
+        repr = ", ".join(f"{codec!r}" for codec in self)
+        return f"{type(self).__name__}({repr})"
+    def map(self, mapper: Callable[[Codec], Codec]) -> "PickBestCodec":
+        """
+        Apply the `mapper` to all codecs that are in this combinator.
+        In the returned combinator, each codec is replaced by its mapped codec.
+        The `mapper` should recursively apply itself to any inner codecs that
+        also implement the [`CodecCombinatorMixin`][numcodecs_combinators.abc.CodecCombinatorMixin]
+        mixin.
+        To automatically handle the recursive application as a caller, you can
+        use
+        ```python
+        numcodecs_combinators.map_codec(best, mapper)
+        ```
+        instead.
+        Parameters
+        ----------
+        mapper : Callable[[Codec], Codec]
+            The callable that should be applied to each codec to map over this
+            best codec combinator.
+        Returns
+        -------
+        mapped : PickBestCodec
+            The mapped best codec combinator.
+        """
+        return PickBestCodec(*map(mapper, self))
+    def __add__(self, other) -> "PickBestCodec":
+        return PickBestCodec(*tuple.__add__(self, other))
+    def __mul__(self, other) -> "PickBestCodec":
+        return PickBestCodec(*tuple.__mul__(self, other))
+    def __rmul__(self, other) -> "PickBestCodec":
+        return PickBestCodec(*tuple.__rmul__(self, other))
+numcodecs.registry.register_codec(PickBestCodec)

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/src/numcodecs_combinators/framed.py RENAMED Viewed

@@ -28,8 +28,8 @@ class FramedCodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
     intermediary, encoded) are stored as part of the encoding, which is output
     as a bytestring.
-    On deconding, this framing information is used to apply the codecs from
-    right to left to decode into known ouput data types and shapes.
+    On decoding, this framing information is used to apply the codecs from
+    right to left to decode into known output data types and shapes.
     Therefore, the [`FramedCodecStack`][numcodecs_combinators.framed.FramedCodecStack]
     can be used to combine codecs which require knowing the output data type

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/src/numcodecs_combinators/stack.py RENAMED Viewed

@@ -140,6 +140,8 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
             buffer protocol.
         """
+        chunked = getattr(buf, "chunked", False)
         encoded = np.asarray(
             numcodecs.compat.ensure_contiguous_ndarray_like(buf, flatten=False)
         )
@@ -149,16 +151,23 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
             silhouettes.append((encoded.shape, encoded.dtype))
             encoded = np.asarray(
                 numcodecs.compat.ensure_contiguous_ndarray_like(
-                    codec.encode((encoded)), flatten=False
+                    codec.encode(_MaybeChunkedNdArray(encoded) if chunked else encoded),
+                    flatten=False,
                 )
             )
-        decoded = encoded
+        decoded = encoded.view(np.ndarray)
         for codec in reversed(self):
             shape, dtype = silhouettes.pop()
             out = np.empty(shape=shape, dtype=dtype)
-            decoded = codec.decode(decoded, out).view(dtype).reshape(shape)
+            decoded = (
+                codec.decode(decoded, _MaybeChunkedNdArray(out) if chunked else out)
+                .view(dtype)
+                .reshape(shape)
+            )
+        decoded = decoded.view(np.ndarray)
         if isinstance(decoded, type(buf)):
             return decoded
@@ -167,7 +176,15 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
     def encode_decode_data_array(self, da: "xr.DataArray") -> "xr.DataArray":
         """
-        Encode, then decode the data array in `da`.
+        Encode, then decode the data array `da`. If `da` is chunked, each chunk
+        is encoded and decoded *independently*.
+        Since each chunk is encoded *independently*, this method may cause
+        chunk boundary artifacts. Do *not* use this method if the codec
+        requires access to the entire data at once or if it needs to access
+        a neighbourhood of points across the chunk boundary. In these cases,
+        it is preferable to use
+        `da.copy(data=stack.encode_decode(da.values))` instead.
         The encode-decode computation may be deferred until the
         [`compute`][xarray.DataArray.compute] method is called on the result.
@@ -188,6 +205,8 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
         import xarray as xr
+        chunked = da.chunks is not None
         def encode_decode_data_array_single_chunk(
             da: xr.DataArray,
         ) -> xr.DataArray:
@@ -198,9 +217,11 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
                 return da.copy(deep=False).chunk(single_chunk)
             # eagerly compute the input chunk and encode and decode it
-            decoded = self.encode_decode(da.values)  # type: ignore
+            decoded = self.encode_decode(_MaybeChunkedNdArray(da.values, chunked))  # type: ignore
-            return da.copy(deep=False, data=decoded).chunk(single_chunk)
+            return da.copy(deep=False, data=np.array(decoded).view(np.ndarray)).chunk(
+                single_chunk
+            )
         return xr.map_blocks(encode_decode_data_array_single_chunk, da)
@@ -286,3 +307,22 @@ class CodecStack(Codec, CodecCombinatorMixin, tuple[Codec]):
 numcodecs.registry.register_codec(CodecStack)
+class _MaybeChunkedNdArray(np.ndarray):
+    __slots__ = ("_chunked",)
+    _chunked: bool
+    def __new__(cls, array, chunked: bool = True):
+        obj = np.asarray(array).view(cls)
+        obj._chunked = chunked
+        return obj
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        self._chunked = getattr(obj, "chunked", True)
+    @property
+    def chunked(self) -> bool:
+        return self._chunked

numcodecs_combinators-0.2.9/tests/test_best.py ADDED Viewed

@@ -0,0 +1,85 @@
+import numcodecs
+import numcodecs.compat
+import numpy as np
+import numcodecs_combinators
+from numcodecs_combinators.best import PickBestCodec
+from numcodecs_combinators.framed import FramedCodecStack
+def assert_config_roundtrip(codec: numcodecs.abc.Codec):
+    config = codec.get_config()
+    codec2 = numcodecs.get_codec(config)
+    assert codec2 == codec
+def test_init_config():
+    best = PickBestCodec()
+    assert len(best) == 0
+    assert_config_roundtrip(best)
+    best = PickBestCodec(dict(id="zlib", level=9))
+    assert len(best) == 1
+    assert_config_roundtrip(best)
+    best = PickBestCodec(dict(id="zlib", level=9), numcodecs.CRC32())
+    assert len(best) == 2
+    assert_config_roundtrip(best)
+def test_encode_decode():
+    for best in [
+        PickBestCodec(),
+        PickBestCodec(dict(id="combinators.framed", codecs=[dict(id="zlib", level=9)])),
+        PickBestCodec(
+            FramedCodecStack(numcodecs.Zlib(level=9)),
+            FramedCodecStack(numcodecs.CRC32()),
+        ),
+        PickBestCodec(
+            FramedCodecStack(numcodecs.Zlib(level=9)),
+            FramedCodecStack(numcodecs.CRC32()),
+            FramedCodecStack(numcodecs.Zstd(level=20)),
+        ),
+    ]:
+        for data in [
+            np.zeros(shape=(0,)),
+            np.array(3),
+            np.array([97, 98, 99], dtype=np.uint8),
+            np.linspace(1, 100, 100).reshape(10, 10),
+            np.linspace(1, 100, 100).reshape(10, 10).byteswap(),
+        ]:
+            encoded = best.encode(data)
+            if len(best) > 0:
+                assert isinstance(encoded, bytes)
+            decoded = best.decode(encoded)
+            print(best)
+            assert np.all(decoded == data)
+def test_map():
+    best = PickBestCodec(numcodecs.Zlib(level=9), numcodecs.CRC32())
+    mapped = numcodecs_combinators.map_codec(best, lambda c: c)
+    assert mapped == best
+    mapped = numcodecs_combinators.map_codec(best, lambda c: PickBestCodec(c))
+    assert mapped == PickBestCodec(
+        PickBestCodec(
+            PickBestCodec(numcodecs.Zlib(level=9)),
+            PickBestCodec(numcodecs.CRC32()),
+        )
+    )
+    mapped = numcodecs_combinators.map_codec(mapped, lambda c: PickBestCodec(c))
+    assert mapped == PickBestCodec(
+        PickBestCodec(
+            PickBestCodec(
+                PickBestCodec(
+                    PickBestCodec(
+                        PickBestCodec(PickBestCodec(numcodecs.Zlib(level=9)))
+                    ),
+                    PickBestCodec(PickBestCodec(PickBestCodec(numcodecs.CRC32()))),
+                )
+            )
+        )
+    )

{numcodecs_combinators-0.2.7 → numcodecs_combinators-0.2.9}/tests/test_stack.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import numcodecs
 import numpy as np
 import xarray as xr
+from numcodecs.abc import Codec
 import numcodecs_combinators
 from numcodecs_combinators.stack import CodecStack
@@ -51,6 +52,39 @@ def test_encode_decode():
     assert encoded_decoded.equals(xr.DataArray([1.0, 2.0, 3.0]))
+def test_chunked_encode_decode():
+    class CheckChunkedCodec(Codec):
+        __slots__ = ("is_chunked",)
+        is_chunked: bool
+        def __init__(self, is_chunked: bool):
+            self.is_chunked = is_chunked
+        def encode(self, buf):
+            assert getattr(buf, "chunked", False) == self.is_chunked
+            return buf
+        def decode(self, buf, out=None):
+            assert getattr(buf, "chunked", False) is False
+            assert getattr(out, "chunked", False) == self.is_chunked
+            return numcodecs.compat.ndarray_copy(buf, out)
+    stack = CodecStack(CheckChunkedCodec(False))
+    encoded_decoded = stack.encode_decode(np.array([1.0, 2.0, 3.0]))
+    assert np.all(encoded_decoded == np.array([1.0, 2.0, 3.0]))
+    encoded_decoded = stack.encode_decode_data_array(xr.DataArray([1.0, 2.0, 3.0]))
+    assert encoded_decoded.equals(xr.DataArray([1.0, 2.0, 3.0]))
+    stack = CodecStack(CheckChunkedCodec(True))
+    encoded_decoded = stack.encode_decode_data_array(
+        xr.DataArray([1.0, 2.0, 3.0]).chunk(1)
+    )
+    assert encoded_decoded.equals(xr.DataArray([1.0, 2.0, 3.0]))
 def test_map():
     stack = CodecStack(numcodecs.Zlib(level=9), numcodecs.CRC32())