zarrs-0.2.2-cp311-abi3-manylinux_2_28_ppc64le.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zarrs/__init__.py ADDED
@@ -0,0 +1,16 @@
+ from ._internal import __version__
+ from .pipeline import ZarrsCodecPipeline as _ZarrsCodecPipeline
+ from .utils import CollapsedDimensionError, DiscontiguousArrayError
+
+
+ # Need to do this redirection so people can access the pipeline as `zarrs.ZarrsCodecPipeline` instead of `zarrs.pipeline.ZarrsCodecPipeline`
+ class ZarrsCodecPipeline(_ZarrsCodecPipeline):
+     pass
+
+
+ __all__ = [
+     "ZarrsCodecPipeline",
+     "DiscontiguousArrayError",
+     "CollapsedDimensionError",
+     "__version__",
+ ]
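The redirection above makes the pipeline importable at the package root, which is what `zarr`'s `codec_pipeline.path` config expects. A minimal sketch of the wiring:

```python
import zarr

# "zarrs.ZarrsCodecPipeline" resolves via the redirection in __init__.py above.
zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
```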
zarrs/_internal.abi3.so ADDED
Binary file
zarrs/_internal.pyi ADDED
@@ -0,0 +1,44 @@
+ # This file is automatically generated by pyo3_stub_gen
+ # ruff: noqa: E501, F401
+
+ import builtins
+ import typing
+
+ import numpy.typing
+ import zarr.abc.store
+
+ @typing.final
+ class ChunkItem:
+     def __new__(
+         cls,
+         key: builtins.str,
+         chunk_subset: typing.Sequence[slice],
+         chunk_shape: typing.Sequence[builtins.int],
+         subset: typing.Sequence[slice],
+         shape: typing.Sequence[builtins.int],
+     ) -> ChunkItem: ...
+
+ @typing.final
+ class CodecPipelineImpl:
+     def __new__(
+         cls,
+         array_metadata: builtins.str,
+         store_config: zarr.abc.store.Store,
+         *,
+         validate_checksums: builtins.bool = False,
+         chunk_concurrent_minimum: builtins.int | None = None,
+         chunk_concurrent_maximum: builtins.int | None = None,
+         num_threads: builtins.int | None = None,
+         direct_io: builtins.bool = False,
+     ) -> CodecPipelineImpl: ...
+     def retrieve_chunks_and_apply_index(
+         self,
+         chunk_descriptions: typing.Sequence[ChunkItem],
+         value: numpy.typing.NDArray[typing.Any],
+     ) -> None: ...
+     def store_chunks_with_indices(
+         self,
+         chunk_descriptions: typing.Sequence[ChunkItem],
+         value: numpy.typing.NDArray[typing.Any],
+         write_empty_chunks: builtins.bool,
+     ) -> None: ...
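For orientation, a hedged sketch of what a single `ChunkItem` describes, per the stub above: the store key of one chunk, the selection within that chunk, and where that selection lands in the output array. The key, shapes, and selections below are made-up values for illustration:

```python
from zarrs._internal import ChunkItem

# Hypothetical: chunk "c/1/0" of an array with (10, 10) chunks. Rows 0-4 of the
# chunk map onto rows 10-14 (all 10 columns) of a (20, 10) output selection.
item = ChunkItem(
    key="c/1/0",
    chunk_subset=[slice(0, 5, 1), slice(0, 10, 1)],
    chunk_shape=[10, 10],
    subset=[slice(10, 15, 1), slice(0, 10, 1)],
    shape=[20, 10],
)
```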
zarrs/pipeline.py ADDED
@@ -0,0 +1,260 @@
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, TypedDict
+ from warnings import warn
+
+ import numpy as np
+ from zarr.abc.codec import Codec, CodecPipeline
+ from zarr.codecs._v2 import V2Codec
+ from zarr.core.codec_pipeline import BatchedCodecPipeline
+ from zarr.core.config import config
+ from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable, Iterator
+     from typing import Self
+
+     from zarr.abc.store import ByteGetter, ByteSetter, Store
+     from zarr.core.array_spec import ArraySpec
+     from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
+     from zarr.core.chunk_grids import ChunkGrid
+     from zarr.core.indexing import SelectorTuple
+     from zarr.dtype import ZDType
+
+ from ._internal import CodecPipelineImpl
+ from .utils import (
+     CollapsedDimensionError,
+     DiscontiguousArrayError,
+     FillValueNoneError,
+     make_chunk_info_for_rust_with_indices,
+ )
+
+
+ class UnsupportedDataTypeError(Exception):
+     pass
+
+
+ class UnsupportedMetadataError(Exception):
+     pass
+
+
+ def get_codec_pipeline_impl(
+     metadata: ArrayMetadata, store: Store, *, strict: bool
+ ) -> CodecPipelineImpl | None:
+     try:
+         array_metadata_json = json.dumps(metadata.to_dict())
+         # Maintain old behavior: https://github.com/zarrs/zarrs-python/tree/b36ba797cafec77f5f41a25316be02c718a2b4f8?tab=readme-ov-file#configuration
+         validate_checksums = config.get("codec_pipeline.validate_checksums", True)
+         if validate_checksums is None:
+             validate_checksums = True
+         return CodecPipelineImpl(
+             array_metadata_json,
+             store_config=store,
+             validate_checksums=validate_checksums,
+             chunk_concurrent_minimum=config.get(
+                 "codec_pipeline.chunk_concurrent_minimum", None
+             ),
+             chunk_concurrent_maximum=config.get(
+                 "codec_pipeline.chunk_concurrent_maximum", None
+             ),
+             num_threads=config.get("threading.max_workers", None),
+             direct_io=config.get("codec_pipeline.direct_io", False),
+         )
+     except TypeError as e:
+         if strict:
+             raise UnsupportedMetadataError() from e
+
+         warn(
+             f"Array is unsupported by ZarrsCodecPipeline: {e}",
+             category=UserWarning,
+         )
+         return None
+
+
+ def get_codec_pipeline_fallback(
+     metadata: ArrayMetadata, *, strict: bool
+ ) -> BatchedCodecPipeline | None:
+     if strict:
+         return None
+     else:
+         codecs = array_metadata_to_codecs(metadata)
+         return BatchedCodecPipeline.from_codecs(codecs)
+
+
+ class ZarrsCodecPipelineState(TypedDict):
+     metadata: ArrayMetadata
+     store: Store
+
+
+ def array_metadata_to_codecs(metadata: ArrayMetadata) -> list[Codec]:
+     if isinstance(metadata, ArrayV3Metadata):
+         return list(metadata.codecs)
+     elif isinstance(metadata, ArrayV2Metadata):
+         v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
+         return [v2_codec]
+
+
+ @dataclass
+ class ZarrsCodecPipeline(CodecPipeline):
+     metadata: ArrayMetadata
+     store: Store
+     impl: CodecPipelineImpl | None
+     python_impl: BatchedCodecPipeline | None
+
+     def __getstate__(self) -> ZarrsCodecPipelineState:
+         return {"metadata": self.metadata, "store": self.store}
+
+     def __setstate__(self, state: ZarrsCodecPipelineState):
+         self.metadata = state["metadata"]
+         self.store = state["store"]
+         strict = config.get("codec_pipeline.strict", False)
+         self.impl = get_codec_pipeline_impl(self.metadata, self.store, strict=strict)
+         self.python_impl = get_codec_pipeline_fallback(self.metadata, strict=strict)
+
+     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+         return self
+
+     @classmethod
+     def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
+         return BatchedCodecPipeline.from_codecs(codecs)
+
+     @classmethod
+     def from_array_metadata_and_store(
+         cls, array_metadata: ArrayMetadata, store: Store
+     ) -> Self:
+         strict = config.get("codec_pipeline.strict", False)
+         return cls(
+             metadata=array_metadata,
+             store=store,
+             impl=get_codec_pipeline_impl(array_metadata, store, strict=strict),
+             python_impl=get_codec_pipeline_fallback(array_metadata, strict=strict),
+         )
+
+     @property
+     def supports_partial_decode(self) -> bool:
+         return False
+
+     @property
+     def supports_partial_encode(self) -> bool:
+         return False
+
+     def __iter__(self) -> Iterator[Codec]:
+         yield from array_metadata_to_codecs(self.metadata)
+
+     def validate(
+         self, *, shape: tuple[int, ...], dtype: ZDType, chunk_grid: ChunkGrid
+     ) -> None:
+         raise NotImplementedError("validate")
+
+     def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
+         raise NotImplementedError("compute_encoded_size")
+
+     async def decode(
+         self,
+         chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+     ) -> Iterable[NDBuffer | None]:
+         raise NotImplementedError("decode")
+
+     async def encode(
+         self,
+         chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
+     ) -> Iterable[Buffer | None]:
+         raise NotImplementedError("encode")
+
+     async def read(
+         self,
+         batch_info: Iterable[
+             tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
+         ],
+         out: NDBuffer,  # type: ignore
+         drop_axes: tuple[int, ...] = (),  # FIXME: unused
+     ) -> None:
+         # FIXME: Error if array is not in host memory
+         if not out.dtype.isnative:
+             raise RuntimeError("Non-native byte order not supported")
+         try:
+             if self.impl is None:
+                 raise UnsupportedMetadataError()
+             self._raise_error_on_unsupported_batch_dtype(batch_info)
+             chunks_desc = make_chunk_info_for_rust_with_indices(
+                 batch_info, drop_axes, out.shape
+             )
+         except (
+             UnsupportedMetadataError,
+             DiscontiguousArrayError,
+             CollapsedDimensionError,
+             UnsupportedDataTypeError,
+             FillValueNoneError,
+         ):
+             if self.python_impl is None:
+                 raise
+             await self.python_impl.read(batch_info, out, drop_axes)
+             return None
+         else:
+             out: NDArrayLike = out.as_ndarray_like()
+             await asyncio.to_thread(
+                 self.impl.retrieve_chunks_and_apply_index,
+                 chunks_desc.chunk_info_with_indices,
+                 out,
+             )
+             return None
+
+     async def write(
+         self,
+         batch_info: Iterable[
+             tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
+         ],
+         value: NDBuffer,  # type: ignore
+         drop_axes: tuple[int, ...] = (),
+     ) -> None:
+         try:
+             if self.impl is None:
+                 raise UnsupportedMetadataError()
+             self._raise_error_on_unsupported_batch_dtype(batch_info)
+             chunks_desc = make_chunk_info_for_rust_with_indices(
+                 batch_info, drop_axes, value.shape
+             )
+         except (
+             UnsupportedMetadataError,
+             DiscontiguousArrayError,
+             CollapsedDimensionError,
+             UnsupportedDataTypeError,
+             FillValueNoneError,
+         ):
+             if self.python_impl is None:
+                 raise
+             await self.python_impl.write(batch_info, value, drop_axes)
+             return None
+         else:
+             # FIXME: Error if array is not in host memory
+             value_np: NDArrayLike | np.ndarray = value.as_ndarray_like()
+             if not value_np.dtype.isnative:
+                 value_np = np.ascontiguousarray(
+                     value_np, dtype=value_np.dtype.newbyteorder("=")
+                 )
+             elif not value_np.flags.c_contiguous:
+                 value_np = np.ascontiguousarray(value_np)
+             await asyncio.to_thread(
+                 self.impl.store_chunks_with_indices,
+                 chunks_desc.chunk_info_with_indices,
+                 value_np,
+                 chunks_desc.write_empty_chunks,
+             )
+             return None
+
+     def _raise_error_on_unsupported_batch_dtype(
+         self,
+         batch_info: Iterable[
+             tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
+         ],
+     ):
+         # The V, S, U, M, and m dtype kinds are unsupported, see https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L289-L293
+         # Further, our pipeline does not support variable-length objects due to limitations on decode_into, so object ("O") and np.dtypes.StringDType ("T") are also out
+         if any(
+             info.dtype.to_native_dtype().kind in {"V", "S", "U", "M", "m", "O", "T"}
+             for (_, info, _, _, _) in batch_info
+         ):
+             raise UnsupportedDataTypeError()
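The `read`/`write` paths above hand off to `BatchedCodecPipeline` whenever the Rust implementation rejects the metadata, dtype, or selection, unless `codec_pipeline.strict` is set. A minimal sketch of toggling that behavior:

```python
import zarr

zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})

# Default (strict=False): unsupported arrays or selections fall back to
# zarr-python's BatchedCodecPipeline (with a UserWarning at creation time).
# Strict: no fallback pipeline is constructed, so unsupported cases raise
# instead (e.g. UnsupportedMetadataError from the module above).
zarr.config.set({"codec_pipeline.strict": True})
```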
zarrs/py.typed ADDED
File without changes
zarrs/utils.py ADDED
@@ -0,0 +1,205 @@
+ from __future__ import annotations
+
+ import operator
+ import os
+ from dataclasses import dataclass
+ from functools import reduce
+ from typing import TYPE_CHECKING, Any
+
+ import numpy as np
+ from zarr.core.array_spec import ArraySpec
+ from zarr.core.indexing import SelectorTuple, is_integer
+
+ from zarrs._internal import ChunkItem
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable
+     from types import EllipsisType
+
+     from zarr.abc.store import ByteGetter, ByteSetter
+     from zarr.dtype import ZDType
+
+
+ # adapted from https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
+ def get_max_threads() -> int:
+     return (os.cpu_count() or 1) + 4
+
+
+ class DiscontiguousArrayError(Exception):
+     pass
+
+
+ class CollapsedDimensionError(Exception):
+     pass
+
+
+ class FillValueNoneError(Exception):
+     pass
+
+
+ # This is (mostly) a copy of the function from zarr.core.indexing that fixes:
+ #   DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated
+ # TODO: Upstream this fix
+ def make_slice_selection(selection: tuple[np.ndarray | float, ...]) -> list[slice]:
+     ls: list[slice] = []
+     for dim_selection in selection:
+         if is_integer(dim_selection):
+             ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1))
+         elif isinstance(dim_selection, np.ndarray):
+             dim_selection = dim_selection.ravel()
+             if len(dim_selection) == 1:
+                 ls.append(
+                     slice(int(dim_selection.item()), int(dim_selection.item()) + 1, 1)
+                 )
+             else:
+                 diff = np.diff(dim_selection)
+                 if (diff != 1).any() and (diff != 0).any():
+                     raise DiscontiguousArrayError(diff)
+                 ls.append(slice(dim_selection[0], dim_selection[-1] + 1, 1))
+         else:
+             ls.append(dim_selection)
+     return ls
+
+
+ def selector_tuple_to_slice_selection(selector_tuple: SelectorTuple) -> list[slice]:
+     if isinstance(selector_tuple, slice):
+         return [selector_tuple]
+     if all(isinstance(s, slice) for s in selector_tuple):
+         return list(selector_tuple)
+     return make_slice_selection(selector_tuple)
+
+
+ def resulting_shape_from_index(
+     array_shape: tuple[int, ...],
+     index_tuple: tuple[int | slice | EllipsisType | np.ndarray, ...],
+     drop_axes: tuple[int, ...],
+     *,
+     pad: bool,
+ ) -> tuple[int, ...]:
+     result_shape = []
+     advanced_index_shapes = [
+         idx.shape for idx in index_tuple if isinstance(idx, np.ndarray)
+     ]
+     basic_shape_index = 0
+
+     # Broadcast all advanced indices, if any
+     if advanced_index_shapes:
+         result_shape += np.broadcast_shapes(*advanced_index_shapes)
+         # Consume dimensions from array_shape
+         basic_shape_index += len(advanced_index_shapes)
+
+     # Process each remaining index in index_tuple
+     for idx in index_tuple:
+         if isinstance(idx, int):
+             # Integer index reduces dimension, so skip this dimension in array_shape
+             basic_shape_index += 1
+         elif isinstance(idx, slice):
+             if idx.step is not None and idx.step > 1:
+                 raise DiscontiguousArrayError(
+                     "Step size greater than 1 is not supported"
+                 )
+             # Slice keeps dimension, adjust size accordingly
+             start, stop, _ = idx.indices(array_shape[basic_shape_index])
+             result_shape.append(stop - start)
+             basic_shape_index += 1
+         elif idx is Ellipsis:
+             # Calculate number of dimensions that Ellipsis should fill
+             num_to_fill = len(array_shape) - len(index_tuple) + 1
+             result_shape += array_shape[
+                 basic_shape_index : basic_shape_index + num_to_fill
+             ]
+             basic_shape_index += num_to_fill
+         elif not isinstance(idx, np.ndarray):
+             raise ValueError(f"Invalid index type: {type(idx)}")
+
+     # Append remaining dimensions from array_shape if fewer indices were used
+     if basic_shape_index < len(array_shape) and pad:
+         result_shape += array_shape[basic_shape_index:]
+
+     return tuple(size for idx, size in enumerate(result_shape) if idx not in drop_axes)
+
+
+ def prod_op(x: Iterable[int]) -> int:
+     return reduce(operator.mul, x, 1)
+
+
+ def get_shape_for_selector(
+     selector_tuple: SelectorTuple,
+     shape: tuple[int, ...],
+     *,
+     pad: bool,
+     drop_axes: tuple[int, ...] = (),
+ ) -> tuple[int, ...]:
+     if isinstance(selector_tuple, slice | np.ndarray):
+         return resulting_shape_from_index(
+             shape,
+             (selector_tuple,),
+             drop_axes,
+             pad=pad,
+         )
+     return resulting_shape_from_index(shape, selector_tuple, drop_axes, pad=pad)
+
+
+ def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:
+     if fill_value is None:
+         fill_value = dtype.default_scalar()
+     return fill_value
+
+
+ @dataclass(frozen=True)
+ class RustChunkInfo:
+     chunk_info_with_indices: list[ChunkItem]
+     write_empty_chunks: bool
+
+
+ def make_chunk_info_for_rust_with_indices(
+     batch_info: Iterable[
+         tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
+     ],
+     drop_axes: tuple[int, ...],
+     shape: tuple[int, ...],
+ ) -> RustChunkInfo:
+     shape = shape if shape else (1,)  # constant array
+     chunk_info_with_indices: list[ChunkItem] = []
+     write_empty_chunks: bool = True
+     for (
+         byte_getter,
+         chunk_spec,
+         chunk_selection,
+         out_selection,
+         _,
+     ) in batch_info:
+         write_empty_chunks = chunk_spec.config.write_empty_chunks
+         if chunk_spec.fill_value is None:
+             chunk_spec = ArraySpec(
+                 chunk_spec.shape,
+                 chunk_spec.dtype,
+                 get_implicit_fill_value(chunk_spec.dtype, chunk_spec.fill_value),
+                 chunk_spec.config,
+                 chunk_spec.prototype,
+             )
+         out_selection_as_slices = selector_tuple_to_slice_selection(out_selection)
+         chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection)
+         shape_chunk_selection_slices = get_shape_for_selector(
+             tuple(chunk_selection_as_slices),
+             chunk_spec.shape,
+             pad=True,
+             drop_axes=drop_axes,
+         )
+         shape_chunk_selection = get_shape_for_selector(
+             chunk_selection, chunk_spec.shape, pad=True, drop_axes=drop_axes
+         )
+         if prod_op(shape_chunk_selection) != prod_op(shape_chunk_selection_slices):
+             raise CollapsedDimensionError(
+                 f"{shape_chunk_selection} != {shape_chunk_selection_slices}"
+             )
+         chunk_info_with_indices.append(
+             ChunkItem(
+                 key=byte_getter.path,
+                 chunk_subset=chunk_selection_as_slices,
+                 chunk_shape=chunk_spec.shape,
+                 subset=out_selection_as_slices,
+                 shape=shape,
+             )
+         )
+     return RustChunkInfo(chunk_info_with_indices, write_empty_chunks)
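A hedged sketch of how the selection helpers above behave: consecutive integer indices are rewritten as a single contiguous slice, while gaps raise and push the pipeline onto the fallback path:

```python
import numpy as np

from zarrs.utils import DiscontiguousArrayError, make_slice_selection

# Consecutive integers collapse to one contiguous slice.
assert make_slice_selection((np.array([3, 4, 5]),)) == [slice(3, 6, 1)]

# A gap cannot be expressed as a single slice, so the helper raises and the
# caller (the codec pipeline) falls back to zarr-python.
try:
    make_slice_selection((np.array([0, 5]),))
except DiscontiguousArrayError:
    print("discontiguous selection detected")
```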
zarrs-0.2.2.dist-info/METADATA ADDED
@@ -0,0 +1,144 @@
+ Metadata-Version: 2.4
+ Name: zarrs
+ Version: 0.2.2
+ Classifier: Programming Language :: Rust
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
+ Classifier: Typing :: Typed
+ Requires-Dist: numpy>=1.24
+ Requires-Dist: zarr>=3.1
+ License-File: LICENSE
+ Summary: A CodecPipeline for zarr-python backed by the zarrs Rust crate
+ Author: Ilan Gold, Lachlan Deakin, Philipp Angerer
+ License-Expression: MIT
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+
+ # zarrs-python
+
+ [![PyPI](https://img.shields.io/pypi/v/zarrs.svg)](https://pypi.org/project/zarrs)
+ [![Downloads](https://static.pepy.tech/badge/zarrs/month)](https://pepy.tech/project/zarrs)
+ [![Downloads](https://static.pepy.tech/badge/zarrs)](https://pepy.tech/project/zarrs)
+ [![Stars](https://img.shields.io/github/stars/zarrs/zarrs-python?style=flat&logo=github&color=yellow)](https://github.com/zarrs/zarrs-python/stargazers)
+ ![CI](https://github.com/zarrs/zarrs-python/actions/workflows/ci.yml/badge.svg)
+ ![CD](https://github.com/zarrs/zarrs-python/actions/workflows/cd.yml/badge.svg)
+
+ This project serves as a bridge between [`zarrs`](https://docs.rs/zarrs/latest/zarrs/) (Rust) and [`zarr`](https://zarr.readthedocs.io/en/latest/index.html) (`zarr-python`) via [`PyO3`](https://pyo3.rs/v0.22.3/). The main goal of the project is to speed up I/O (see [`zarr_benchmarks`](https://github.com/LDeakin/zarr_benchmarks)).
+
+ To use the project, simply install our package (which depends on `zarr-python>=3.1`), and run:
+
+ ```python
+ import zarr
+ zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
+ ```
+
+ You can then use `zarr` as normal (with some caveats)!
+
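+ For example, a minimal sketch of ordinary usage once the config is set (assuming write access to a local `data.zarr`):
+
+ ```python
+ import numpy as np
+ import zarr
+
+ zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
+
+ z = zarr.create_array(store="data.zarr", shape=(100, 100), chunks=(10, 10), dtype="f4")
+ z[...] = np.arange(10_000, dtype="f4").reshape(100, 100)  # encoded via the Rust pipeline
+ print(z[0:10, 0:10].sum())
+ ```
+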
+ ## API
+
+ We export a `ZarrsCodecPipeline` class so that `zarr-python` can use it. The class is not meant to be instantiated directly, and we only guarantee the stability of its API to the extent that `zarr-python` requires; it is therefore not documented here.
+
+ At the moment, we only support a subset of the `zarr-python` stores:
+
+ - [`LocalStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.LocalStore) (local filesystem)
+ - [`ObjectStore`](https://zarr.readthedocs.io/en/latest/user-guide/storage/#object-store) (cloud storage)
+ - [`HTTPFileSystem`](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.implementations.http.HTTPFileSystem) via [`FsspecStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.FsspecStore)
+
+ A `NotImplementedError` will be raised if a store is not supported.
+
+ ### Configuration
+
+ `ZarrsCodecPipeline` options are exposed through `zarr.config`.
+
+ Standard `zarr.config` options control some functionality (see the defaults in the [config.py](https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/core/config.py) of `zarr-python`):
+ - `threading.max_workers`: the maximum number of threads used internally by the `ZarrsCodecPipeline` on the Rust side.
+   - Defaults to the number of threads in the global `rayon` thread pool if set to `None`, which is [typically the number of logical CPUs](https://docs.rs/rayon/latest/rayon/struct.ThreadPoolBuilder.html#method.num_threads).
+ - `array.write_empty_chunks`: whether or not to store empty chunks.
+   - Defaults to false if `None`. Note that checking for emptiness has some overhead; see [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#store-empty-chunks) for more info.
+
+ The `ZarrsCodecPipeline`-specific options are:
+ - `codec_pipeline.chunk_concurrent_maximum`: the maximum number of chunks stored/retrieved concurrently.
+   - Defaults to the number of logical CPUs if `None`. It is also constrained by `threading.max_workers`.
+ - `codec_pipeline.chunk_concurrent_minimum`: the minimum number of chunks retrieved/stored concurrently when balancing chunk/codec concurrency.
+   - Defaults to 4 if `None`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#chunk-concurrent-minimum) for more info.
+ - `codec_pipeline.validate_checksums`: enable checksum validation (e.g. with the CRC32C codec).
+   - Defaults to `True`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#validate-checksums) for more info.
+ - `codec_pipeline.direct_io`: enable `O_DIRECT` reads/writes; requires support from the operating system (currently only Linux) and file system.
+   - Defaults to `False`.
+ - `codec_pipeline.strict`: raise exceptions for unsupported operations instead of falling back to the default codec pipeline of `zarr-python`.
+   - Defaults to `False`.
+
+ For example:
+ ```python
+ zarr.config.set({
+     "threading.max_workers": None,
+     "array.write_empty_chunks": False,
+     "codec_pipeline": {
+         "path": "zarrs.ZarrsCodecPipeline",
+         "validate_checksums": True,
+         "chunk_concurrent_maximum": None,
+         "chunk_concurrent_minimum": 4,
+         "direct_io": False,
+         "strict": False
+     }
+ })
+ ```
+
+ If a `ZarrsCodecPipeline` is pickled and later un-pickled, and one of `chunk_concurrent_minimum`, `chunk_concurrent_maximum`, or `num_threads` has changed in the meantime, the newly un-pickled version will pick up the new value. However, once a `ZarrsCodecPipeline` object has been instantiated, these values are fixed. This may change in the future as guidance from the `zarr` community becomes clearer.
+
+ ## Concurrency
+
+ Concurrency can be classified into two types:
+ - chunk (outer) concurrency: the number of chunks retrieved/stored concurrently.
+   - This is chosen automatically based on various factors, such as the chunk size and codecs.
+   - It is constrained between `codec_pipeline.chunk_concurrent_minimum` and `codec_pipeline.chunk_concurrent_maximum` for operations involving multiple chunks.
+ - codec (inner) concurrency: the number of threads encoding/decoding a chunk.
+   - This is chosen automatically in combination with the chunk concurrency.
+
+ The product of the chunk and codec concurrency will approximately match `threading.max_workers`.
+
+ Chunk concurrency is typically favored because:
+ - parallel encoding/decoding can have a high overhead with some codecs, especially with small chunks, and
+ - it is advantageous to retrieve/store multiple chunks concurrently, especially with high-latency stores.
+
+ `zarrs-python` will often favor codec concurrency for sharded arrays, as they are well suited to it.
+
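+ As a rough illustration (a sketch, not a tuning guide): arrays with many small chunks tend to benefit from more chunk concurrency, while large sharded chunks leave more of `threading.max_workers` to the codecs:
+
+ ```python
+ import zarr
+
+ # Many small chunks: raise the floor on outer (chunk) concurrency.
+ zarr.config.set({"codec_pipeline.chunk_concurrent_minimum": 16})
+
+ # Few large (e.g. sharded) chunks: cap outer concurrency so inner
+ # (codec) threads dominate.
+ zarr.config.set({"codec_pipeline.chunk_concurrent_maximum": 2})
+ ```
+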
+ ## Supported Indexing Methods
+
+ The following indexing methods trigger a fallback to the default `zarr-python` codec pipeline:
+
+ 1. Any `oindex` or `vindex` integer `np.ndarray` indexing with dimensionality >= 3, i.e.,
+
+    ```python
+    arr[np.array([...]), :, np.array([...])]
+    arr[np.array([...]), np.array([...]), np.array([...])]
+    arr[np.array([...]), np.array([...]), np.array([...])] = ...
+    arr.oindex[np.array([...]), np.array([...]), np.array([...])] = ...
+    ```
+
+ 2. Any `vindex` or `oindex` discontiguous integer `np.ndarray` indexing for writes in 2D, i.e.,
+
+    ```python
+    arr[np.array([0, 5]), :] = ...
+    arr.oindex[np.array([0, 5]), :] = ...
+    ```
+
+ 3. `vindex` writes in 2D where both indexers are integer `np.ndarray` indices, i.e.,
+
+    ```python
+    arr[np.array([...]), np.array([...])] = ...
+    ```
+
+ 4. Ellipsis indexing. We have tested some cases, but others fail even with `zarr-python`'s default codec pipeline, so for now we advise proceeding with caution here.
+
+    ```python
+    arr[0:10, ..., 0:5]
+    ```
+
+ Furthermore, using anything other than contiguous indexing (i.e., slices or consecutive-integer `np.ndarray`s) for numeric data will fall back to the default `zarr-python` implementation, as shown below.
+
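+ A short sketch of the contiguity distinction for reads (the same applies to writes):
+
+ ```python
+ arr[0:10, :]                  # contiguous slice: Rust pipeline
+ arr[np.array([0, 1, 2]), :]   # consecutive integers: Rust pipeline
+ arr[np.array([0, 5, 40]), :]  # discontiguous: falls back to zarr-python
+ ```
+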
+ Please file an issue if you believe our coverage has more holes than we are aware of, or if you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that, once resolved, would unblock much wider use of the Rust pipeline for that use case (very useful for mini-batch training, perhaps!).
+
+ Further, any codecs not supported by `zarrs` will also automatically fall back to the Python implementation.
+
zarrs-0.2.2.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ zarrs-0.2.2.dist-info/METADATA,sha256=8K1AOS_SVQgRLzQ2rWtNbJxCAb12XmTLRrT6QklQgOI,8054
+ zarrs-0.2.2.dist-info/WHEEL,sha256=LyKrmraG_uSXY5vhpX0CJs5HieuGNOxWFsl0o305mFk,108
+ zarrs-0.2.2.dist-info/entry_points.txt,sha256=EzI6yCIUPDHBHzjDdexuGGYbOLXf8x2ICokOJXnuX3k,68
+ zarrs-0.2.2.dist-info/licenses/LICENSE,sha256=vwIsJjEfVFehyyqcb7B3dAXAniaFMmk8u7IoiJAfBJ4,1099
+ zarrs/__init__.py,sha256=lRVtAPzCzJkGs4vQrW4UgANq-pC-khS0ZF7HTj4__Hg,489
+ zarrs/_internal.abi3.so,sha256=GCAoGz9VUm-OVKCf_MRcPJwsZxtNI0sdsYYJUTr1vRk,17179376
+ zarrs/_internal.pyi,sha256=a_D4yx99r4xeQX1ntY_A_Q4wVmLeLwJZHWAQV_mVu9A,1308
+ zarrs/pipeline.py,sha256=YfB13GWNfxELerXVtJ_ipFwSL7bN-YuPys6jCB9lnms,9008
+ zarrs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ zarrs/utils.py,sha256=W2XCjJDVrdHYJgtVaRKN533Ljw1MF7o0YwXuz5ZAk2g,7020
+ zarrs-0.2.2.dist-info/RECORD,,
zarrs-0.2.2.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.9.4)
+ Root-Is-Purelib: false
+ Tag: cp311-abi3-manylinux_2_28_ppc64le
zarrs-0.2.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [zarr.codec_pipeline]
+ zarrs.codec_pipeline=zarrs:ZarrsCodecPipeline
zarrs-0.2.2.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Ilan Gold, Lachlan Deakin, Philipp Angerer
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.