zarrs 0.2.2__cp311-abi3-manylinux_2_28_ppc64le.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zarrs/__init__.py +16 -0
- zarrs/_internal.abi3.so +0 -0
- zarrs/_internal.pyi +44 -0
- zarrs/pipeline.py +260 -0
- zarrs/py.typed +0 -0
- zarrs/utils.py +205 -0
- zarrs-0.2.2.dist-info/METADATA +144 -0
- zarrs-0.2.2.dist-info/RECORD +11 -0
- zarrs-0.2.2.dist-info/WHEEL +4 -0
- zarrs-0.2.2.dist-info/entry_points.txt +2 -0
- zarrs-0.2.2.dist-info/licenses/LICENSE +21 -0
zarrs/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from ._internal import __version__
|
|
2
|
+
from .pipeline import ZarrsCodecPipeline as _ZarrsCodecPipeline
|
|
3
|
+
from .utils import CollapsedDimensionError, DiscontiguousArrayError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Need to do this redirection so people can access the pipeline as `zarrs.ZarrsCodecPipeline` instead of `zarrs.pipeline.ZarrsCodecPipeline`
class ZarrsCodecPipeline(_ZarrsCodecPipeline):
    """Public alias for :class:`zarrs.pipeline.ZarrsCodecPipeline`.

    Exists only so the pipeline can be referenced as the dotted path
    ``zarrs.ZarrsCodecPipeline`` (e.g. in ``zarr.config``'s
    ``codec_pipeline.path`` setting) without importing the submodule.
    """

    pass


# Explicit public API of the package.
__all__ = [
    "ZarrsCodecPipeline",
    "DiscontiguousArrayError",
    "CollapsedDimensionError",
    "__version__",
]
|
zarrs/_internal.abi3.so
ADDED
|
Binary file
|
zarrs/_internal.pyi
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This file is automatically generated by pyo3_stub_gen
|
|
2
|
+
# ruff: noqa: E501, F401
|
|
3
|
+
|
|
4
|
+
import builtins
|
|
5
|
+
import typing
|
|
6
|
+
|
|
7
|
+
import numpy.typing
|
|
8
|
+
import zarr.abc.store
|
|
9
|
+
|
|
10
|
+
@typing.final
class ChunkItem:
    """Stub for the Rust-backed description of one chunk transfer.

    Presumably ``key`` is the store key for the chunk, ``chunk_subset``/
    ``chunk_shape`` describe the selected region within the chunk, and
    ``subset``/``shape`` describe the corresponding region of the output
    array — TODO confirm against the Rust implementation.
    """

    def __new__(
        cls,
        key: builtins.str,
        chunk_subset: typing.Sequence[slice],
        chunk_shape: typing.Sequence[builtins.int],
        subset: typing.Sequence[slice],
        shape: typing.Sequence[builtins.int],
    ) -> ChunkItem: ...
|
|
20
|
+
|
|
21
|
+
@typing.final
class CodecPipelineImpl:
    """Stub for the Rust-backed codec pipeline (implemented in `_internal.abi3.so`)."""

    def __new__(
        cls,
        array_metadata: builtins.str,
        store_config: zarr.abc.store.Store,
        *,
        validate_checksums: builtins.bool = False,
        chunk_concurrent_minimum: builtins.int | None = None,
        chunk_concurrent_maximum: builtins.int | None = None,
        num_threads: builtins.int | None = None,
        direct_io: builtins.bool = False,
    ) -> CodecPipelineImpl:
        """Build a pipeline from JSON array metadata and a zarr store.

        ``None`` concurrency/thread values presumably defer to the Rust
        side's defaults — TODO confirm in the Rust implementation.
        """
        ...

    def retrieve_chunks_and_apply_index(
        self,
        chunk_descriptions: typing.Sequence[ChunkItem],
        value: numpy.typing.NDArray[typing.Any],
    ) -> None:
        """Read the described chunks and write them into ``value`` in place."""
        ...

    def store_chunks_with_indices(
        self,
        chunk_descriptions: typing.Sequence[ChunkItem],
        value: numpy.typing.NDArray[typing.Any],
        write_empty_chunks: builtins.bool,
    ) -> None:
        """Encode regions of ``value`` described by ``chunk_descriptions`` and store them."""
        ...
|
zarrs/pipeline.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
7
|
+
from warnings import warn
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from zarr.abc.codec import Codec, CodecPipeline
|
|
11
|
+
from zarr.codecs._v2 import V2Codec
|
|
12
|
+
from zarr.core import BatchedCodecPipeline
|
|
13
|
+
from zarr.core.config import config
|
|
14
|
+
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from collections.abc import Iterable, Iterator
|
|
18
|
+
from typing import Self
|
|
19
|
+
|
|
20
|
+
from zarr.abc.store import ByteGetter, ByteSetter, Store
|
|
21
|
+
from zarr.core.array_spec import ArraySpec
|
|
22
|
+
from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
|
|
23
|
+
from zarr.core.chunk_grids import ChunkGrid
|
|
24
|
+
from zarr.core.indexing import SelectorTuple
|
|
25
|
+
from zarr.dtype import ZDType
|
|
26
|
+
|
|
27
|
+
from ._internal import CodecPipelineImpl
|
|
28
|
+
from .utils import (
|
|
29
|
+
CollapsedDimensionError,
|
|
30
|
+
DiscontiguousArrayError,
|
|
31
|
+
FillValueNoneError,
|
|
32
|
+
make_chunk_info_for_rust_with_indices,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class UnsupportedDataTypeError(Exception):
    """Raised when a batch's dtype cannot be handled by the Rust pipeline."""

    pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class UnsupportedMetadataError(Exception):
    """Raised when array metadata cannot be consumed by the Rust pipeline."""

    pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_codec_pipeline_impl(
    metadata: ArrayMetadata, store: Store, *, strict: bool
) -> CodecPipelineImpl | None:
    """Construct the Rust-backed pipeline for ``metadata`` over ``store``.

    Configuration is read from ``zarr.config`` at call time. A ``TypeError``
    from the Rust constructor signals unsupported metadata: in strict mode it
    is re-raised as :class:`UnsupportedMetadataError`; otherwise a warning is
    emitted and ``None`` is returned so callers can fall back to the pure
    Python pipeline.
    """
    try:
        array_metadata_json = json.dumps(metadata.to_dict())
        # Maintain old behavior: https://github.com/zarrs/zarrs-python/tree/b36ba797cafec77f5f41a25316be02c718a2b4f8?tab=readme-ov-file#configuration
        validate_checksums = config.get("codec_pipeline.validate_checksums", True)
        if validate_checksums is None:
            # An explicit None in the config still means "validate".
            validate_checksums = True
        return CodecPipelineImpl(
            array_metadata_json,
            store_config=store,
            validate_checksums=validate_checksums,
            chunk_concurrent_minimum=config.get(
                "codec_pipeline.chunk_concurrent_minimum", None
            ),
            chunk_concurrent_maximum=config.get(
                "codec_pipeline.chunk_concurrent_maximum", None
            ),
            num_threads=config.get("threading.max_workers", None),
            direct_io=config.get("codec_pipeline.direct_io", False),
        )
    except TypeError as e:
        # NOTE(review): presumably the Rust constructor raises TypeError for
        # unsupported metadata/stores — confirm against _internal's behavior.
        if strict:
            raise UnsupportedMetadataError() from e

        warn(
            f"Array is unsupported by ZarrsCodecPipeline: {e}",
            category=UserWarning,
        )
        return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_codec_pipeline_fallback(
    metadata: ArrayMetadata, *, strict: bool
) -> BatchedCodecPipeline | None:
    """Build the pure-Python fallback pipeline, or ``None`` in strict mode.

    In strict mode unsupported operations must raise rather than fall back,
    so no fallback pipeline is constructed at all.
    """
    if strict:
        return None
    return BatchedCodecPipeline.from_codecs(array_metadata_to_codecs(metadata))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ZarrsCodecPipelineState(TypedDict):
    """Pickle payload exchanged by ``ZarrsCodecPipeline.__getstate__``/``__setstate__``.

    The previous declaration (``codec_metadata_json``/``codecs``) did not match
    the keys actually produced by ``__getstate__`` (``metadata``/``store``);
    the fields below reflect the real payload.
    """

    metadata: ArrayMetadata
    store: Store
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def array_metadata_to_codecs(metadata: ArrayMetadata) -> list[Codec]:
    """Extract the codec chain from v2 or v3 array metadata.

    For v3 metadata the declared codecs are returned directly; v2 filters and
    compressor are wrapped in a single :class:`V2Codec`.

    Raises:
        TypeError: if ``metadata`` is neither :class:`ArrayV3Metadata` nor
            :class:`ArrayV2Metadata`.
    """
    if isinstance(metadata, ArrayV3Metadata):
        return metadata.codecs
    elif isinstance(metadata, ArrayV2Metadata):
        v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
        return [v2_codec]
    # Previously this fell through and implicitly returned None, which breaks
    # callers such as BatchedCodecPipeline.from_codecs; fail loudly instead.
    raise TypeError(f"Unsupported array metadata type: {type(metadata)!r}")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class ZarrsCodecPipeline(CodecPipeline):
    """zarr ``CodecPipeline`` backed by the Rust `zarrs` crate.

    ``read``/``write`` first try the Rust implementation (``impl``); on any
    known unsupported condition they fall back to the pure-Python
    ``python_impl`` (a ``BatchedCodecPipeline``), unless strict mode disabled
    the fallback, in which case the error propagates.
    """

    # Metadata of the array this pipeline was built for.
    metadata: ArrayMetadata
    # Store the chunks are read from / written to.
    store: Store
    # Rust pipeline, or None if the array is unsupported (non-strict mode).
    impl: CodecPipelineImpl | None
    # Pure-Python fallback, or None in strict mode.
    python_impl: BatchedCodecPipeline | None

    def __getstate__(self) -> ZarrsCodecPipelineState:
        # Only metadata and store are pickled; both impls are rebuilt on load.
        return {"metadata": self.metadata, "store": self.store}

    def __setstate__(self, state: ZarrsCodecPipelineState):
        # Re-reads zarr.config, so config changes between pickle and unpickle
        # take effect in the restored pipeline.
        self.metadata = state["metadata"]
        self.store = state["store"]
        strict = config.get("codec_pipeline.strict", False)
        self.impl = get_codec_pipeline_impl(self.metadata, self.store, strict=strict)
        self.python_impl = get_codec_pipeline_fallback(self.metadata, strict=strict)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """No-op: this pipeline does not specialize per array spec."""
        return self

    @classmethod
    def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
        # NOTE(review): returns a BatchedCodecPipeline, not an instance of
        # cls, despite the Self annotation — confirm callers only rely on the
        # CodecPipeline interface.
        return BatchedCodecPipeline.from_codecs(codecs)

    @classmethod
    def from_array_metadata_and_store(
        cls, array_metadata: ArrayMetadata, store: Store
    ) -> Self:
        """Primary constructor used by zarr-python."""
        strict = config.get("codec_pipeline.strict", False)
        return cls(
            metadata=array_metadata,
            store=store,
            impl=get_codec_pipeline_impl(array_metadata, store, strict=strict),
            python_impl=get_codec_pipeline_fallback(array_metadata, strict=strict),
        )

    @property
    def supports_partial_decode(self) -> bool:
        return False

    @property
    def supports_partial_encode(self) -> bool:
        return False

    def __iter__(self) -> Iterator[Codec]:
        # NOTE(review): `self.codecs` is not a declared field of this
        # dataclass — presumably provided by the CodecPipeline base class;
        # confirm, otherwise iteration raises AttributeError.
        yield from self.codecs

    def validate(
        self, *, shape: tuple[int, ...], dtype: ZDType, chunk_grid: ChunkGrid
    ) -> None:
        raise NotImplementedError("validate")

    def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
        raise NotImplementedError("compute_encoded_size")

    async def decode(
        self,
        chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
    ) -> Iterable[NDBuffer | None]:
        # Per-chunk decode is not exposed; only whole read()/write() are.
        raise NotImplementedError("decode")

    async def encode(
        self,
        chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
    ) -> Iterable[Buffer | None]:
        raise NotImplementedError("encode")

    async def read(
        self,
        batch_info: Iterable[
            tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
        ],
        out: NDBuffer,  # type: ignore
        drop_axes: tuple[int, ...] = (),  # FIXME: unused
    ) -> None:
        """Read the described chunk batch into ``out``.

        Falls back to the Python pipeline on any recognized unsupported
        condition (metadata, dtype, discontiguous/collapsed selection, or
        missing fill value) unless strict mode removed the fallback.
        """
        # FIXME: Error if array is not in host memory
        if not out.dtype.isnative:
            raise RuntimeError("Non-native byte order not supported")
        try:
            if self.impl is None:
                # Rust pipeline unavailable for this array's metadata.
                raise UnsupportedMetadataError()
            self._raise_error_on_unsupported_batch_dtype(batch_info)
            chunks_desc = make_chunk_info_for_rust_with_indices(
                batch_info, drop_axes, out.shape
            )
        except (
            UnsupportedMetadataError,
            DiscontiguousArrayError,
            CollapsedDimensionError,
            UnsupportedDataTypeError,
            FillValueNoneError,
        ):
            if self.python_impl is None:
                # Strict mode: no fallback, propagate the error.
                raise
            await self.python_impl.read(batch_info, out, drop_axes)
            return None
        else:
            out: NDArrayLike = out.as_ndarray_like()
            # Run the blocking Rust call off the event loop.
            await asyncio.to_thread(
                self.impl.retrieve_chunks_and_apply_index,
                chunks_desc.chunk_info_with_indices,
                out,
            )
            return None

    async def write(
        self,
        batch_info: Iterable[
            tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
        ],
        value: NDBuffer,  # type: ignore
        drop_axes: tuple[int, ...] = (),
    ) -> None:
        """Write regions of ``value`` to the described chunk batch.

        Same fallback behavior as :meth:`read`.
        """
        try:
            if self.impl is None:
                raise UnsupportedMetadataError()
            self._raise_error_on_unsupported_batch_dtype(batch_info)
            chunks_desc = make_chunk_info_for_rust_with_indices(
                batch_info, drop_axes, value.shape
            )
        except (
            UnsupportedMetadataError,
            DiscontiguousArrayError,
            CollapsedDimensionError,
            UnsupportedDataTypeError,
            FillValueNoneError,
        ):
            if self.python_impl is None:
                raise
            await self.python_impl.write(batch_info, value, drop_axes)
            return None
        else:
            # FIXME: Error if array is not in host memory
            value_np: NDArrayLike | np.ndarray = value.as_ndarray_like()
            if not value_np.dtype.isnative:
                # Rust side expects native byte order; convert (copies).
                value_np = np.ascontiguousarray(
                    value_np, dtype=value_np.dtype.newbyteorder("=")
                )
            elif not value_np.flags.c_contiguous:
                # Rust side expects C-contiguous memory.
                value_np = np.ascontiguousarray(value_np)
            await asyncio.to_thread(
                self.impl.store_chunks_with_indices,
                chunks_desc.chunk_info_with_indices,
                value_np,
                chunks_desc.write_empty_chunks,
            )
            return None

    def _raise_error_on_unsupported_batch_dtype(
        self,
        batch_info: Iterable[
            tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
        ],
    ):
        """Raise :class:`UnsupportedDataTypeError` for dtypes the Rust side can't take."""
        # https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L289-L293 for VSUMm
        # Further, our pipeline does not support variable-length objects due to limitations on decode_into, so object/np.dtypes.StringDType is also out
        if any(
            info.dtype.to_native_dtype().kind in {"V", "S", "U", "M", "m", "O", "T"}
            for (_, info, _, _, _) in batch_info
        ):
            raise UnsupportedDataTypeError()
|
zarrs/py.typed
ADDED
|
File without changes
|
zarrs/utils.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import operator
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from functools import reduce
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from zarr.core.array_spec import ArraySpec
|
|
11
|
+
from zarr.core.indexing import SelectorTuple, is_integer
|
|
12
|
+
|
|
13
|
+
from zarrs._internal import ChunkItem
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Iterable
|
|
17
|
+
from types import EllipsisType
|
|
18
|
+
|
|
19
|
+
from zarr.abc.store import ByteGetter, ByteSetter
|
|
20
|
+
from zarr.dtype import ZDType
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# adapted from https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
|
|
24
|
+
def get_max_threads() -> int:
    """Default worker count: logical CPUs plus four.

    Mirrors the ThreadPoolExecutor default from the concurrent.futures docs;
    a machine whose CPU count cannot be determined counts as one CPU.
    """
    detected = os.cpu_count() or 1
    return detected + 4
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DiscontiguousArrayError(Exception):
    """Raised when a selection is not contiguous (so it can't become one slice)."""

    pass
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CollapsedDimensionError(Exception):
    """Raised when converting a selection to slices would change its element count."""

    pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FillValueNoneError(Exception):
    """Raised when a chunk spec lacks a fill value that the Rust side requires."""

    pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# This is a (mostly) copy of the function from zarr.core.indexing that fixes:
|
|
41
|
+
# DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated
|
|
42
|
+
# TODO: Upstream this fix
|
|
43
|
+
# This is a (mostly) copy of the function from zarr.core.indexing that fixes:
# DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated
# TODO: Upstream this fix
def make_slice_selection(selection: tuple[np.ndarray | float]) -> list[slice]:
    """Convert a per-dimension selection into a list of slices.

    Integer and single-element array selections become length-1 slices;
    contiguous integer arrays become a single covering slice. Anything else
    in array form is rejected.

    Raises:
        DiscontiguousArrayError: if an integer-array selection is not
            contiguous (consecutive or repeated indices).
    """
    ls: list[slice] = []
    for dim_selection in selection:
        if is_integer(dim_selection):
            ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1))
        elif isinstance(dim_selection, np.ndarray):
            dim_selection = dim_selection.ravel()
            if len(dim_selection) == 1:
                ls.append(
                    slice(int(dim_selection.item()), int(dim_selection.item()) + 1, 1)
                )
            else:
                diff = np.diff(dim_selection)
                if (diff != 1).any() and (diff != 0).any():
                    raise DiscontiguousArrayError(diff)
                # Cast to built-in int so the slice never carries numpy
                # scalars, consistent with the branches above (this is the
                # whole point of this copy of the upstream function).
                ls.append(slice(int(dim_selection[0]), int(dim_selection[-1]) + 1, 1))
        else:
            # Pass through anything already slice-like.
            ls.append(dim_selection)
    return ls
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def selector_tuple_to_slice_selection(selector_tuple: SelectorTuple) -> list[slice]:
    """Normalize a selector into a list of slices.

    A bare slice is wrapped in a list; a tuple of slices is converted
    directly; anything containing non-slices goes through
    :func:`make_slice_selection`.
    """
    if isinstance(selector_tuple, slice):
        return [selector_tuple]
    needs_conversion = any(not isinstance(dim, slice) for dim in selector_tuple)
    if needs_conversion:
        return make_slice_selection(selector_tuple)
    return list(selector_tuple)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def resulting_shape_from_index(
    array_shape: tuple[int, ...],
    index_tuple: tuple[int | slice | EllipsisType | np.ndarray],
    drop_axes: tuple[int, ...],
    *,
    pad: bool,
) -> tuple[int, ...]:
    """Compute the shape produced by applying ``index_tuple`` to ``array_shape``.

    Integer indices remove a dimension, slices keep one (sized by the slice),
    Ellipsis expands to the unindexed dimensions, and numpy arrays contribute
    their broadcast shape. With ``pad=True``, trailing unindexed dimensions
    are appended. Axes listed in ``drop_axes`` are removed from the result
    (positions are indices into the *result* shape).

    NOTE(review): the broadcast shape of advanced (array) indices is always
    placed first and assumed to consume the leading dimensions — this matches
    numpy only when the advanced indices are leading/contiguous; confirm
    callers never pass trailing array indices mixed with slices.

    Raises:
        DiscontiguousArrayError: for slices with step > 1.
        ValueError: for unrecognized index types.
    """
    result_shape = []
    advanced_index_shapes = [
        idx.shape for idx in index_tuple if isinstance(idx, np.ndarray)
    ]
    basic_shape_index = 0

    # Broadcast all advanced indices, if any
    if advanced_index_shapes:
        result_shape += np.broadcast_shapes(*advanced_index_shapes)
        # Consume dimensions from array_shape
        basic_shape_index += len(advanced_index_shapes)

    # Process each remaining index in index_tuple
    for idx in index_tuple:
        if isinstance(idx, int):
            # Integer index reduces dimension, so skip this dimension in array_shape
            basic_shape_index += 1
        elif isinstance(idx, slice):
            if idx.step is not None and idx.step > 1:
                raise DiscontiguousArrayError(
                    "Step size greater than 1 is not supported"
                )
            # Slice keeps dimension, adjust size accordingly
            start, stop, _ = idx.indices(array_shape[basic_shape_index])
            result_shape.append(stop - start)
            basic_shape_index += 1
        elif idx is Ellipsis:
            # Calculate number of dimensions that Ellipsis should fill
            num_to_fill = len(array_shape) - len(index_tuple) + 1
            result_shape += array_shape[
                basic_shape_index : basic_shape_index + num_to_fill
            ]
            basic_shape_index += num_to_fill
        elif not isinstance(idx, np.ndarray):
            raise ValueError(f"Invalid index type: {type(idx)}")

    # Step 4: Append remaining dimensions from array_shape if fewer indices were used
    if basic_shape_index < len(array_shape) and pad:
        result_shape += array_shape[basic_shape_index:]

    return tuple(size for idx, size in enumerate(result_shape) if idx not in drop_axes)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def prod_op(x: Iterable[int]) -> int:
    """Return the product of an iterable of ints; an empty iterable yields 1."""
    product = 1
    for factor in x:
        product *= factor
    return product
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_shape_for_selector(
    selector_tuple: SelectorTuple,
    shape: tuple[int, ...],
    *,
    pad: bool,
    drop_axes: tuple[int, ...] = (),
) -> tuple[int, ...]:
    """Shape of the region selected by ``selector_tuple`` from ``shape``.

    A bare slice or array selector is treated as a one-element index tuple
    before delegating to :func:`resulting_shape_from_index`.
    """
    if isinstance(selector_tuple, (slice, np.ndarray)):
        index_tuple = (selector_tuple,)
    else:
        index_tuple = selector_tuple
    return resulting_shape_from_index(shape, index_tuple, drop_axes, pad=pad)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:
    """Return ``fill_value``, substituting the dtype's default scalar for ``None``."""
    if fill_value is not None:
        return fill_value
    return dtype.default_scalar()
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass(frozen=True)
class RustChunkInfo:
    """Batch of chunk descriptions prepared for the Rust pipeline."""

    # One ChunkItem per chunk in the batch.
    chunk_info_with_indices: list[ChunkItem]
    # Whether empty chunks should be materialized in the store.
    write_empty_chunks: bool
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def make_chunk_info_for_rust_with_indices(
    batch_info: Iterable[
        tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
    ],
    drop_axes: tuple[int, ...],
    shape: tuple[int, ...],
) -> RustChunkInfo:
    """Translate a zarr chunk batch into :class:`ChunkItem`s for the Rust side.

    Each chunk's in-chunk and output selections are converted to slice
    selections; a :class:`CollapsedDimensionError` is raised when that
    conversion would change the number of selected elements.

    NOTE(review): ``write_empty_chunks`` keeps the value from the *last*
    chunk spec iterated — presumably uniform across a batch; confirm.
    """
    shape = shape if shape else (1,)  # constant array
    chunk_info_with_indices: list[ChunkItem] = []
    write_empty_chunks: bool = True
    for (
        byte_getter,
        chunk_spec,
        chunk_selection,
        out_selection,
        _,
    ) in batch_info:
        write_empty_chunks = chunk_spec.config.write_empty_chunks
        if chunk_spec.fill_value is None:
            # Rebuild the spec with the dtype's default scalar as fill value.
            chunk_spec = ArraySpec(
                chunk_spec.shape,
                chunk_spec.dtype,
                get_implicit_fill_value(chunk_spec.dtype, chunk_spec.fill_value),
                chunk_spec.config,
                chunk_spec.prototype,
            )
        out_selection_as_slices = selector_tuple_to_slice_selection(out_selection)
        chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection)
        # Compare the element count of the slice-converted selection against
        # the original selection; a mismatch means the conversion collapsed
        # or expanded a dimension and would corrupt the transfer.
        shape_chunk_selection_slices = get_shape_for_selector(
            tuple(chunk_selection_as_slices),
            chunk_spec.shape,
            pad=True,
            drop_axes=drop_axes,
        )
        shape_chunk_selection = get_shape_for_selector(
            chunk_selection, chunk_spec.shape, pad=True, drop_axes=drop_axes
        )
        if prod_op(shape_chunk_selection) != prod_op(shape_chunk_selection_slices):
            raise CollapsedDimensionError(
                f"{shape_chunk_selection} != {shape_chunk_selection_slices}"
            )
        chunk_info_with_indices.append(
            ChunkItem(
                key=byte_getter.path,
                chunk_subset=chunk_selection_as_slices,
                chunk_shape=chunk_spec.shape,
                subset=out_selection_as_slices,
                shape=shape,
            )
        )
    return RustChunkInfo(chunk_info_with_indices, write_empty_chunks)
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zarrs
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
7
|
+
Classifier: Typing :: Typed
|
|
8
|
+
Requires-Dist: numpy>=1.24
|
|
9
|
+
Requires-Dist: zarr>=3.1
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Summary: A CodecPipeline for zarr-python backed by the zarrs Rust crate
|
|
12
|
+
Author: Ilan Gold, Lachlan Deakin, Philipp Angerer
|
|
13
|
+
License-Expression: MIT
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
16
|
+
|
|
17
|
+
# zarrs-python
|
|
18
|
+
|
|
19
|
+
[](https://pypi.org/project/zarrs)
|
|
20
|
+
[](https://pepy.tech/project/zarrs)
|
|
21
|
+
[](https://pepy.tech/project/zarrs)
|
|
22
|
+
[](https://github.com/zarrs/zarrs-python/stargazers)
|
|
23
|
+

|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
This project serves as a bridge between [`zarrs`](https://docs.rs/zarrs/latest/zarrs/) (Rust) and [`zarr`](https://zarr.readthedocs.io/en/latest/index.html) (`zarr-python`) via [`PyO3`](https://pyo3.rs/v0.22.3/). The main goal of the project is to speed up i/o (see [`zarr_benchmarks`](https://github.com/LDeakin/zarr_benchmarks)).
|
|
27
|
+
|
|
28
|
+
To use the project, simply install our package (which depends on `zarr-python>=3.0.0`), and run:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import zarr
|
|
32
|
+
zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
You can then use your `zarr` as normal (with some caveats)!
|
|
36
|
+
|
|
37
|
+
## API
|
|
38
|
+
|
|
39
|
+
We export a `ZarrsCodecPipeline` class so that `zarr-python` can use the class but it is not meant to be instantiated and we do not guarantee the stability of its API beyond what is required so that `zarr-python` can use it. Therefore, it is not documented here.
|
|
40
|
+
|
|
41
|
+
At the moment, we only support a subset of the `zarr-python` stores:
|
|
42
|
+
|
|
43
|
+
- [`LocalStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.LocalStore) (local filesystem)
|
|
44
|
+
- [`ObjectStore`](https://zarr.readthedocs.io/en/latest/user-guide/storage/#object-store) (cloud storage)
|
|
45
|
+
- [`HTTPFileSystem`](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.implementations.http.HTTPFileSystem) via [`FsspecStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.FsspecStore)
|
|
46
|
+
|
|
47
|
+
A `NotImplementedError` will be raised if a store is not supported.
|
|
48
|
+
|
|
49
|
+
### Configuration
|
|
50
|
+
|
|
51
|
+
`ZarrsCodecPipeline` options are exposed through `zarr.config`.
|
|
52
|
+
|
|
53
|
+
Standard `zarr.config` options control some functionality (see the defaults in the [config.py](https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/core/config.py) of `zarr-python`):
|
|
54
|
+
- `threading.max_workers`: the maximum number of threads used internally by the `ZarrsCodecPipeline` on the Rust side.
|
|
55
|
+
- Defaults to the number of threads in the global `rayon` thread pool if set to `None`, which is [typically the number of logical CPUs](https://docs.rs/rayon/latest/rayon/struct.ThreadPoolBuilder.html#method.num_threads).
|
|
56
|
+
- `array.write_empty_chunks`: whether or not to store empty chunks.
|
|
57
|
+
- Defaults to false if `None`. Note that checking for emptiness has some overhead, see [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#store-empty-chunks) for more info.
|
|
58
|
+
|
|
59
|
+
The `ZarrsCodecPipeline` specific options are:
|
|
60
|
+
- `codec_pipeline.chunk_concurrent_maximum`: the maximum number of chunks stored/retrieved concurrently.
|
|
61
|
+
- Defaults to the number of logical CPUs if `None`. It is constrained by `threading.max_workers` as well.
|
|
62
|
+
- `codec_pipeline.chunk_concurrent_minimum`: the minimum number of chunks retrieved/stored concurrently when balancing chunk/codec concurrency.
|
|
63
|
+
- Defaults to 4 if `None`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#chunk-concurrent-minimum) for more info.
|
|
64
|
+
- `codec_pipeline.validate_checksums`: enable checksum validation (e.g. with the CRC32C codec).
|
|
65
|
+
- Defaults to `True`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#validate-checksums) for more info.
|
|
66
|
+
- `codec_pipeline.direct_io`: enable `O_DIRECT` read/write, needs support from the operating system (currently only Linux) and file system.
|
|
67
|
+
- Defaults to `False`.
|
|
68
|
+
- `codec_pipeline.strict`: raise exceptions for unsupported operations instead of falling back to the default codec pipeline of `zarr-python`.
|
|
69
|
+
- Defaults to `False`.
|
|
70
|
+
|
|
71
|
+
For example:
|
|
72
|
+
```python
|
|
73
|
+
zarr.config.set({
|
|
74
|
+
"threading.max_workers": None,
|
|
75
|
+
"array.write_empty_chunks": False,
|
|
76
|
+
"codec_pipeline": {
|
|
77
|
+
"path": "zarrs.ZarrsCodecPipeline",
|
|
78
|
+
"validate_checksums": True,
|
|
79
|
+
"chunk_concurrent_maximum": None,
|
|
80
|
+
"chunk_concurrent_minimum": 4,
|
|
81
|
+
"direct_io": False,
|
|
82
|
+
"strict": False
|
|
83
|
+
}
|
|
84
|
+
})
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
If the `ZarrsCodecPipeline` is pickled, and then un-pickled, and during that time one of `chunk_concurrent_minimum`, `chunk_concurrent_maximum`, or `num_threads` has changed, the newly un-pickled version will pick up the new value. However, once a `ZarrsCodecPipeline` object has been instantiated, these values are then fixed. This may change in the future as guidance from the `zarr` community becomes clear.
|
|
88
|
+
|
|
89
|
+
## Concurrency
|
|
90
|
+
|
|
91
|
+
Concurrency can be classified into two types:
|
|
92
|
+
- chunk (outer) concurrency: the number of chunks retrieved/stored concurrently.
|
|
93
|
+
- This is chosen automatically based on various factors, such as the chunk size and codecs.
|
|
94
|
+
- It is constrained between `codec_pipeline.chunk_concurrent_minimum` and `codec_pipeline.chunk_concurrent_maximum` for operations involving multiple chunks.
|
|
95
|
+
- codec (inner) concurrency: the number of threads encoding/decoding a chunk.
|
|
96
|
+
- This is chosen automatically in combination with the chunk concurrency.
|
|
97
|
+
|
|
98
|
+
The product of the chunk and codec concurrency will approximately match `threading.max_workers`.
|
|
99
|
+
|
|
100
|
+
Chunk concurrency is typically favored because:
|
|
101
|
+
- parallel encoding/decoding can have a high overhead with some codecs, especially with small chunks, and
|
|
102
|
+
- it is advantageous to retrieve/store multiple chunks concurrently, especially with high latency stores.
|
|
103
|
+
|
|
104
|
+
`zarrs-python` will often favor codec concurrency with sharded arrays, as they are well suited to codec concurrency.
|
|
105
|
+
|
|
106
|
+
## Supported Indexing Methods
|
|
107
|
+
|
|
108
|
+
The following methods will trigger use of the old zarr-python pipeline:
|
|
109
|
+
|
|
110
|
+
1. Any `oindex` or `vindex` integer `np.ndarray` indexing with dimensionality >=3 i.e.,
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
arr[np.array([...]), :, np.array([...])]
|
|
114
|
+
arr[np.array([...]), np.array([...]), np.array([...])]
|
|
115
|
+
arr[np.array([...]), np.array([...]), np.array([...])] = ...
|
|
116
|
+
arr.oindex[np.array([...]), np.array([...]), np.array([...])] = ...
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
2. Any `vindex` or `oindex` discontinuous integer `np.ndarray` indexing for writes in 2D
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
arr[np.array([0, 5]), :] = ...
|
|
123
|
+
arr.oindex[np.array([0, 5]), :] = ...
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
3. `vindex` writes in 2D where both indexers are integer `np.ndarray` indices i.e.,
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
arr[np.array([...]), np.array([...])] = ...
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
4. Ellipsis indexing. We have tested some, but others fail even with `zarr-python`'s default codec pipeline. Thus for now we advise proceeding with caution here.
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
arr[0:10, ..., 0:5]
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
Furthermore, using anything except contiguous (i.e., slices or consecutive integer) `np.ndarray` for numeric data will fall back to the default `zarr-python` implementation.
|
|
140
|
+
|
|
141
|
+
Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot of the use of the Rust pipeline for that use-case (very useful for mini-batch training perhaps!).
|
|
142
|
+
|
|
143
|
+
Further, any codecs not supported by `zarrs` will also automatically fall back to the python implementation.
|
|
144
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
zarrs-0.2.2.dist-info/METADATA,sha256=8K1AOS_SVQgRLzQ2rWtNbJxCAb12XmTLRrT6QklQgOI,8054
|
|
2
|
+
zarrs-0.2.2.dist-info/WHEEL,sha256=LyKrmraG_uSXY5vhpX0CJs5HieuGNOxWFsl0o305mFk,108
|
|
3
|
+
zarrs-0.2.2.dist-info/entry_points.txt,sha256=EzI6yCIUPDHBHzjDdexuGGYbOLXf8x2ICokOJXnuX3k,68
|
|
4
|
+
zarrs-0.2.2.dist-info/licenses/LICENSE,sha256=vwIsJjEfVFehyyqcb7B3dAXAniaFMmk8u7IoiJAfBJ4,1099
|
|
5
|
+
zarrs/__init__.py,sha256=lRVtAPzCzJkGs4vQrW4UgANq-pC-khS0ZF7HTj4__Hg,489
|
|
6
|
+
zarrs/_internal.abi3.so,sha256=GCAoGz9VUm-OVKCf_MRcPJwsZxtNI0sdsYYJUTr1vRk,17179376
|
|
7
|
+
zarrs/_internal.pyi,sha256=a_D4yx99r4xeQX1ntY_A_Q4wVmLeLwJZHWAQV_mVu9A,1308
|
|
8
|
+
zarrs/pipeline.py,sha256=YfB13GWNfxELerXVtJ_ipFwSL7bN-YuPys6jCB9lnms,9008
|
|
9
|
+
zarrs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
zarrs/utils.py,sha256=W2XCjJDVrdHYJgtVaRKN533Ljw1MF7o0YwXuz5ZAk2g,7020
|
|
11
|
+
zarrs-0.2.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Ilan Gold, Lachlan Deakin, Philipp Angerer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|