zarrs 0.1.0__cp311-abi3-macosx_10_12_x86_64.whl → 0.2.2__cp311-abi3-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zarrs/__init__.py +0 -4
- zarrs/_internal.abi3.so +0 -0
- zarrs/_internal.pyi +27 -30
- zarrs/pipeline.py +175 -70
- zarrs/utils.py +51 -31
- {zarrs-0.1.0.dist-info → zarrs-0.2.2.dist-info}/METADATA +38 -53
- zarrs-0.2.2.dist-info/RECORD +11 -0
- {zarrs-0.1.0.dist-info → zarrs-0.2.2.dist-info}/WHEEL +1 -1
- zarrs-0.2.2.dist-info/entry_points.txt +2 -0
- zarrs-0.1.0.dist-info/RECORD +0 -10
- {zarrs-0.1.0.dist-info → zarrs-0.2.2.dist-info}/licenses/LICENSE +0 -0
zarrs/__init__.py CHANGED

```diff
@@ -1,5 +1,3 @@
-from zarr.registry import register_pipeline
-
 from ._internal import __version__
 from .pipeline import ZarrsCodecPipeline as _ZarrsCodecPipeline
 from .utils import CollapsedDimensionError, DiscontiguousArrayError
@@ -10,8 +8,6 @@ class ZarrsCodecPipeline(_ZarrsCodecPipeline):
     pass
 
 
-register_pipeline(ZarrsCodecPipeline)
-
 __all__ = [
     "ZarrsCodecPipeline",
     "DiscontiguousArrayError",
```
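With `register_pipeline(ZarrsCodecPipeline)` gone from `__init__.py`, importing `zarrs` no longer registers the pipeline as a side effect; as the README later in this diff documents, it is selected through `zarr`'s config. A minimal sketch (the store path, shape, and chunking are illustrative, not taken from the package):

```python
import zarr

# Opt in to the Rust-backed pipeline explicitly; 0.2.2 no longer registers it on import.
zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})

# Any array created afterwards routes its codec work through zarrs.
arr = zarr.open("example.zarr", mode="w", shape=(1024, 1024), chunks=(256, 256), dtype="float32")
arr[0:256, 0:256] = 1.0
```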
zarrs/_internal.abi3.so CHANGED

Binary file
zarrs/_internal.pyi CHANGED

```diff
@@ -1,47 +1,44 @@
 # This file is automatically generated by pyo3_stub_gen
 # ruff: noqa: E501, F401
 
+import builtins
 import typing
 
-import numpy
 import numpy.typing
+import zarr.abc.store
 
+@typing.final
+class ChunkItem:
+    def __new__(
+        cls,
+        key: builtins.str,
+        chunk_subset: typing.Sequence[slice],
+        chunk_shape: typing.Sequence[builtins.int],
+        subset: typing.Sequence[slice],
+        shape: typing.Sequence[builtins.int],
+    ) -> ChunkItem: ...
+
+@typing.final
 class CodecPipelineImpl:
     def __new__(
         cls,
-
+        array_metadata: builtins.str,
+        store_config: zarr.abc.store.Store,
         *,
-        validate_checksums
-
-
-
-
-    ): ...
+        validate_checksums: builtins.bool = False,
+        chunk_concurrent_minimum: builtins.int | None = None,
+        chunk_concurrent_maximum: builtins.int | None = None,
+        num_threads: builtins.int | None = None,
+        direct_io: builtins.bool = False,
+    ) -> CodecPipelineImpl: ...
     def retrieve_chunks_and_apply_index(
         self,
-        chunk_descriptions: typing.Sequence[
-
-            tuple[str, typing.Sequence[int], str, typing.Sequence[int]],
-            typing.Sequence[slice],
-            typing.Sequence[slice],
-            ]
-        ],
-        value: numpy.NDArray[typing.Any],
+        chunk_descriptions: typing.Sequence[ChunkItem],
+        value: numpy.typing.NDArray[typing.Any],
     ) -> None: ...
-    def retrieve_chunks(
-        self,
-        chunk_descriptions: typing.Sequence[
-            tuple[str, typing.Sequence[int], str, typing.Sequence[int]]
-        ],
-    ) -> list[numpy.typing.NDArray[numpy.uint8]]: ...
     def store_chunks_with_indices(
         self,
-        chunk_descriptions: typing.Sequence[
-
-
-            typing.Sequence[slice],
-            typing.Sequence[slice],
-            ]
-        ],
-        value: numpy.NDArray[typing.Any],
+        chunk_descriptions: typing.Sequence[ChunkItem],
+        value: numpy.typing.NDArray[typing.Any],
+        write_empty_chunks: builtins.bool,
    ) -> None: ...
```
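The regenerated stub replaces the old tuple-based chunk descriptions with a dedicated `ChunkItem`. In normal use these are built by `zarrs.utils.make_chunk_info_for_rust_with_indices` (see the `utils.py` diff below); a hand-rolled sketch, with made-up key, shapes, and selections, just to show the constructor's keywords:

```python
from zarrs._internal import ChunkItem

# Describe one 10x10 chunk stored under the key "c/0/0" that fills the
# top-left corner of a 100x100 output array. All values here are illustrative.
item = ChunkItem(
    key="c/0/0",
    chunk_subset=[slice(0, 10), slice(0, 10)],  # region of the chunk to read
    chunk_shape=[10, 10],                       # full shape of that chunk
    subset=[slice(0, 10), slice(0, 10)],        # where it lands in the output
    shape=[100, 100],                           # shape of the output array
)
```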
zarrs/pipeline.py CHANGED

```diff
@@ -3,68 +3,134 @@ from __future__ import annotations
 import asyncio
 import json
 from dataclasses import dataclass
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, TypedDict
+from warnings import warn
 
 import numpy as np
-from zarr.abc.codec import
-
-
-)
+from zarr.abc.codec import Codec, CodecPipeline
+from zarr.codecs._v2 import V2Codec
+from zarr.core import BatchedCodecPipeline
 from zarr.core.config import config
+from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator
     from typing import Self
 
-    from zarr.abc.store import ByteGetter, ByteSetter
+    from zarr.abc.store import ByteGetter, ByteSetter, Store
     from zarr.core.array_spec import ArraySpec
-    from zarr.core.buffer import Buffer, NDBuffer
+    from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
     from zarr.core.chunk_grids import ChunkGrid
-    from zarr.core.common import ChunkCoords
     from zarr.core.indexing import SelectorTuple
+    from zarr.dtype import ZDType
 
 from ._internal import CodecPipelineImpl
 from .utils import (
     CollapsedDimensionError,
     DiscontiguousArrayError,
-
+    FillValueNoneError,
     make_chunk_info_for_rust_with_indices,
 )
 
 
-
-
+class UnsupportedDataTypeError(Exception):
+    pass
+
+
+class UnsupportedMetadataError(Exception):
+    pass
+
+
+def get_codec_pipeline_impl(
+    metadata: ArrayMetadata, store: Store, *, strict: bool
+) -> CodecPipelineImpl | None:
+    try:
+        array_metadata_json = json.dumps(metadata.to_dict())
+        # Maintain old behavior: https://github.com/zarrs/zarrs-python/tree/b36ba797cafec77f5f41a25316be02c718a2b4f8?tab=readme-ov-file#configuration
+        validate_checksums = config.get("codec_pipeline.validate_checksums", True)
+        if validate_checksums is None:
+            validate_checksums = True
+        return CodecPipelineImpl(
+            array_metadata_json,
+            store_config=store,
+            validate_checksums=validate_checksums,
+            chunk_concurrent_minimum=config.get(
+                "codec_pipeline.chunk_concurrent_minimum", None
+            ),
+            chunk_concurrent_maximum=config.get(
+                "codec_pipeline.chunk_concurrent_maximum", None
+            ),
+            num_threads=config.get("threading.max_workers", None),
+            direct_io=config.get("codec_pipeline.direct_io", False),
+        )
+    except TypeError as e:
+        if strict:
+            raise UnsupportedMetadataError() from e
+
+        warn(
+            f"Array is unsupported by ZarrsCodecPipeline: {e}",
+            category=UserWarning,
+        )
+        return None
+
+
+def get_codec_pipeline_fallback(
+    metadata: ArrayMetadata, *, strict: bool
+) -> BatchedCodecPipeline | None:
+    if strict:
+        return None
+    else:
+        codecs = array_metadata_to_codecs(metadata)
+        return BatchedCodecPipeline.from_codecs(codecs)
+
+
+class ZarrsCodecPipelineState(TypedDict):
+    codec_metadata_json: str
     codecs: tuple[Codec, ...]
-
+
+
+def array_metadata_to_codecs(metadata: ArrayMetadata) -> list[Codec]:
+    if isinstance(metadata, ArrayV3Metadata):
+        return metadata.codecs
+    elif isinstance(metadata, ArrayV2Metadata):
+        v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
+        return [v2_codec]
+
+
+@dataclass
+class ZarrsCodecPipeline(CodecPipeline):
+    metadata: ArrayMetadata
+    store: Store
+    impl: CodecPipelineImpl | None
+    python_impl: BatchedCodecPipeline | None
+
+    def __getstate__(self) -> ZarrsCodecPipelineState:
+        return {"metadata": self.metadata, "store": self.store}
+
+    def __setstate__(self, state: ZarrsCodecPipelineState):
+        self.metadata = state["metadata"]
+        self.store = state["store"]
+        strict = config.get("codec_pipeline.strict", False)
+        self.impl = get_codec_pipeline_impl(self.metadata, self.store, strict=strict)
+        self.python_impl = get_codec_pipeline_fallback(self.metadata, strict=strict)
 
     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
-
+        return self
 
     @classmethod
     def from_codecs(cls, codecs: Iterable[Codec]) -> Self:
-
-
-
-
-
-
+        return BatchedCodecPipeline.from_codecs(codecs)
+
+    @classmethod
+    def from_array_metadata_and_store(
+        cls, array_metadata: ArrayMetadata, store: Store
+    ) -> Self:
+        strict = config.get("codec_pipeline.strict", False)
         return cls(
-
-
-
-
-                "codec_pipeline.validate_checksums", None
-            ),
-            # TODO: upstream zarr-python array.write_empty_chunks is not merged yet #2429
-            store_empty_chunks=config.get("array.write_empty_chunks", None),
-            chunk_concurrent_minimum=config.get(
-                "codec_pipeline.chunk_concurrent_minimum", None
-            ),
-            chunk_concurrent_maximum=config.get(
-                "codec_pipeline.chunk_concurrent_maximum", None
-            ),
-            num_threads=config.get("threading.max_workers", None),
-            ),
+            metadata=array_metadata,
+            store=store,
+            impl=get_codec_pipeline_impl(array_metadata, store, strict=strict),
+            python_impl=get_codec_pipeline_fallback(array_metadata, strict=strict),
         )
 
     @property
@@ -79,7 +145,7 @@ class ZarrsCodecPipeline(CodecPipeline):
         yield from self.codecs
 
     def validate(
-        self, *, shape:
+        self, *, shape: tuple[int, ...], dtype: ZDType, chunk_grid: ChunkGrid
     ) -> None:
         raise NotImplementedError("validate")
 
@@ -101,55 +167,94 @@ class ZarrsCodecPipeline(CodecPipeline):
     async def read(
         self,
         batch_info: Iterable[
-            tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]
+            tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
         ],
-        out: NDBuffer,
+        out: NDBuffer,  # type: ignore
         drop_axes: tuple[int, ...] = (),  # FIXME: unused
     ) -> None:
-
+        # FIXME: Error if array is not in host memory
         if not out.dtype.isnative:
             raise RuntimeError("Non-native byte order not supported")
         try:
-
-
-
-            chunks_desc =
-
-
+            if self.impl is None:
+                raise UnsupportedMetadataError()
+            self._raise_error_on_unsupported_batch_dtype(batch_info)
+            chunks_desc = make_chunk_info_for_rust_with_indices(
+                batch_info, drop_axes, out.shape
+            )
+        except (
+            UnsupportedMetadataError,
+            DiscontiguousArrayError,
+            CollapsedDimensionError,
+            UnsupportedDataTypeError,
+            FillValueNoneError,
+        ):
+            if self.python_impl is None:
+                raise
+            await self.python_impl.read(batch_info, out, drop_axes)
+            return None
+        else:
+            out: NDArrayLike = out.as_ndarray_like()
             await asyncio.to_thread(
                 self.impl.retrieve_chunks_and_apply_index,
-                chunks_desc,
+                chunks_desc.chunk_info_with_indices,
                 out,
             )
             return None
-        chunks = await asyncio.to_thread(self.impl.retrieve_chunks, chunks_desc)
-        for chunk, chunk_info in zip(chunks, batch_info):
-            out_selection = chunk_info[3]
-            selection = chunk_info[2]
-            spec = chunk_info[1]
-            chunk_reshaped = chunk.view(spec.dtype).reshape(spec.shape)
-            chunk_selected = chunk_reshaped[selection]
-            if drop_axes:
-                chunk_selected = np.squeeze(chunk_selected, axis=drop_axes)
-            out[out_selection] = chunk_selected
 
     async def write(
         self,
         batch_info: Iterable[
-            tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]
+            tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
         ],
-        value: NDBuffer,
+        value: NDBuffer,  # type: ignore
         drop_axes: tuple[int, ...] = (),
     ) -> None:
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            if self.impl is None:
+                raise UnsupportedMetadataError()
+            self._raise_error_on_unsupported_batch_dtype(batch_info)
+            chunks_desc = make_chunk_info_for_rust_with_indices(
+                batch_info, drop_axes, value.shape
+            )
+        except (
+            UnsupportedMetadataError,
+            DiscontiguousArrayError,
+            CollapsedDimensionError,
+            UnsupportedDataTypeError,
+            FillValueNoneError,
+        ):
+            if self.python_impl is None:
+                raise
+            await self.python_impl.write(batch_info, value, drop_axes)
+            return None
+        else:
+            # FIXME: Error if array is not in host memory
+            value_np: NDArrayLike | np.ndarray = value.as_ndarray_like()
+            if not value_np.dtype.isnative:
+                value_np = np.ascontiguousarray(
+                    value_np, dtype=value_np.dtype.newbyteorder("=")
+                )
+            elif not value_np.flags.c_contiguous:
+                value_np = np.ascontiguousarray(value_np)
+            await asyncio.to_thread(
+                self.impl.store_chunks_with_indices,
+                chunks_desc.chunk_info_with_indices,
+                value_np,
+                chunks_desc.write_empty_chunks,
+            )
+            return None
+
+    def _raise_error_on_unsupported_batch_dtype(
+        self,
+        batch_info: Iterable[
+            tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
+        ],
+    ):
+        # https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L289-L293 for VSUMm
+        # Further, our pipeline does not support variable-length objects due to limitations on decode_into, so object/np.dtypes.StringDType is also out
+        if any(
+            info.dtype.to_native_dtype().kind in {"V", "S", "U", "M", "m", "O", "T"}
+            for (_, info, _, _, _) in batch_info
+        ):
+            raise UnsupportedDataTypeError()
```
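The new `_raise_error_on_unsupported_batch_dtype` guard is what routes structured, string, datetime, and object dtypes to the Python fallback. A standalone sketch of the same check using plain NumPy dtype kinds (the kind set is copied from the code above; the helper name here is not part of the package):

```python
import numpy as np

# Dtype kinds rejected by the Rust pipeline in this release (see the set above);
# batches containing these kinds are handled by zarr-python's BatchedCodecPipeline.
UNSUPPORTED_KINDS = {"V", "S", "U", "M", "m", "O", "T"}

def routed_to_rust(dtype: np.dtype) -> bool:
    return dtype.kind not in UNSUPPORTED_KINDS

print(routed_to_rust(np.dtype("float32")))         # True: stays on the zarrs path
print(routed_to_rust(np.dtype("datetime64[ns]")))  # False: falls back to zarr-python
```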
zarrs/utils.py CHANGED

```diff
@@ -2,19 +2,22 @@ from __future__ import annotations
 
 import operator
 import os
+from dataclasses import dataclass
 from functools import reduce
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
+from zarr.core.array_spec import ArraySpec
 from zarr.core.indexing import SelectorTuple, is_integer
 
+from zarrs._internal import ChunkItem
+
 if TYPE_CHECKING:
     from collections.abc import Iterable
     from types import EllipsisType
 
     from zarr.abc.store import ByteGetter, ByteSetter
-    from zarr.
-    from zarr.core.common import ChunkCoords
+    from zarr.dtype import ZDType
 
 
 # adapted from https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
@@ -30,6 +33,10 @@ class CollapsedDimensionError(Exception):
     pass
 
 
+class FillValueNoneError(Exception):
+    pass
+
+
 # This is a (mostly) copy of the function from zarr.core.indexing that fixes:
 # DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated
 # TODO: Upstream this fix
@@ -62,17 +69,6 @@ def selector_tuple_to_slice_selection(selector_tuple: SelectorTuple) -> list[sli
         return make_slice_selection(selector_tuple)
 
 
-def convert_chunk_to_primitive(
-    byte_getter: ByteGetter | ByteSetter, chunk_spec: ArraySpec
-) -> tuple[str, ChunkCoords, str, Any]:
-    return (
-        str(byte_getter),
-        chunk_spec.shape,
-        str(chunk_spec.dtype),
-        chunk_spec.fill_value.tobytes(),
-    )
-
-
 def resulting_shape_from_index(
     array_shape: tuple[int, ...],
     index_tuple: tuple[int | slice | EllipsisType | np.ndarray],
@@ -144,15 +140,44 @@ def get_shape_for_selector(
     return resulting_shape_from_index(shape, selector_tuple, drop_axes, pad=pad)
 
 
+def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:
+    if fill_value is None:
+        fill_value = dtype.default_scalar()
+    return fill_value
+
+
+@dataclass(frozen=True)
+class RustChunkInfo:
+    chunk_info_with_indices: list[ChunkItem]
+    write_empty_chunks: bool
+
+
 def make_chunk_info_for_rust_with_indices(
     batch_info: Iterable[
-        tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]
+        tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]
     ],
     drop_axes: tuple[int, ...],
-
-
-
-
+    shape: tuple[int, ...],
+) -> RustChunkInfo:
+    shape = shape if shape else (1,)  # constant array
+    chunk_info_with_indices: list[ChunkItem] = []
+    write_empty_chunks: bool = True
+    for (
+        byte_getter,
+        chunk_spec,
+        chunk_selection,
+        out_selection,
+        _,
+    ) in batch_info:
+        write_empty_chunks = chunk_spec.config.write_empty_chunks
+        if chunk_spec.fill_value is None:
+            chunk_spec = ArraySpec(
+                chunk_spec.shape,
+                chunk_spec.dtype,
+                get_implicit_fill_value(chunk_spec.dtype, chunk_spec.fill_value),
+                chunk_spec.config,
+                chunk_spec.prototype,
+            )
         out_selection_as_slices = selector_tuple_to_slice_selection(out_selection)
         chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection)
         shape_chunk_selection_slices = get_shape_for_selector(
@@ -169,17 +194,12 @@ def make_chunk_info_for_rust_with_indices(
             f"{shape_chunk_selection} != {shape_chunk_selection_slices}"
         )
         chunk_info_with_indices.append(
-            (
+            ChunkItem(
+                key=byte_getter.path,
+                chunk_subset=chunk_selection_as_slices,
+                chunk_shape=chunk_spec.shape,
+                subset=out_selection_as_slices,
+                shape=shape,
+            )
         )
-    return chunk_info_with_indices
-
-
-def make_chunk_info_for_rust(
-    batch_info: Iterable[
-        tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]
-    ],
-) -> list[tuple[str, ChunkCoords, str, Any]]:
-    return list(
-        convert_chunk_to_primitive(byte_getter, chunk_spec)
-        for (byte_getter, chunk_spec, _, _) in batch_info
-    )
+    return RustChunkInfo(chunk_info_with_indices, write_empty_chunks)
```
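A small usage sketch of the new implicit-fill-value handling: a `None` fill value is replaced by the dtype's default scalar before the chunk description is handed to Rust. The stand-in ZDType below is hypothetical (only its `default_scalar()` method matters, which is the call shown in the diff); real callers pass a `zarr.dtype.ZDType`:

```python
from zarrs.utils import get_implicit_fill_value

class _StubZDType:
    """Hypothetical stand-in for a zarr ZDType; only default_scalar() is used here."""
    def default_scalar(self):
        return 0.0

print(get_implicit_fill_value(_StubZDType(), None))  # None -> dtype.default_scalar() == 0.0
print(get_implicit_fill_value(_StubZDType(), 1.5))   # an explicit fill value passes through unchanged
```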
{zarrs-0.1.0.dist-info → zarrs-0.2.2.dist-info}/METADATA CHANGED

````diff
@@ -1,66 +1,34 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: zarrs
-Version: 0.
+Version: 0.2.2
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Classifier: Typing :: Typed
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: fasteners
-Requires-Dist: numcodecs[msgpack] >=0.10.0
-Requires-Dist: fsspec >2024
-Requires-Dist: crc32c
-Requires-Dist: zstandard
-Requires-Dist: typing-extensions
-Requires-Dist: donfig
-Requires-Dist: pytest
-Requires-Dist: universal-pathlib >=0.2.0
-Requires-Dist: zarr >=3.0.0b2
-Requires-Dist: coverage ; extra == 'test'
-Requires-Dist: pytest ; extra == 'test'
-Requires-Dist: pytest-cov ; extra == 'test'
-Requires-Dist: msgpack ; extra == 'test'
-Requires-Dist: lmdb ; extra == 'test'
-Requires-Dist: s3fs ; extra == 'test'
-Requires-Dist: pytest-asyncio ; extra == 'test'
-Requires-Dist: moto[s3] ; extra == 'test'
-Requires-Dist: flask-cors ; extra == 'test'
-Requires-Dist: flask ; extra == 'test'
-Requires-Dist: requests ; extra == 'test'
-Requires-Dist: mypy ; extra == 'test'
-Requires-Dist: hypothesis ; extra == 'test'
-Requires-Dist: pytest-xdist ; extra == 'test'
-Requires-Dist: maturin ; extra == 'dev'
-Requires-Dist: pip ; extra == 'dev'
-Requires-Dist: pre-commit ; extra == 'dev'
-Requires-Dist: sphinx >=7.4.6 ; extra == 'doc'
-Requires-Dist: myst-parser ; extra == 'doc'
-Provides-Extra: test
-Provides-Extra: dev
-Provides-Extra: doc
+Requires-Dist: numpy>=1.24
+Requires-Dist: zarr>=3.1
 License-File: LICENSE
+Summary: A CodecPipeline for zarr-python backed by the zarrs Rust crate
 Author: Ilan Gold, Lachlan Deakin, Philipp Angerer
-License: MIT
+License-Expression: MIT
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
 
 # zarrs-python
 
-
-
-
-
-
-
+[](https://pypi.org/project/zarrs)
+[](https://pepy.tech/project/zarrs)
+[](https://pepy.tech/project/zarrs)
+[](https://github.com/zarrs/zarrs-python/stargazers)
+
+
 
-This project serves as a bridge between [`zarrs`](https://docs.rs/zarrs/latest/zarrs/) and [`zarr`](https://zarr.readthedocs.io/en/latest/index.html) via [`PyO3`](https://pyo3.rs/v0.22.3/). The main goal of the project is to speed up i/o.
+This project serves as a bridge between [`zarrs`](https://docs.rs/zarrs/latest/zarrs/) (Rust) and [`zarr`](https://zarr.readthedocs.io/en/latest/index.html) (`zarr-python`) via [`PyO3`](https://pyo3.rs/v0.22.3/). The main goal of the project is to speed up i/o (see [`zarr_benchmarks`](https://github.com/LDeakin/zarr_benchmarks)).
 
-To use the project, simply install our package (which depends on `zarr-python
+To use the project, simply install our package (which depends on `zarr-python>=3.0.0`), and run:
 
 ```python
 import zarr
-import zarrs
 zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
 ```
 
@@ -68,7 +36,15 @@ You can then use your `zarr` as normal (with some caveats)!
 
 ## API
 
-We export a `ZarrsCodecPipeline` class so that `zarr-python` can use the class but it is not meant to be instantiated and we do not guarantee the stability of its API beyond what is required so that `zarr-python` can use it. Therefore, it is not documented here.
+We export a `ZarrsCodecPipeline` class so that `zarr-python` can use the class but it is not meant to be instantiated and we do not guarantee the stability of its API beyond what is required so that `zarr-python` can use it. Therefore, it is not documented here.
+
+At the moment, we only support a subset of the `zarr-python` stores:
+
+- [`LocalStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.LocalStore) (local filesystem)
+- [`ObjectStore`](https://zarr.readthedocs.io/en/latest/user-guide/storage/#object-store) (cloud storage)
+- [`HTTPFileSystem`](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.implementations.http.HTTPFileSystem) via [`FsspecStore`](https://zarr.readthedocs.io/en/latest/api/zarr/storage/#zarr.storage.FsspecStore)
+
+A `NotImplementedError` will be raised if a store is not supported.
 
 ### Configuration
 
@@ -79,7 +55,6 @@ Standard `zarr.config` options control some functionality (see the defaults in t
   - Defaults to the number of threads in the global `rayon` thread pool if set to `None`, which is [typically the number of logical CPUs](https://docs.rs/rayon/latest/rayon/struct.ThreadPoolBuilder.html#method.num_threads).
 - `array.write_empty_chunks`: whether or not to store empty chunks.
   - Defaults to false if `None`. Note that checking for emptiness has some overhead, see [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#store-empty-chunks) for more info.
-  - This option name is proposed in [zarr-python #2429](https://github.com/zarr-developers/zarr-python/pull/2429)
 
 The `ZarrsCodecPipeline` specific options are:
 - `codec_pipeline.chunk_concurrent_maximum`: the maximum number of chunks stored/retrieved concurrently.
@@ -87,7 +62,11 @@ The `ZarrsCodecPipeline` specific options are:
 - `codec_pipeline.chunk_concurrent_minimum`: the minimum number of chunks retrieved/stored concurrently when balancing chunk/codec concurrency.
   - Defaults to 4 if `None`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#chunk-concurrent-minimum) for more info.
 - `codec_pipeline.validate_checksums`: enable checksum validation (e.g. with the CRC32C codec).
-  - Defaults to
+  - Defaults to `True`. See [here](https://docs.rs/zarrs/latest/zarrs/config/struct.Config.html#validate-checksums) for more info.
+- `codec_pipeline.direct_io`: enable `O_DIRECT` read/write, needs support from the operating system (currently only Linux) and file system.
+  - Defaults to `False`.
+- `codec_pipeline.strict`: raise exceptions for unsupported operations instead of falling back to the default codec pipeline of `zarr-python`.
+  - Defaults to `False`.
 
 For example:
 ```python
@@ -97,13 +76,16 @@ zarr.config.set({
     "codec_pipeline": {
         "path": "zarrs.ZarrsCodecPipeline",
         "validate_checksums": True,
-        "store_empty_chunks": False,
         "chunk_concurrent_maximum": None,
         "chunk_concurrent_minimum": 4,
+        "direct_io": False,
+        "strict": False
     }
 })
 ```
 
+If the `ZarrsCodecPipeline` is pickled, and then un-pickled, and during that time one of `chunk_concurrent_minimum`, `chunk_concurrent_maximum`, or `num_threads` has changed, the newly un-pickled version will pick up the new value. However, once a `ZarrsCodecPipeline` object has been instantiated, these values are then fixed. This may change in the future as guidance from the `zarr` community becomes clear.
+
 ## Concurrency
 
 Concurrency can be classified into two types:
@@ -123,7 +105,7 @@ Chunk concurrency is typically favored because:
 
 ## Supported Indexing Methods
 
-
+The following methods will trigger use with the old zarr-python pipeline:
 
 1. Any `oindex` or `vindex` integer `np.ndarray` indexing with dimensionality >=3 i.e.,
 
@@ -153,7 +135,10 @@ We **do not** officially support the following indexing methods. Some of these 
 arr[0:10, ..., 0:5]
 ```
 
-Otherwise, we believe that we support your indexing case: slices, ints, and all integer `np.ndarray` indices in 2D for reading, contiguous integer `np.ndarray` indices along one axis for writing etc. Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot of these issues!
 
-
+Furthermore, using anything except contiguous (i.e., slices or consecutive integer) `np.ndarray` for numeric data will fall back to the default `zarr-python` implementation.
+
+Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot the use of the rust pipeline for that use-case (very useful for mini-batch training perhaps!).
+
+Further, any codecs not supported by `zarrs` will also automatically fall back to the python implementation.
 
````
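To make the README's indexing caveats concrete, a hedged sketch (array path, shape, and values are illustrative; unsupported patterns are still served, just via the zarr-python pipeline as described above):

```python
import numpy as np
import zarr

zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
arr = zarr.open("example.zarr", mode="w", shape=(8, 8, 8), chunks=(4, 4, 4), dtype="float64")

arr[0:4, 0:4, 0:4] = 1.0                                 # plain slices: served by zarrs
sub = arr.oindex[np.array([0, 2]), np.array([1, 3]), :]  # 3D integer-array indexing: zarr-python fallback
```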
zarrs-0.2.2.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,11 @@
+zarrs-0.2.2.dist-info/METADATA,sha256=8K1AOS_SVQgRLzQ2rWtNbJxCAb12XmTLRrT6QklQgOI,8054
+zarrs-0.2.2.dist-info/WHEEL,sha256=N8W3-0eDM6igWj-H12r7VkxoMaJIqJLxUyWCFstEaGg,105
+zarrs-0.2.2.dist-info/entry_points.txt,sha256=EzI6yCIUPDHBHzjDdexuGGYbOLXf8x2ICokOJXnuX3k,68
+zarrs-0.2.2.dist-info/licenses/LICENSE,sha256=vwIsJjEfVFehyyqcb7B3dAXAniaFMmk8u7IoiJAfBJ4,1099
+zarrs/__init__.py,sha256=lRVtAPzCzJkGs4vQrW4UgANq-pC-khS0ZF7HTj4__Hg,489
+zarrs/_internal.abi3.so,sha256=tnP5IiuDmhfwB15cX4yTyu1mVZgShtJy1lnS87TzK1o,14928348
+zarrs/_internal.pyi,sha256=a_D4yx99r4xeQX1ntY_A_Q4wVmLeLwJZHWAQV_mVu9A,1308
+zarrs/pipeline.py,sha256=YfB13GWNfxELerXVtJ_ipFwSL7bN-YuPys6jCB9lnms,9008
+zarrs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+zarrs/utils.py,sha256=W2XCjJDVrdHYJgtVaRKN533Ljw1MF7o0YwXuz5ZAk2g,7020
+zarrs-0.2.2.dist-info/RECORD,,
```
zarrs-0.1.0.dist-info/RECORD DELETED

```diff
@@ -1,10 +0,0 @@
-zarrs-0.1.0.dist-info/METADATA,sha256=Qf4O1LEJrZrrnWtO4vqOn4gF3cf882aX0kIf3TMhUMY,7923
-zarrs-0.1.0.dist-info/WHEEL,sha256=LZygbeT1PTQw7a9tONPp78bbG4FZc86U59Z0RFJcoR8,105
-zarrs-0.1.0.dist-info/licenses/LICENSE,sha256=vwIsJjEfVFehyyqcb7B3dAXAniaFMmk8u7IoiJAfBJ4,1099
-zarrs/__init__.py,sha256=4oWtWDZO8r7z4Uh7Fy_brmkxXDpULQdgjlA0iFw98eA,573
-zarrs/_internal.pyi,sha256=revBHMbEur_WKTDRtyJqah0e-D6CPy58sIIZLpicRgA,1330
-zarrs/utils.py,sha256=qV-__rjVNs7bhvnyY4U2eOtEFESGm-XQUv6t9ECvjcc,6588
-zarrs/pipeline.py,sha256=pcZ56LVQ131e5iq_HpEZWOyo-HzDi6GUOSulDHPWyLQ,5576
-zarrs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-zarrs/_internal.abi3.so,sha256=uc2IGucmaDNNx0t5K5lETXYmckCIJedoPcPLJhHCfOs,3193056
-zarrs-0.1.0.dist-info/RECORD,,
```

{zarrs-0.1.0.dist-info → zarrs-0.2.2.dist-info}/licenses/LICENSE: file without changes