zarr-n5 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zarr_n5-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.3
2
+ Name: zarr-n5
3
+ Version: 0.1.0
4
+ Summary: Utilities for accessing N5 data through zarr v3.
5
+ Author: Chris Barnes
6
+ Author-email: Chris Barnes <chris.barnes@gerbi-gmb.de>
7
+ Requires-Dist: zarr>=3.1.5
8
+ Requires-Python: >=3.12, <4.0
9
+ Description-Content-Type: text/markdown
10
+
11
+ # zarr-python-n5
12
+
13
+ [N5](https://github.com/saalfeldlab/n5) utilities for [zarr-python](https://github.com/zarr-developers/zarr-python).
14
+
15
+ - Documentation: <https://zarr-python-n5.readthedocs.io>
16
+
17
+ ## Codecs
18
+
19
+ ### N5 Default Codec
20
+
21
+ [As described here](https://github.com/zarr-developers/zarr-extensions/tree/main/codecs/n5_default).
22
+
23
+ Only whole-chunk reading is supported.
24
+
25
+ #### N5 Compressor support
26
+
27
+ | N5 compressor | Supported | Zarr bytes-to-bytes codec | Notes |
28
+ | ------------- | --------- | ------------------------- | ----- |
29
+ | `raw` | yes | n/a | Equivalent to omitted bytes-to-bytes codec |
30
+ | `blosc` | yes | `blosc` | |
31
+ | `gzip` | yes | `gzip` | |
32
+ | `zstd` | yes | `zstd` | |
33
+ | `lz4` | no | | [Incompatible codecs](https://github.com/zarr-developers/numcodecs/issues/175) |
34
+ | `xz` | no | | No equivalent Zarr codec |
35
+ | `jpeg` | no | | Needs [N5 documentation](https://github.com/saalfeldlab/n5-jpeg/issues/1), [Zarr codec](https://github.com/zarr-developers/zarr-extensions/issues/15) |
36
+ | `bzip2` | no | | No equivalent Zarr codec |
37
+
38
+ ## Stores
39
+
40
+ `N5WrapperStore` allows reading N5 data with DEFAULT-mode blocks through any Zarr store by converting metadata on the fly.
41
+ By default, this does not replicate the N5 behaviour of inferring an empty group where a metadata document does not exist.
42
+ To achieve this, wrap it in the provided `ImplicitGroupWrapperStore`.
43
+
44
+ ## Tools
45
+
46
+ This package provides `n5tozarr`, a command-line interface for converting N5 data to Zarr in-place.
47
+ The N5 metadata are left untouched, and no chunk data is altered, moved, or copied.
48
+ A `zarr.json` file is simply added to each Zarr node.
49
+
50
+ N5 attributes are extracted and added to the `zarr.json` attributes.
51
+
52
+ The full N5 metadata document is accessible inside the `zarr.json` in an attribute called `_n5`.
53
+ If a directory/prefix had no N5 metadata document and the existence of an N5 group was inferred,
54
+ the `zarr.json` attribute `_implicit` will be `true`.
55
+
56
+ ## Contributing
57
+
58
+ Use [`uv`](https://docs.astral.sh/uv/) for project management.
59
+
60
+ Use [`just`](https://github.com/casey/just) for common development tasks.
@@ -0,0 +1,50 @@
1
+ # zarr-python-n5
2
+
3
+ [N5](https://github.com/saalfeldlab/n5) utilities for [zarr-python](https://github.com/zarr-developers/zarr-python).
4
+
5
+ - Documentation: <https://zarr-python-n5.readthedocs.io>
6
+
7
+ ## Codecs
8
+
9
+ ### N5 Default Codec
10
+
11
+ [As described here](https://github.com/zarr-developers/zarr-extensions/tree/main/codecs/n5_default).
12
+
13
+ Only whole-chunk reading is supported.
14
+
15
+ #### N5 Compressor support
16
+
17
+ | N5 compressor | Supported | Zarr bytes-to-bytes codec | Notes |
18
+ | ------------- | --------- | ------------------------- | ----- |
19
+ | `raw` | yes | n/a | Equivalent to omitted bytes-to-bytes codec |
20
+ | `blosc` | yes | `blosc` | |
21
+ | `gzip` | yes | `gzip` | |
22
+ | `zstd` | yes | `zstd` | |
23
+ | `lz4` | no | | [Incompatible codecs](https://github.com/zarr-developers/numcodecs/issues/175) |
24
+ | `xz` | no | | No equivalent Zarr codec |
25
+ | `jpeg` | no | | Needs [N5 documentation](https://github.com/saalfeldlab/n5-jpeg/issues/1), [Zarr codec](https://github.com/zarr-developers/zarr-extensions/issues/15) |
26
+ | `bzip2` | no | | No equivalent Zarr codec |
27
+
28
+ ## Stores
29
+
30
+ `N5WrapperStore` allows reading N5 data with DEFAULT-mode blocks through any Zarr store by converting metadata on the fly.
31
+ By default, this does not replicate the N5 behaviour of inferring an empty group where a metadata document does not exist.
32
+ To achieve this, wrap it in the provided `ImplicitGroupWrapperStore`.
33
+
34
+ ## Tools
35
+
36
+ This package provides `n5tozarr`, a command-line interface for converting N5 data to Zarr in-place.
37
+ The N5 metadata are left untouched, and no chunk data is altered, moved, or copied.
38
+ A `zarr.json` file is simply added to each Zarr node.
39
+
40
+ N5 attributes are extracted and added to the `zarr.json` attributes.
41
+
42
+ The full N5 metadata document is accessible inside the `zarr.json` in an attribute called `_n5`.
43
+ If a directory/prefix had no N5 metadata document and the existence of an N5 group was inferred,
44
+ the `zarr.json` attribute `_implicit` will be `true`.
45
+
46
+ ## Contributing
47
+
48
+ Use [`uv`](https://docs.astral.sh/uv/) for project management.
49
+
50
+ Use [`just`](https://github.com/casey/just) for common development tasks.
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "zarr-n5"
3
+ version = "0.1.0"
4
+ description = "Utilities for accessing N5 data through zarr v3."
5
+ readme = "README.md"
6
+ authors = [{ name = "Chris Barnes", email = "chris.barnes@gerbi-gmb.de" }]
7
+ requires-python = ">=3.12,<4.0"
8
+ dependencies = ["zarr>=3.1.5"]
9
+
10
+ [project.entry-points."zarr.codecs"]
11
+ "n5_default" = "zarr_n5:N5DefaultCodec"
12
+
13
+ [project.scripts]
14
+ n5tozarr = "zarr_n5.cli.convert:main"
15
+
16
+
17
+ [build-system]
18
+ requires = ["uv_build>=0.9.8,<0.10.0"]
19
+ build-backend = "uv_build"
20
+
21
+ [dependency-groups]
22
+ dev = [
23
+ { include-group = "lint" },
24
+ { include-group = "test" },
25
+ { include-group = "doc" },
26
+ ]
27
+ doc = [
28
+ "pdoc>=16.0.0",
29
+ ]
30
+ lint = [
31
+ "mypy>=1.19.1",
32
+ "ruff>=0.15.6",
33
+ ]
34
+ test = [
35
+ "pytest>=9.0.2",
36
+ "tensorstore>=0.1.82",
37
+ ]
@@ -0,0 +1,13 @@
1
+ """
2
+ Utilities for working with [N5](https://github.com/saalfeldlab/n5) data through [zarr-python](https://github.com/zarr-developers/zarr-python).
3
+ """
4
+
5
+ from zarr.registry import register_codec
6
+
7
+ from .codec.default import N5DefaultCodec
8
+ from .storage.n5 import N5WrapperStore
9
+ from .storage.implicit import ImplicitGroupWrapperStore
10
+
11
+ __all__ = ["N5WrapperStore", "ImplicitGroupWrapperStore", "N5DefaultCodec"]
12
+
13
+ register_codec("n5_default", N5DefaultCodec, qualname="zarr_n5.N5DefaultCodec")
@@ -0,0 +1 @@
1
+ """Modules for command line interfaces."""
@@ -0,0 +1,33 @@
1
+ """n5tozarr command line interface."""
2
+
3
+ from argparse import ArgumentParser
4
+ import asyncio
5
+
6
+ from ..convert import convert_hierarchy, DEFAULT_TASKS
7
+
8
+
9
def main(raw_args=None):
    """Entry point of the `n5tozarr` command line tool.

    Builds the argument parser, parses `raw_args` (handed straight to
    `ArgumentParser.parse_args`) and runs `convert_hierarchy` to completion
    on a fresh asyncio event loop.
    """
    cli = ArgumentParser("n5tozarr")

    # positional arguments
    cli.add_argument("url", help="URL to Zarr store, using fsspec format")
    cli.add_argument(
        "path", nargs="?", help="paths within the Zarr store to process"
    )

    # options
    cli.add_argument(
        "-t", "--tasks", default=DEFAULT_TASKS, type=int, help="asynchronous task count"
    )
    cli.add_argument(
        "-d", "--max-depth", type=int, help="how far to recurse; default no maximum"
    )
    cli.add_argument(
        "-I",
        "--no-infer-groups",
        help="do not infer N5 groups from empty directories/ prefixes",
        action="store_true",
    )

    ns = cli.parse_args(raw_args)

    # the flag is negative on the command line, positive in the API
    infer_groups = not ns.no_infer_groups
    start_path = ns.path or ""

    asyncio.run(
        convert_hierarchy(ns.url, start_path, infer_groups, ns.max_depth, ns.tasks)
    )
@@ -0,0 +1,5 @@
1
+ """Zarr codecs."""
2
+
3
+ from .default import N5DefaultCodec
4
+
5
+ __all__ = ["N5DefaultCodec"]
@@ -0,0 +1,262 @@
1
+ """
2
+ N5 Default codec module.
3
+ """
4
+
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from typing import Self
8
+
9
+ from zarr.abc.codec import ArrayBytesCodec, Codec, BytesBytesCodec, CodecPipeline
10
+ from zarr.core.array_spec import ArraySpec
11
+ from zarr.core.buffer.core import Buffer, NDBuffer
12
+ from zarr.core.chunk_grids import ChunkGrid
13
+ from zarr.core.dtype.wrapper import TBaseDType, ZDType, TBaseScalar
14
+ from zarr.core.common import JSON, parse_named_configuration
15
+ from zarr.core.metadata.v3 import parse_codecs
16
+ from zarr.codecs import BytesCodec, Endian, TransposeCodec
17
+ from zarr.registry import get_pipeline_class
18
+
19
+ from ..metadata import COMPATIBLE_DATA_TYPES
20
+
21
+ from ..util import N5BlockHeader
22
+
23
+ __all__ = ["N5DefaultCodec"]
24
+
25
+ N5_DEFAULT_NAME = "n5_default"
26
+ N5_ENDIAN = Endian.big
27
+
28
+
29
def check_valid_transpose(codec: Codec):
    """Raise `ValueError` unless `codec` is a `TransposeCodec` whose order
    equals its own axes sorted in descending order."""
    if isinstance(codec, TransposeCodec):
        fully_reversed = tuple(sorted(codec.order, reverse=True))
        if codec.order != fully_reversed:
            raise ValueError("not a full transpose")
        return
    raise ValueError("not transpose codec")
34
+
35
+
36
def check_valid_bytes(codec: Codec):
    """Raise `ValueError` unless `codec` is a `BytesCodec` whose endianness
    is either unset or equal to `N5_ENDIAN`."""
    if isinstance(codec, BytesCodec):
        # an unset endianness is accepted as-is
        if codec.endian is None or codec.endian == N5_ENDIAN:
            return
        raise ValueError("bytes codec must be big-endian")
    raise ValueError("not bytes codec")
41
+
42
+
43
def check_valid_compressor(codec: Codec):
    """Raise `ValueError` unless `codec` is a bytes-to-bytes codec."""
    if isinstance(codec, BytesBytesCodec):
        return
    raise ValueError("codec is not bytes-to-bytes")
46
+
47
+
48
+ CodecTuple = (
49
+ tuple[TransposeCodec, BytesCodec]
50
+ | tuple[TransposeCodec, BytesCodec, BytesBytesCodec]
51
+ )
52
+
53
+
54
+ @dataclass(frozen=True)
55
+ class N5DefaultCodec(ArrayBytesCodec):
56
+ """Zarr codec for default-mode N5 data.
57
+
58
+ Only full-chunk reads are supported.
59
+ Use the `N5DefaultCodec.from_compressor` constructor if initialising manually.
60
+
61
+ - Reads and validates the N5 block header
62
+ - Applies the wrapped codecs to the N5 block body
63
+ - Truncates or pads the resulting array to match the requested chunk
64
+
65
+ Should be the only codec present.
66
+ """
67
+
68
+ codecs: CodecTuple
69
+ """Codecs to be applied to the N5 block body."""
70
+
71
+ def __init__(self, *, codecs: Iterable[Codec | dict[str, JSON]]) -> None:
72
+ cs = parse_codecs(codecs)
73
+ if not 2 <= len(cs) <= 3:
74
+ raise ValueError(f"expected 2-3 codecs, got {len(cs)}")
75
+ check_valid_transpose(cs[0])
76
+ check_valid_bytes(cs[1])
77
+ if len(cs) > 2:
78
+ check_valid_compressor(cs[2])
79
+
80
+ object.__setattr__(self, "codecs", cs)
81
+
82
+ @property
83
+ def codec_pipeline(self) -> CodecPipeline:
84
+ """Get the `CodecPipeline` comprising the wrapped codecs."""
85
+ return get_pipeline_class().from_codecs(self.codecs)
86
+
87
+ @classmethod
88
+ def from_compressor(cls, ndim: int, compressor: BytesBytesCodec | None = None):
89
+ """Construct the codec from minimal information."""
90
+ transpose = cls.make_transpose(ndim)
91
+ endian = cls.make_bytes()
92
+ codecs: CodecTuple
93
+ if compressor is None:
94
+ codecs = (transpose, endian)
95
+ else:
96
+ codecs = (transpose, endian, compressor)
97
+ return cls(codecs=codecs)
98
+
99
+ @classmethod
100
+ def make_transpose(cls, ndim: int) -> TransposeCodec:
101
+ """Generate the `TransposeCodec` needed for this data.
102
+
103
+ N5 data is always fully transposed.
104
+ """
105
+ order = list(range(ndim))
106
+ return TransposeCodec(order=tuple(reversed(order)))
107
+
108
+ @classmethod
109
+ def make_bytes(cls) -> BytesCodec:
110
+ """
111
+ Generate the `BytesCodec` needed for this data.
112
+
113
+ N5 data is always big-endian.
114
+ """
115
+ return BytesCodec(endian=N5_ENDIAN)
116
+
117
+ def compute_encoded_size(
118
+ self, input_byte_length: int, chunk_spec: ArraySpec
119
+ ) -> int:
120
+ header_length = N5BlockHeader.calc_size(chunk_spec.ndim, False)
121
+
122
+ for c in self.codecs:
123
+ input_byte_length = c.compute_encoded_size(input_byte_length, chunk_spec)
124
+ chunk_spec = c.resolve_metadata(chunk_spec)
125
+
126
+ return input_byte_length + header_length
127
+
128
+ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
129
+ for c in self.codecs:
130
+ chunk_spec = c.resolve_metadata(chunk_spec)
131
+ return chunk_spec
132
+
133
+ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
134
+ transpose = self.make_transpose(array_spec.ndim).evolve_from_array_spec(
135
+ array_spec
136
+ )
137
+ endian = self.make_bytes().evolve_from_array_spec(array_spec)
138
+
139
+ codecs: CodecTuple
140
+ match len(self.codecs):
141
+ case 2:
142
+ codecs = (transpose, endian)
143
+ case 3:
144
+ compressor: BytesBytesCodec = self.codecs[2].evolve_from_array_spec( # type:ignore
145
+ array_spec
146
+ )
147
+ codecs = (transpose, endian, compressor)
148
+ case _:
149
+ raise ValueError("unsupported number of codecs")
150
+
151
+ return type(self)(codecs=codecs)
152
+
153
+ def validate(
154
+ self,
155
+ *,
156
+ shape: tuple[int, ...],
157
+ dtype: ZDType[TBaseDType, TBaseScalar],
158
+ chunk_grid: ChunkGrid,
159
+ ) -> None:
160
+ expected_ndim = len(self.codecs[0].order)
161
+ if len(shape) != expected_ndim:
162
+ raise ValueError(f"array is {len(shape)}D, codec is {expected_ndim}D")
163
+ if dtype._zarr_v3_name not in COMPATIBLE_DATA_TYPES:
164
+ raise ValueError(f"N5 does not support data type {dtype._zarr_v3_name}")
165
+
166
+ return super().validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid)
167
+
168
+ async def _decode_single(
169
+ self, chunk_data: Buffer, chunk_spec: ArraySpec
170
+ ) -> NDBuffer:
171
+ b = chunk_data.as_buffer_like()
172
+ header = N5BlockHeader.from_bytes(b)
173
+ offset = header.size()
174
+
175
+ body_buf = chunk_data[offset:]
176
+ body_nd = chunk_spec.prototype.nd_buffer.empty(
177
+ header.shape, chunk_spec.dtype.to_native_dtype(), chunk_spec.order
178
+ )
179
+ if header.shape == chunk_spec.shape:
180
+ body_spec = chunk_spec
181
+ all_eq = True
182
+ else:
183
+ body_spec = ArraySpec(
184
+ header.shape,
185
+ chunk_spec.dtype,
186
+ chunk_spec.fill_value,
187
+ chunk_spec.config,
188
+ chunk_spec.prototype,
189
+ )
190
+ all_eq = False
191
+ maybe_body_nd, *_ = await self.codec_pipeline.decode([(body_buf, body_spec)])
192
+ # TODO: use codec_pipeline.read() instead; this should avoid the copy for truncated-block cases
193
+ if maybe_body_nd is None:
194
+ raise RuntimeError("unexpected nullish buffer")
195
+ else:
196
+ body_nd = maybe_body_nd
197
+
198
+ if all_eq:
199
+ # don't need to truncate or pad
200
+ return body_nd
201
+
202
+ # whether we can get the chunk we want by trimming down the N5 block body
203
+ can_trim = True
204
+
205
+ min_shape = []
206
+ slice_lst = []
207
+ for hs, cs in zip(header.shape, chunk_spec.shape):
208
+ if cs > hs:
209
+ # requested chunk is larger than the N5 block in some dimension
210
+ can_trim = False
211
+ min_len = min(hs, cs)
212
+ min_shape.append(min_len)
213
+ slice_lst.append(slice(0, min_len))
214
+
215
+ slicing = tuple(slice_lst)
216
+
217
+ if can_trim:
218
+ return body_nd[slicing]
219
+
220
+ out = chunk_spec.prototype.nd_buffer.create(
221
+ shape=chunk_spec.shape,
222
+ dtype=chunk_spec.dtype.to_native_dtype(),
223
+ order=chunk_spec.order,
224
+ fill_value=chunk_spec.fill_value,
225
+ )
226
+ out[slicing] = body_nd[slicing]
227
+ return out
228
+
229
+ # async def _encode_single(
230
+ # self, chunk_data: NDBuffer, chunk_spec: ArraySpec
231
+ # ) -> Buffer | None:
232
+ # header = N5BlockHeader(N5Mode.DEFAULT, chunk_spec.shape)
233
+ # for c in self.codecs:
234
+ # chunk_data = c._encode_single(chunk_data, chunk_spec) # type:ignore
235
+ # chunk_spec = c.resolve_metadata(chunk_spec)
236
+
237
+ # buf: Buffer = chunk_data # type: ignore
238
+
239
+ # bio = BytesIO()
240
+ # bio.write(header.to_bytes())
241
+ # # TODO: avoid this copy?
242
+ # bio.write(buf.as_buffer_like())
243
+ # return Buffer.from_bytes(bio.getbuffer())
244
+
245
+ @classmethod
246
+ def from_dict(
247
+ cls,
248
+ data: dict[str, JSON],
249
+ ) -> Self:
250
+ _, configuration_parsed = parse_named_configuration(
251
+ data, N5_DEFAULT_NAME, require_configuration=True
252
+ )
253
+
254
+ return cls(**configuration_parsed) # type: ignore[arg-type]
255
+
256
+ def to_dict(
257
+ self,
258
+ ) -> dict[str, JSON]:
259
+ return {
260
+ "name": N5_DEFAULT_NAME,
261
+ "configuration": {"codecs": [c.to_dict() for c in self.codecs]},
262
+ }
@@ -0,0 +1,7 @@
1
+ """Useful constant values."""
2
+
3
+ N5_METADATA_KEY = "attributes.json"
4
+ """Object name for N5 metadata files."""
5
+
6
+ ZARR_V3_METADATA_KEY = "zarr.json"
7
+ """Object name for Zarr v3 metadata files."""
@@ -0,0 +1,103 @@
1
+ import asyncio
2
+ import logging
3
+
4
+ import zarr.api.asynchronous as zarr_async
5
+ from zarr.abc.store import Store
6
+ from zarr.core.metadata.io import save_metadata
7
+ from zarr.storage import StoreLike, StorePath
8
+ from zarr.storage._common import make_store
9
+ from zarr.core.group import AsyncGroup
10
+ from zarr.core.array import AsyncArray
11
+ from .storage import ImplicitGroupWrapperStore, N5WrapperStore
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ __all__ = ["N5ToZarr", "convert_hierarchy"]
16
+
17
+ DEFAULT_TASKS = 10
18
+
19
+
20
class Finished:
    """Marker object put on the conversion queue once every node has been enqueued."""
22
+
23
+
24
class N5ToZarr:
    """Writes Zarr metadata documents next to the nodes of an N5 hierarchy.

    Nodes are read through an `N5WrapperStore` (optionally wrapped in an
    `ImplicitGroupWrapperStore`) and their metadata is saved to the
    underlying store.
    """

    def __init__(self, store: Store, infer_groups: bool = True) -> None:
        """
        Args:
            store: the store holding the N5 data; Zarr metadata is written to it.
            infer_groups: whether to read through an `ImplicitGroupWrapperStore`.
        """
        self.inner_store = store

        self.n5_store: Store
        if infer_groups:
            self.n5_store = ImplicitGroupWrapperStore(N5WrapperStore(store))
        else:
            self.n5_store = N5WrapperStore(store)

        self.queue: asyncio.Queue[AsyncArray | AsyncGroup | Finished] = asyncio.Queue()

    async def convert_hierarchy(
        self, path: str = "", max_depth: int | None = -1, n_tasks=10
    ):
        """Convert the node at `path` and, for groups, its descendants.

        Args:
            path: location of the starting node within the store.
            max_depth: how many levels below `path` to convert. 0 converts
                only the node at `path`, 1 also its direct children, and so
                on. `None` or a negative value means no limit.
            n_tasks: number of concurrent worker tasks; at least one is
                always started.

        Returns:
            The number of nodes converted, including the node at `path`.
            If that node is skipped, nothing below it is visited and 0 is
            returned.
        """
        member = await zarr_async.open(store=self.n5_store, path=path)
        converted_root = await self.convert_member(member)
        if converted_root == 0:
            return 0
        if isinstance(member, AsyncArray):
            # arrays have no children
            return converted_root

        if max_depth is not None and max_depth < 0:
            # negative depths (including the default) mean "no limit"
            max_depth = None
        if max_depth == 0:
            return converted_root

        # `members` counts depth from the group's children, i.e. one level
        # below this method's own `max_depth`
        child_depth = None if max_depth is None else max_depth - 1

        # start from an empty queue so that a sentinel left over from an
        # earlier call cannot stop the new workers prematurely
        self.queue = asyncio.Queue()
        workers = [self._spawn_worker() for _ in range(max(1, n_tasks))]

        enqueued = 0
        try:
            async for _, child in member.members(child_depth):
                await self.queue.put(child)
                enqueued += 1
        finally:
            # always release the workers, even if listing failed part-way
            await self.queue.put(Finished())
        logger.info("Enqueued %s nodes", enqueued)

        converted_children = sum(await asyncio.gather(*workers))
        total = converted_root + converted_children
        logger.info("Converted %s nodes", total)
        return total

    def _spawn_worker(self, name: str | None = None):
        """Schedule a task for execution wrapping a worker function."""
        return asyncio.create_task(self._worker(), name=name)

    async def _worker(self):
        """Create a worker which reads Zarr nodes from the queue and processes them.

        Returns the number of nodes this worker converted.
        """
        total = 0
        while True:
            value = await self.queue.get()
            if isinstance(value, Finished):
                # hand the sentinel on so the remaining workers stop as well
                await self.queue.put(value)
                return total
            total += await self.convert_member(value)
            self.queue.task_done()

    async def convert_member(self, member: AsyncArray | AsyncGroup) -> int:
        """Returns 0 or 1 for whether the node was skipped or converted.

        A node is skipped when it can already be opened as a Zarr node
        directly from the underlying store.
        """
        try:
            _ = await zarr_async.open(
                store=self.inner_store, mode="r", path=member.path
            )
        except Exception:
            # Any failure to open is taken to mean "no Zarr node here yet";
            # keep the reason available for debugging.
            logger.debug(
                "No existing zarr node found at %s", member.path, exc_info=True
            )
        else:
            logger.info("Found existing zarr node at %s, ignoring", member.path)
            return 0

        await save_metadata(
            StorePath(store=self.inner_store, path=member.path), member.metadata, False
        )
        logger.info("Converted N5 entry %s to Zarr", member.path)
        return 1
92
+
93
+
94
async def convert_hierarchy(
    store: StoreLike,
    path: str = "",
    infer_groups: bool = True,
    max_depth=None,
    n_tasks=DEFAULT_TASKS,
) -> int:
    """Open `store` in "r+" mode and run an `N5ToZarr` conversion from `path`.

    `infer_groups` is forwarded to the `N5ToZarr` constructor; `max_depth`
    and `n_tasks` are forwarded to `N5ToZarr.convert_hierarchy`, whose
    result is returned.
    """
    writable_store = await make_store(store, mode="r+")
    return await N5ToZarr(writable_store, infer_groups).convert_hierarchy(
        path, max_depth=max_depth, n_tasks=n_tasks
    )
@@ -0,0 +1,194 @@
1
+ """
2
+ Utilities for parsing, representing, and converting N5 metadata.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ from copy import deepcopy
7
+ import itertools
8
+ from typing import Any, TYPE_CHECKING, Self
9
+ from zarr.core.group import GroupMetadata
10
+ from zarr.core.metadata.v3 import ArrayV3Metadata
11
+ from zarr.core.dtype import ZDType
12
+ from zarr.core import dtype as zdt
13
+ from zarr.core.chunk_grids import RegularChunkGrid
14
+ from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
15
+ from zarr.abc.codec import BytesBytesCodec
16
+ from zarr.codecs import blosc
17
+ from zarr.codecs import GzipCodec, ZstdCodec
18
+
19
+ from .util import N5Mode
20
+
21
+ if TYPE_CHECKING:
22
+ from typing import Self
23
+ from zarr.core.common import JSON
24
+
25
+ __all__ = ["N5GroupMetadata", "N5ArrayMetadata", "COMPATIBLE_DATA_TYPES"]
26
+
27
+ COMPATIBLE_DATA_TYPES: dict[str, tuple[ZDType, int]] = {
28
+ "uint8": (zdt.UInt8(), 1),
29
+ "uint16": (zdt.UInt16(), 2),
30
+ "uint32": (zdt.UInt32(), 4),
31
+ "uint64": (zdt.UInt64(), 8),
32
+ "int8": (zdt.Int8(), 1),
33
+ "int16": (zdt.Int16(), 2),
34
+ "int32": (zdt.Int32(), 4),
35
+ "int64": (zdt.Int64(), 8),
36
+ "float32": (zdt.Float32(), 4),
37
+ "float64": (zdt.Float64(), 8),
38
+ }
39
+ """Data types which exist in both Zarr and N5.
40
+
41
+ Maps to the Zarr data type and item size."""
42
+
43
+
44
+ class N5GroupMetadata:
45
+ def __init__(
46
+ self, n5: str | None = None, attrs: dict[str, JSON] | None = None
47
+ ) -> None:
48
+ self.n5: str | None = n5
49
+ self.attributes: dict[str, Any] = attrs or dict()
50
+
51
+ def to_jso(self) -> dict[str, JSON]:
52
+ out = deepcopy(self.attributes)
53
+ if self.n5 is not None:
54
+ out["n5"] = self.n5
55
+ return out
56
+
57
+ def is_root(self):
58
+ return self.n5 is not None
59
+
60
+ @classmethod
61
+ def from_jso(cls, jso: dict[str, JSON]) -> Self:
62
+ n5 = jso.pop("n5", None)
63
+ if n5 is not None and not isinstance(n5, str):
64
+ raise ValueError("n5 attribute is not a string")
65
+ return cls(n5, jso)
66
+
67
+ def to_zarr(self):
68
+ attrs = deepcopy(self.attributes)
69
+ attrs["_n5"] = self.to_jso()
70
+ return GroupMetadata(attrs)
71
+
72
+
73
+ class N5ArrayMetadata(N5GroupMetadata):
74
+ def __init__(
75
+ self,
76
+ dimensions: list[int],
77
+ block_size: list[int],
78
+ data_type: str,
79
+ compression: dict[str, Any],
80
+ n5: str | None = None,
81
+ attrs: dict[str, JSON] | None = None,
82
+ ):
83
+ super().__init__(n5, attrs)
84
+ if len(dimensions) != len(block_size):
85
+ raise ValueError(
86
+ f"dimensions {dimensions} and block size {block_size} must have same dimensionality"
87
+ )
88
+
89
+ if any(not is_nonzero_int(s) for s in itertools.chain(dimensions, block_size)):
90
+ raise ValueError("dimensions and block size must be positive integers")
91
+
92
+ self.dimensions = dimensions
93
+ self.block_size = block_size
94
+ self.data_type = data_type
95
+ ctype = compression.get("type")
96
+ if not isinstance(ctype, str):
97
+ raise ValueError(f"compression must have a string type, got {ctype}")
98
+ self.compression = compression
99
+
100
+ def to_jso(self) -> dict[str, JSON]:
101
+ jso = super().to_jso()
102
+ jso["dimensions"] = self.dimensions
103
+ jso["blockSize"] = self.block_size
104
+ jso["dataType"] = self.data_type
105
+ jso["compression"] = self.compression
106
+ return jso
107
+
108
+ @classmethod
109
+ def from_group(cls, grp: N5GroupMetadata) -> Self:
110
+ attrs = deepcopy(grp.attributes)
111
+ dimensions = attrs.pop("dimensions")
112
+ block_size = attrs.pop("blockSize")
113
+ data_type = attrs.pop("dataType")
114
+ compression = attrs.pop("compression")
115
+ return cls(dimensions, block_size, data_type, compression, grp.n5, attrs)
116
+
117
+ @classmethod
118
+ def from_jso(cls, jso: dict[str, JSON]) -> Self:
119
+ grp = super().from_jso(jso)
120
+ return cls.from_group(grp)
121
+
122
+ def to_zarr(self, mode: N5Mode = N5Mode.DEFAULT):
123
+ from .codec.default import N5DefaultCodec
124
+
125
+ if mode != N5Mode.DEFAULT:
126
+ raise NotImplementedError("Only default-mode N5 is supported")
127
+ compressor = self._to_zarr_codec()
128
+ attrs = deepcopy(self.attributes)
129
+ attrs["_n5"] = self.to_jso()
130
+ return ArrayV3Metadata(
131
+ shape=self.dimensions,
132
+ data_type=COMPATIBLE_DATA_TYPES[self.data_type][0],
133
+ chunk_grid=RegularChunkGrid(chunk_shape=self.block_size),
134
+ chunk_key_encoding=V2ChunkKeyEncoding("/"),
135
+ fill_value=0,
136
+ dimension_names=None,
137
+ codecs=[
138
+ N5DefaultCodec.from_compressor(
139
+ len(self.dimensions),
140
+ compressor,
141
+ ),
142
+ ],
143
+ attributes=attrs,
144
+ )
145
+
146
+ def _to_zarr_codec(self) -> BytesBytesCodec | None:
147
+ tp = self.compression.get("type")
148
+ match tp:
149
+ case "raw":
150
+ return None
151
+ case "blosc":
152
+ item_size = COMPATIBLE_DATA_TYPES[self.data_type][1]
153
+ return parse_blosc(self.compression, item_size)
154
+ case "gzip":
155
+ return parse_gzip(self.compression)
156
+ case "zstd":
157
+ return parse_zstd(self.compression)
158
+ case _:
159
+ raise ValueError(f"unsupported codec with type {tp}")
160
+
161
+
162
def is_nonzero_int(n) -> bool:
    """Return True only if `n` is an `int` instance greater than zero."""
    return isinstance(n, int) and n > 0
166
+
167
+
168
def parse_blosc(d: dict[str, JSON], typesize: int | None) -> blosc.BloscCodec:
    """Build a `BloscCodec` from an N5 blosc compression entry.

    Keys missing from `d` fall back to `cname="blosclz"`, `clevel=6`,
    `blocksize=0` and `shuffle=0`; the integer shuffle is converted with
    `BloscShuffle.from_int`.
    """
    shuffle = blosc.BloscShuffle.from_int(d.get("shuffle", 0))  # type: ignore
    return blosc.BloscCodec(
        typesize=typesize,
        cname=d.get("cname", "blosclz"),  # type: ignore
        clevel=d.get("clevel", 6),  # type:ignore
        blocksize=d.get("blocksize", 0),  # type:ignore
        shuffle=shuffle,
    )
181
+
182
+
183
def parse_gzip(d: dict[str, JSON]) -> GzipCodec:
    """Build a `GzipCodec` from an N5 gzip compression entry.

    A missing level, or a level of -1, yields the codec's own default
    level; any other level is converted with `int` and passed on.
    """
    requested = d.get("level", -1)
    if requested != -1:
        return GzipCodec(level=int(requested))  # type: ignore
    return GzipCodec()
190
+
191
+
192
def parse_zstd(d: dict[str, JSON]) -> ZstdCodec:
    """Build a `ZstdCodec` from an N5 zstd compression entry (level defaults to 3)."""
    return ZstdCodec(level=d.get("level", 3))  # type: ignore
File without changes
@@ -0,0 +1,8 @@
1
+ """
2
+ Storage wrappers for N5 data.
3
+ """
4
+
5
+ from .n5 import N5WrapperStore
6
+ from .implicit import ImplicitGroupWrapperStore
7
+
8
+ __all__ = ["N5WrapperStore", "ImplicitGroupWrapperStore"]
@@ -0,0 +1,78 @@
1
+ """
2
+ Module containing `ImplicitGroupWrapperStore`,
3
+ for inferring groups with missing metadata.
4
+ """
5
+
6
+ from typing import Final
7
+ from collections.abc import Iterable
8
+ import json
9
+
10
+ from zarr.abc.store import (
11
+ Store,
12
+ ByteRequest,
13
+ )
14
+ from zarr.storage import WrapperStore
15
+ from zarr.core.group import GroupMetadata
16
+ from zarr.core.buffer import BufferPrototype, Buffer
17
+
18
+ from ..util import slice_buf, is_zarr3_metadata
19
+
20
+ __all__ = ["ImplicitGroupWrapperStore"]
21
+
22
+
23
def make_implicit_group_bytes() -> bytes:
    """Serialise a default `GroupMetadata` carrying `"_implicit": true` to JSON bytes."""
    group = GroupMetadata()
    group.attributes["_implicit"] = True
    document = json.dumps(group.to_dict())
    return document.encode()
28
+
29
+
30
+ IMPLICIT_GROUP_BYTES: Final[bytes] = make_implicit_group_bytes()
31
+
32
+
33
+ class ImplicitGroupWrapperStore[T: Store](WrapperStore):
34
+ """A store which supplies empty group metadata documents if they do not exist.
35
+
36
+ Used to replicate N5's behaviour where any directory (or prefix) is a valid group,
37
+ even when no metadata document exists.
38
+ Wrap over an `N5WrapperStore`.
39
+
40
+ Inferred group metadata's attributes will contain the key/value `"_implicit": true`.
41
+ """
42
+
43
+ _store: T
44
+
45
+ async def get(
46
+ self,
47
+ key: str,
48
+ prototype: BufferPrototype,
49
+ byte_range: ByteRequest | None = None,
50
+ ) -> Buffer | None:
51
+ res = await self._store.get(key, prototype, byte_range)
52
+ if res is not None or not is_zarr3_metadata(key):
53
+ return res
54
+
55
+ b = slice_buf(IMPLICIT_GROUP_BYTES, byte_range)
56
+ return prototype.buffer.from_bytes(b)
57
+
58
+ async def get_partial_values(
59
+ self,
60
+ prototype: BufferPrototype,
61
+ key_ranges: Iterable[tuple[str, ByteRequest | None]],
62
+ ) -> list[Buffer | None]:
63
+ key_ranges = list(key_ranges)
64
+ reses = await super().get_partial_values(prototype, key_ranges)
65
+ out = []
66
+ for (key, byte_range), res in zip(key_ranges, reses):
67
+ if res is None and is_zarr3_metadata(key):
68
+ res = prototype.buffer.from_bytes(
69
+ slice_buf(IMPLICIT_GROUP_BYTES, byte_range)
70
+ )
71
+ out.append(res)
72
+
73
+ return out
74
+
75
+ async def exists(self, key: str) -> bool:
76
+ if is_zarr3_metadata(key):
77
+ return True
78
+ return await super().exists(key)
@@ -0,0 +1,159 @@
1
+ """
2
+ Module containing `N5WrapperStore`,
3
+ for silently converting N5 nodes to Zarr nodes.
4
+ """
5
+
6
+ from collections import defaultdict
7
+ from collections.abc import AsyncIterator, Iterable
8
+ from zarr.storage import WrapperStore
9
+ from zarr.abc.store import (
10
+ Store,
11
+ ByteRequest,
12
+ )
13
+ from zarr.core.buffer import Buffer, BufferPrototype
14
+ import json
15
+ import asyncio
16
+
17
+ from ..constants import N5_METADATA_KEY, ZARR_V3_METADATA_KEY
18
+ from ..metadata import N5GroupMetadata, N5ArrayMetadata
19
+ from ..util import slice_buf, is_zarr3_metadata, N5Mode
20
+
21
+
22
+ class N5WrapperStore[T: Store](WrapperStore):
23
+ """A read-only store for opening N5 hierarchies.
24
+
25
+ Requests for Zarr metadata documents are redirected to N5 attributes,
26
+ and Zarr metadata calculated on the fly.
27
+
28
+ Note that N5 attributes can be omitted in groups.
29
+ You may want to wrap this in an `ImplicitGroupWrapperStore` to replicate that behaviour.
30
+
31
+ Only compatible with DEFAULT-mode N5 arrays.
32
+ """
33
+
34
+ _store: T
35
+
36
+ def intercept_metadata(self, key: str) -> None | str:
37
+ """If the given key is for Zarr v3 metadata, return the key for N5 metadata in the equivalent node.
38
+
39
+ Otherwise, return None.
40
+ """
41
+ if "/" in key:
42
+ pref, fname = key.rsplit("/", 1)
43
+ else:
44
+ pref = None
45
+ fname = key
46
+
47
+ if fname != ZARR_V3_METADATA_KEY:
48
+ return None
49
+
50
+ if pref is None:
51
+ k2 = N5_METADATA_KEY
52
+ else:
53
+ k2 = f"{pref}/{N5_METADATA_KEY}"
54
+
55
+ return k2
56
+
57
+ async def get(
58
+ self,
59
+ key: str,
60
+ prototype: BufferPrototype,
61
+ byte_range: ByteRequest | None = None,
62
+ ) -> Buffer | None:
63
+ k2 = self.intercept_metadata(key)
64
+ if k2 is None:
65
+ return await self._store.get(key, prototype, byte_range)
66
+
67
+ b = await self._store.get(k2, prototype)
68
+
69
+ if b is None:
70
+ return None
71
+
72
+ d = json.loads(b.to_bytes())
73
+ n5_meta = N5GroupMetadata.from_jso(d)
74
+ try:
75
+ n5_meta = N5ArrayMetadata.from_group(n5_meta)
76
+ out_d = n5_meta.to_zarr(N5Mode.DEFAULT)
77
+ except KeyError:
78
+ out_d = n5_meta.to_zarr()
79
+
80
+ b2 = json.dumps(out_d.to_dict()).encode()
81
+ b2 = slice_buf(b2, byte_range)
82
+
83
+ return prototype.buffer.from_bytes(b2)
84
+
85
+ async def get_partial_values(
86
+ self,
87
+ prototype: BufferPrototype,
88
+ key_ranges: Iterable[tuple[str, ByteRequest | None]],
89
+ ) -> list[Buffer | None]:
90
+
91
+ # Split the key ranges into metadata requests and other (chunk) requests.
92
+ # We always need to read the whole N5 metadata file
93
+ # to convert it into Zarr v3 metadata before slicing it,
94
+ # so this prevents reading it multiple times.
95
+ meta_reqs: defaultdict[str, list[tuple[int, ByteRequest | None]]] = defaultdict(
96
+ list
97
+ )
98
+ other_reqs: list[tuple[int, tuple[str, ByteRequest | None]]] = []
99
+ count = 0
100
+ for idx, (key, byte_range) in enumerate(key_ranges):
101
+ if is_zarr3_metadata(key):
102
+ meta_reqs[key].append((idx, byte_range))
103
+ else:
104
+ other_reqs.append((idx, (key, byte_range)))
105
+ count += 1
106
+
107
+ other_reqs_fut = self._store.get_partial_values(
108
+ prototype, (tup[1] for tup in other_reqs)
109
+ )
110
+ meta_req_list = list(meta_reqs.items())
111
+ meta_reqs_fut = asyncio.gather(
112
+ *(self.get(k, prototype) for k, _ in meta_req_list)
113
+ )
114
+ # Gather all requests to run concurrently
115
+ other_res, meta_res = await asyncio.gather(other_reqs_fut, meta_reqs_fut)
116
+ out: list[None | Buffer] = [None for _ in range(count)]
117
+
118
+ # Slice and insert the metadata responses into the pre-allocated output list
119
+ for res, (_, meta_req) in zip(meta_res, meta_req_list):
120
+ if res is None:
121
+ continue
122
+ blike = res.as_buffer_like()
123
+ for idx, byte_range in meta_req:
124
+ out[idx] = Buffer.from_bytes(slice_buf(blike, byte_range))
125
+
126
+ # Insert the non-metadata responses into the output;
127
+ # these are already sliced by the underlying store.
128
+ for res, (idx, _) in zip(other_res, other_reqs):
129
+ out[idx] = res
130
+
131
+ return out
132
+
133
+ async def exists(self, key: str) -> bool:
134
+ k2 = self.intercept_metadata(key)
135
+ return await self._store.exists(k2 or key)
136
+
137
+ @property
138
+ def supports_writes(self) -> bool:
139
+ return False
140
+
141
+ @property
142
+ def supports_deletes(self) -> bool:
143
+ return False
144
+
145
+ async def delete(self, key: str) -> None:
146
+ raise NotImplementedError
147
+
148
+ @property
149
+ def supports_listing(self) -> bool:
150
+ return self._store.supports_listing
151
+
152
+ def list(self) -> AsyncIterator[str]:
153
+ return self._store.list()
154
+
155
+ def list_prefix(self, prefix: str) -> AsyncIterator[str]:
156
+ return self._store.list_prefix(prefix)
157
+
158
+ def list_dir(self, prefix: str) -> AsyncIterator[str]:
159
+ return self._store.list_dir(prefix)
@@ -0,0 +1,115 @@
1
+ """
2
+ General utilities.
3
+ """
4
+
5
+ from zarr.abc.store import (
6
+ ByteRequest,
7
+ RangeByteRequest,
8
+ OffsetByteRequest,
9
+ SuffixByteRequest,
10
+ )
11
+ from .constants import ZARR_V3_METADATA_KEY
12
+ from dataclasses import dataclass
13
+ from enum import IntEnum
14
+ from typing import Self, Any
15
+ import struct
16
+
17
+ __all__ = ["N5Mode", "N5BlockHeader"]
18
+
19
+
20
class N5Mode(IntEnum):
    """Integer code identifying the mode of an N5 block.

    The numeric value is what is stored in the ``mode`` field of the block
    header.
    """

    DEFAULT = 0
    VARLENGTH = 1
    OBJECT = 2
26
+
27
+
28
@dataclass
class N5BlockHeader:
    """Parsed representation of the N5 block header.

    The serialized layout is big-endian: the mode as an unsigned 16-bit
    integer, the number of dimensions as an unsigned 16-bit integer, one
    unsigned 32-bit integer per dimension and, for ``VARLENGTH`` headers,
    a trailing unsigned 32-bit element count.
    """

    mode: N5Mode
    """Stored as >u16"""

    shape: tuple[int, ...]
    """Length stored as >u16, elements stored as >u32"""

    num_elem: int | None = None
    """Stored as >u32 if mode == VARLENGTH"""

    def __post_init__(self) -> None:
        """Reject an element count on a header whose mode is not VARLENGTH.

        Raises:
            ValueError: if ``num_elem`` is set and ``mode`` is not VARLENGTH.
        """
        if self.num_elem is not None and self.mode != N5Mode.VARLENGTH:
            raise ValueError("num_elem must be None if mode is not VARLENGTH")

    @classmethod
    def calc_size(cls, ndim: int, is_varlength: bool = False) -> int:
        """Calculate the number of bytes in an N5 block header.

        Args:
            ndim: number of dimensions recorded in the header.
            is_varlength: whether the header carries the trailing element count.
        """
        # 2 bytes mode + 2 bytes dimension count + 4 bytes per dimension.
        base = 2 + 2 + 4 * ndim
        if is_varlength:
            # Trailing 4-byte element count.
            base += 4
        return base

    def size(self) -> int:
        """Determine the number of bytes this header will take."""
        return self.calc_size(len(self.shape), self.mode == N5Mode.VARLENGTH)

    @classmethod
    def from_bytes(cls, b: bytes) -> Self:
        """Parse a header from the start of ``b``.

        Bytes after the header are ignored. ``N5Mode(...)`` raises
        ``ValueError`` for an unknown mode number, and ``struct.error`` is
        raised when ``b`` is too short for the header it describes.
        """
        p = StructParser(b, ">")
        mode_num, ndim = p.unpack("HH")
        mode = N5Mode(mode_num)

        shape = p.unpack("I" * ndim)

        if mode == N5Mode.VARLENGTH:
            numel = p.unpack("I")[0]
        else:
            numel = None

        return cls(mode=mode, shape=shape, num_elem=numel)

    @property
    def ndim(self) -> int:
        """Number of dimensions, i.e. the length of ``shape``."""
        return len(self.shape)

    def to_bytes(self) -> bytes:
        """Serialize the header into its big-endian byte representation.

        Raises:
            ValueError: if ``mode`` is VARLENGTH but ``num_elem`` is None.
                Without this check the output would be 4 bytes shorter than
                ``size()`` reports and could not be read back by
                ``from_bytes``, which always expects the element count for
                VARLENGTH headers.
        """
        if self.mode == N5Mode.VARLENGTH and self.num_elem is None:
            raise ValueError("num_elem is required to serialize a VARLENGTH header")

        fmt = ">HH" + "I" * self.ndim
        args = [self.mode, self.ndim, *self.shape]
        if self.num_elem is not None:
            fmt += "I"
            args.append(self.num_elem)
        return struct.pack(fmt, *args)
83
+
84
+
85
class StructParser:
    """Sequential ``struct`` reader over a byte buffer.

    Each ``unpack`` call decodes values starting at the current ``offset``
    and then advances ``offset`` past the bytes it consumed. The ``endian``
    prefix given at construction is prepended to every format string.
    """

    def __init__(self, buf: bytes, endian: str = "") -> None:
        self.buf = buf
        self.endian = endian
        self.offset = 0

    def unpack(self, fmt: str) -> tuple[Any, ...]:
        """Decode ``fmt`` (without byte-order prefix) at the current offset."""
        full_fmt = f"{self.endian}{fmt}"
        start = self.offset
        stop = start + struct.calcsize(full_fmt)
        values = struct.unpack(full_fmt, self.buf[start:stop])
        # Only advance once decoding has succeeded.
        self.offset = stop
        return values
97
+
98
+
99
def slice_buf(b: bytes, byte_range: ByteRequest | None = None) -> bytes:
    """Optionally slice a byte buffer.

    Args:
        b: the buffer to slice.
        byte_range: which part of ``b`` to return. ``None`` returns ``b``
            unchanged; a ``RangeByteRequest`` returns ``b[start:end]``; an
            ``OffsetByteRequest`` returns everything from ``offset`` onward;
            a ``SuffixByteRequest`` returns the last ``suffix`` bytes.

    Returns:
        The requested portion of ``b``.

    Raises:
        TypeError: if ``byte_range`` is none of the types above.
    """
    if byte_range is None:
        return b
    if isinstance(byte_range, RangeByteRequest):
        return b[byte_range.start : byte_range.end]
    if isinstance(byte_range, OffsetByteRequest):
        return b[byte_range.offset :]
    if isinstance(byte_range, SuffixByteRequest):
        # Compute the start explicitly: ``b[-0:]`` would return the whole
        # buffer rather than the empty suffix that was asked for. Clamp at 0
        # so a suffix longer than the buffer yields the whole buffer.
        start = max(len(b) - byte_range.suffix, 0)
        return b[start:]

    raise TypeError(f"byte_range argument has unknown type {type(byte_range)}")
111
+
112
+
113
def is_zarr3_metadata(key: str) -> bool:
    """Whether the final ``/``-separated component of ``key`` is the Zarr v3 metadata key."""
    # rpartition leaves the whole key in the last slot when there is no "/".
    _, _, leaf = key.rpartition("/")
    return leaf == ZARR_V3_METADATA_KEY