tensogram-zarr 0.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ .claude/*
2
+ !.claude/commands/
3
+ .weave/
4
+ .sisyphus/
5
+ .coverage
6
+ *.dylib.dSYM/
7
+ **/target
8
+ **/pkg
9
+ **/build/
10
+ python/bindings/Cargo.lock
11
+
12
+ /docs/book
13
+ **/.venv
14
+ **/.ruff_cache
15
+ **/__pycache__
16
+ *.so
17
+ *.dylib
18
+ *.pyd
19
+ *.swp
20
+ *.swo
21
+ *~
22
+ .DS_Store
23
+ .idea/
24
+ rust/tensogram-grib/Cargo.lock
25
+ rust/tensogram-netcdf/Cargo.lock
26
+ rust/tensogram-wasm/Cargo.lock
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: tensogram-zarr
3
+ Version: 0.14.0
4
+ Summary: Zarr v3 store backend for tensogram .tgm files
5
+ Project-URL: Homepage, https://sites.ecmwf.int/docs/tensogram/main
6
+ Project-URL: Repository, https://github.com/ecmwf/tensogram
7
+ Project-URL: Documentation, https://sites.ecmwf.int/docs/tensogram/main
8
+ Author-email: ECMWF <software@ecmwf.int>
9
+ License-Expression: Apache-2.0
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Scientific/Engineering
14
+ Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
15
+ Requires-Python: >=3.10
16
+ Requires-Dist: numpy
17
+ Requires-Dist: tensogram<0.15,>=0.14.0
18
+ Requires-Dist: zarr>=3.0
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
21
+ Requires-Dist: pytest>=7.0; extra == 'dev'
22
+ Requires-Dist: ruff>=0.4; extra == 'dev'
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "tensogram-zarr"
7
+ version = "0.14.0"
8
+ description = "Zarr v3 store backend for tensogram .tgm files"
9
+ requires-python = ">=3.10"
10
+ license = "Apache-2.0"
11
+ authors = [{name = "ECMWF", email = "software@ecmwf.int"}]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "License :: OSI Approved :: Apache Software License",
15
+ "Programming Language :: Python :: 3",
16
+ "Topic :: Scientific/Engineering",
17
+ "Topic :: Scientific/Engineering :: Atmospheric Science",
18
+ ]
19
+ dependencies = [
20
+ "tensogram>=0.14.0,<0.15",
21
+ "zarr>=3.0",
22
+ "numpy",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://sites.ecmwf.int/docs/tensogram/main"
27
+ Repository = "https://github.com/ecmwf/tensogram"
28
+ Documentation = "https://sites.ecmwf.int/docs/tensogram/main"
29
+
30
+ [project.optional-dependencies]
31
+ dev = ["pytest>=7.0", "pytest-asyncio>=0.23", "ruff>=0.4"]
32
+
33
+ [tool.hatch.build.targets.wheel]
34
+ packages = ["src/tensogram_zarr"]
35
+
36
+ [tool.ruff]
37
+ line-length = 99
38
+ target-version = "py310"
39
+
40
+ [tool.ruff.lint]
41
+ select = ["E", "W", "F", "I", "N", "UP", "B", "SIM", "PT", "RUF"]
42
+ ignore = ["RUF012", "SIM117"]
43
+
44
+ [tool.pytest.ini_options]
45
+ testpaths = ["tests"]
46
+ asyncio_mode = "auto"
@@ -0,0 +1,34 @@
1
+ # (C) Copyright 2026- ECMWF and individual contributors.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation nor
7
+ # does it submit to any jurisdiction.
8
+
9
+ """Zarr v3 store backend for tensogram .tgm files.
10
+
11
+ Provides ``TensogramStore`` — a Zarr v3 ``Store`` that reads and writes
12
+ Tensogram wire-format (``.tgm``) files through the standard Zarr API.
13
+
14
+ Usage::
15
+
16
+ import zarr
17
+ from tensogram_zarr import TensogramStore
18
+
19
+ # Read existing .tgm through Zarr
20
+ store = TensogramStore.open_tgm("data.tgm")
21
+ root = zarr.open_group(store=store, mode="r")
22
+ arr = root["temperature"][:]
23
+
24
+ # Write new .tgm through Zarr
25
+ import numpy as np
26
+ store = TensogramStore("output.tgm", mode="w")
27
+ root = zarr.open_group(store=store, mode="w")
28
+ root.create_array("temperature", data=np.random.rand(100, 200).astype(np.float32))
29
+ store.close()
30
+ """
31
+
32
+ from tensogram_zarr.store import TensogramStore
33
+
34
+ __all__ = ["TensogramStore"]
@@ -0,0 +1,370 @@
1
+ # (C) Copyright 2026- ECMWF and individual contributors.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation nor
7
+ # does it submit to any jurisdiction.
8
+
9
+ """Bidirectional mapping between Tensogram and Zarr v3 metadata.
10
+
11
+ Converts TGM dtypes, descriptors, and global metadata into Zarr v3
12
+ ``zarr.json`` structures and back.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import math
19
+ from typing import Any
20
+
21
+ import numpy as np
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Dtype mapping
25
+ # ---------------------------------------------------------------------------
26
+
27
+ # TGM dtype string → (Zarr v3 data_type, numpy dtype string)
28
+ _TGM_TO_ZARR_DTYPE: dict[str, tuple[str, str]] = {
29
+ "float16": ("float16", "<f2"),
30
+ "bfloat16": ("bfloat16", "<V2"), # no native numpy; raw 2-byte
31
+ "float32": ("float32", "<f4"),
32
+ "float64": ("float64", "<f8"),
33
+ "complex64": ("complex64", "<c8"),
34
+ "complex128": ("complex128", "<c16"),
35
+ "int8": ("int8", "|i1"),
36
+ "int16": ("int16", "<i2"),
37
+ "int32": ("int32", "<i4"),
38
+ "int64": ("int64", "<i8"),
39
+ "uint8": ("uint8", "|u1"),
40
+ "uint16": ("uint16", "<u2"),
41
+ "uint32": ("uint32", "<u4"),
42
+ "uint64": ("uint64", "<u8"),
43
+ "bitmask": ("uint8", "|u1"), # bitmask exposed as uint8
44
+ }
45
+
46
+ # Zarr v3 data_type → TGM dtype string
47
+ _ZARR_TO_TGM_DTYPE: dict[str, str] = {
48
+ "float16": "float16",
49
+ "bfloat16": "bfloat16",
50
+ "float32": "float32",
51
+ "float64": "float64",
52
+ "complex64": "complex64",
53
+ "complex128": "complex128",
54
+ "int8": "int8",
55
+ "int16": "int16",
56
+ "int32": "int32",
57
+ "int64": "int64",
58
+ "uint8": "uint8",
59
+ "uint16": "uint16",
60
+ "uint32": "uint32",
61
+ "uint64": "uint64",
62
+ }
63
+
64
+ # numpy dtype → TGM dtype string (used on write path)
65
+ _NP_TO_TGM_DTYPE: dict[np.dtype, str] = {
66
+ np.dtype("float16"): "float16",
67
+ np.dtype("float32"): "float32",
68
+ np.dtype("float64"): "float64",
69
+ np.dtype("complex64"): "complex64",
70
+ np.dtype("complex128"): "complex128",
71
+ np.dtype("int8"): "int8",
72
+ np.dtype("int16"): "int16",
73
+ np.dtype("int32"): "int32",
74
+ np.dtype("int64"): "int64",
75
+ np.dtype("uint8"): "uint8",
76
+ np.dtype("uint16"): "uint16",
77
+ np.dtype("uint32"): "uint32",
78
+ np.dtype("uint64"): "uint64",
79
+ }
80
+
81
+
82
def tgm_dtype_to_zarr(tgm_dtype: str) -> str:
    """Return the Zarr v3 ``data_type`` name for a TGM dtype name.

    Raises
    ------
    ValueError
        If ``tgm_dtype`` has no entry in the TGM dtype table.
    """
    entry = _TGM_TO_ZARR_DTYPE.get(tgm_dtype)
    if entry is not None:
        # Table entries are (zarr name, numpy code); only the first is wanted.
        zarr_name, _numpy_code = entry
        return zarr_name
    raise ValueError(f"unsupported TGM dtype: {tgm_dtype!r}")
88
+
89
+
90
def tgm_dtype_to_numpy(tgm_dtype: str) -> np.dtype:
    """Return the numpy dtype that corresponds to a TGM dtype name.

    Raises
    ------
    ValueError
        If ``tgm_dtype`` has no entry in the TGM dtype table.
    """
    entry = _TGM_TO_ZARR_DTYPE.get(tgm_dtype)
    if entry is not None:
        # Table entries are (zarr name, numpy code); only the second is wanted.
        _zarr_name, numpy_code = entry
        return np.dtype(numpy_code)
    raise ValueError(f"unsupported TGM dtype: {tgm_dtype!r}")
96
+
97
+
98
def zarr_dtype_to_tgm(zarr_dtype: str) -> str:
    """Return the TGM dtype name for a Zarr v3 ``data_type`` name.

    Raises
    ------
    ValueError
        If ``zarr_dtype`` has no entry in the Zarr-to-TGM table.
    """
    tgm_name = _ZARR_TO_TGM_DTYPE.get(zarr_dtype)
    if tgm_name is not None:
        return tgm_name
    raise ValueError(f"unsupported Zarr dtype: {zarr_dtype!r}")
104
+
105
+
106
def numpy_dtype_to_tgm(dtype: np.dtype) -> str:
    """Return the TGM dtype name for a numpy dtype.

    Raises
    ------
    ValueError
        If ``dtype`` is not a key of the numpy-to-TGM table.
    """
    tgm_name = _NP_TO_TGM_DTYPE.get(dtype)
    if tgm_name is not None:
        return tgm_name
    raise ValueError(f"unsupported numpy dtype: {dtype!r}")
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Zarr v3 metadata synthesis (read path: TGM → zarr.json)
116
+ # ---------------------------------------------------------------------------
117
+
118
+
119
def build_group_zarr_json(
    meta: Any,
    variable_names: list[str],
) -> dict[str, Any]:
    """Build the Zarr v3 group ``zarr.json`` dict for a TGM message.

    Parameters
    ----------
    meta : tensogram.Metadata
        Decoded TGM global metadata. Its ``version`` attribute is read
        unconditionally; ``extra`` is used only if present and non-empty.
    variable_names : list[str]
        Names of the arrays in this group, recorded as an attribute.

    Returns
    -------
    dict
        Group metadata with ``zarr_format``, ``node_type`` and ``attributes``.
    """
    group_attrs: dict[str, Any] = {}

    # Message-level annotations go in first so the reserved ``_tensogram_*``
    # entries written below take precedence on a key clash.
    extra = getattr(meta, "extra", None)
    if extra:
        group_attrs.update(extra)

    group_attrs["_tensogram_version"] = meta.version
    group_attrs["_tensogram_variables"] = variable_names

    group_doc: dict[str, Any] = {"zarr_format": 3, "node_type": "group"}
    group_doc["attributes"] = group_attrs
    return group_doc
151
+
152
+
153
def build_array_zarr_json(
    desc: Any,
    per_object_meta: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the Zarr v3 array ``zarr.json`` dict for one TGM data object.

    Each TGM data object is a monolithic tensor, so the array is described
    as a single chunk whose shape equals the array shape.

    Parameters
    ----------
    desc : tensogram.DataObjectDescriptor
        The decoded object descriptor. The attributes read here are
        ``shape``, ``dtype``, ``params``, ``encoding``, ``filter``,
        ``compression`` and ``hash``.
    per_object_meta : dict, optional
        Per-object metadata from ``meta.base[i]``; copied into the array
        attributes when non-empty.

    Returns
    -------
    dict
        A Zarr v3 array metadata dict ready for JSON serialization.

    Raises
    ------
    ValueError
        If ``desc.dtype`` is not a supported TGM dtype.
    """
    shape = list(desc.shape)
    zarr_dtype = tgm_dtype_to_zarr(desc.dtype)

    # One chunk spans the whole array; an empty shape falls back to [1].
    # NOTE(review): for a 0-d shape this gives chunk_shape a rank that differs
    # from ``shape`` -- confirm that Zarr consumers accept this.
    chunk_shape = list(shape) if shape else [1]

    # User-facing attributes first, then the reserved ``_tensogram_*`` entries
    # so that the reserved values win on a key clash.
    attributes: dict[str, Any] = {}
    if per_object_meta:
        attributes.update(per_object_meta)
    params = desc.params
    if params:
        # Kept under a prefixed key so encoding params cannot shadow user keys.
        attributes["_tensogram_params"] = dict(params)
    attributes.update(
        _tensogram_encoding=desc.encoding,
        _tensogram_filter=desc.filter,
        _tensogram_compression=desc.compression,
    )
    if desc.hash:
        attributes["_tensogram_hash"] = desc.hash

    chunk_grid = {"name": "regular", "configuration": {"chunk_shape": chunk_shape}}
    chunk_key_encoding = {"name": "default", "configuration": {"separator": "/"}}
    # Only the plain ``bytes`` codec is declared at the Zarr level: the data is
    # already encoded/compressed inside TGM.
    codecs = [{"name": "bytes", "configuration": {"endian": "little"}}]

    return {
        "zarr_format": 3,
        "node_type": "array",
        "shape": shape,
        "data_type": zarr_dtype,
        "chunk_grid": chunk_grid,
        "chunk_key_encoding": chunk_key_encoding,
        "codecs": codecs,
        "fill_value": _default_fill_value(zarr_dtype),
        "attributes": attributes,
    }
220
+
221
+
222
+ _FLOAT_LIKE_PREFIXES = ("float", "bfloat", "complex")
223
+
224
+
225
+ def _default_fill_value(zarr_dtype: str) -> Any:
226
+ """Return a sensible default fill value for a Zarr dtype."""
227
+ if any(zarr_dtype.startswith(p) for p in _FLOAT_LIKE_PREFIXES):
228
+ return float("nan")
229
+ return 0
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+ # TGM metadata reconstruction (write path: zarr.json → TGM)
234
+ # ---------------------------------------------------------------------------
235
+
236
+
237
def parse_array_zarr_json(zarr_meta: dict[str, Any]) -> dict[str, Any]:
    """Pull the TGM-relevant fields out of a Zarr v3 array ``zarr.json`` dict.

    Parameters
    ----------
    zarr_meta : dict
        Parsed array metadata. ``shape`` and ``data_type`` are required;
        ``codecs`` and ``attributes`` are optional. The dict is not mutated.

    Returns
    -------
    dict
        Keys ``shape``, ``dtype``, ``byte_order``, ``encoding``, ``filter``,
        ``compression`` and ``attrs``. ``attrs`` holds the attributes with
        every ``_tensogram_``-prefixed key removed.

    Raises
    ------
    KeyError
        If ``shape`` or ``data_type`` is missing.
    ValueError
        If ``data_type`` has no TGM equivalent.
    """
    shape = zarr_meta["shape"]
    tgm_dtype = zarr_dtype_to_tgm(zarr_meta["data_type"])

    # Byte order comes from the ``bytes`` codec; if several are listed the
    # last one wins, and "little" is assumed when none declares an endian.
    byte_order = "little"
    for codec in zarr_meta.get("codecs", []):
        if codec.get("name") != "bytes":
            continue
        byte_order = codec.get("configuration", {}).get("endian", "little")

    # Private copy so the caller's attribute mapping is left untouched.
    attributes = dict(zarr_meta.get("attributes", {}))
    user_attrs = {
        key: value for key, value in attributes.items() if not key.startswith("_tensogram_")
    }

    return {
        "shape": shape,
        "dtype": tgm_dtype,
        "byte_order": byte_order,
        "encoding": attributes.get("_tensogram_encoding", "none"),
        "filter": attributes.get("_tensogram_filter", "none"),
        "compression": attributes.get("_tensogram_compression", "none"),
        "attrs": user_attrs,
    }
269
+
270
+
271
def serialize_zarr_json(meta: dict[str, Any]) -> bytes:
    """Encode a ``zarr.json`` dict as compact, key-sorted UTF-8 JSON bytes.

    NaN, Infinity and -Infinity floats are first swapped for their Zarr v3
    string sentinels, so the result is valid RFC 8259 JSON. ``allow_nan`` is
    disabled, so a non-finite float that survives sanitising makes
    ``json.dumps`` raise instead of emitting a bare token.
    """
    sanitized = _json_safe_metadata(meta)
    text = json.dumps(sanitized, separators=(",", ":"), sort_keys=True, allow_nan=False)
    return text.encode("utf-8")
279
+
280
+
281
+ def _json_safe_metadata(obj: Any) -> Any:
282
+ """Recursively replace non-finite floats with Zarr v3 string sentinels.
283
+
284
+ RFC 8259 forbids bare ``NaN`` / ``Infinity`` tokens in JSON. Zarr v3
285
+ uses the string values ``"NaN"``, ``"Infinity"``, ``"-Infinity"`` for
286
+ fill_value and similar fields.
287
+ """
288
+ if isinstance(obj, float):
289
+ if math.isnan(obj):
290
+ return "NaN"
291
+ if math.isinf(obj):
292
+ return "Infinity" if obj > 0 else "-Infinity"
293
+ return obj
294
+ if isinstance(obj, dict):
295
+ return {k: _json_safe_metadata(v) for k, v in obj.items()}
296
+ if isinstance(obj, (list, tuple)):
297
+ return [_json_safe_metadata(v) for v in obj]
298
+ return obj
299
+
300
+
301
def deserialize_zarr_json(data: bytes) -> dict[str, Any]:
    """Deserialize UTF-8 JSON bytes to a ``zarr.json`` dict.

    Parameters
    ----------
    data : bytes
        Raw content of a ``zarr.json`` document.

    Returns
    -------
    dict
        The decoded top-level JSON object.

    Raises
    ------
    ValueError
        If the data is not valid JSON (the message carries the length and a
        hex preview of the first 80 bytes), or if it is valid JSON whose
        top-level value is not an object.
    """
    try:
        parsed = json.loads(data)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        preview = data[:80].hex()
        raise ValueError(
            f"invalid zarr.json content ({len(data)} bytes, starts {preview!s}): {exc}"
        ) from exc

    # A zarr.json document is a JSON object. Reject lists, scalars and null
    # here rather than letting callers fail later on key access.
    if not isinstance(parsed, dict):
        raise ValueError(
            "invalid zarr.json content: expected a JSON object at the top level, "
            f"got {type(parsed).__name__}"
        )
    return parsed
313
+
314
+
315
+ # ---------------------------------------------------------------------------
316
+ # Variable naming
317
+ # ---------------------------------------------------------------------------
318
+
319
# Metadata keys consulted when naming a variable, highest priority first.
# A dot separates levels of nesting (``mars.param`` -> ``meta["mars"]["param"]``).
_VARIABLE_NAME_KEYS = ["name", "mars.param", "param", "mars.shortName", "shortName"]


def resolve_variable_name(
    obj_index: int,
    per_object_meta: dict[str, Any] | None,
    common_meta: dict[str, Any] | None = None,
    variable_key: str | None = None,
) -> str:
    """Pick the Zarr variable name for one TGM data object.

    Lookup order: ``variable_key`` (when given and non-empty), then each
    entry of ``_VARIABLE_NAME_KEYS``; the first value that is not ``None``
    is returned as a string. If nothing matches, the name is
    ``object_<index>``.

    Only ``per_object_meta`` (from ``meta.base[i]``) is searched.
    ``common_meta`` (from ``meta.extra``) is accepted for API compatibility
    but deliberately ignored: a message-level name would be shared by every
    object in the message.
    """
    lookup = per_object_meta if per_object_meta else {}

    # An explicitly requested key outranks the built-in candidates.
    explicit = [variable_key] if variable_key else []
    candidates = [*explicit, *_VARIABLE_NAME_KEYS]

    for candidate in candidates:
        found = _dotted_get(lookup, candidate)
        if found is None:
            continue
        return str(found)

    return f"object_{obj_index}"
358
+
359
+
360
+ def _dotted_get(d: dict[str, Any], path: str) -> Any:
361
+ """Resolve a dotted key path like ``mars.param`` in a nested dict."""
362
+ parts = path.split(".")
363
+ current: Any = d
364
+ for part in parts:
365
+ if not isinstance(current, dict):
366
+ return None
367
+ current = current.get(part)
368
+ if current is None:
369
+ return None
370
+ return current