tensogram-xarray 0.20.0__tar.gz → 0.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/PKG-INFO +2 -2
  2. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/pyproject.toml +2 -2
  3. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/array.py +16 -3
  4. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/backend.py +23 -5
  5. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/merge.py +7 -9
  6. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/store.py +4 -6
  7. tensogram_xarray-0.21.0/tests/test_verify_hash.py +123 -0
  8. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/.gitignore +0 -0
  9. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/README.md +0 -0
  10. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/__init__.py +0 -0
  11. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/coords.py +0 -0
  12. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/mapping.py +0 -0
  13. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/src/tensogram_xarray/scanner.py +0 -0
  14. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/__init__.py +0 -0
  15. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/conftest.py +0 -0
  16. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_array.py +0 -0
  17. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_backend.py +0 -0
  18. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_coords.py +0 -0
  19. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_coverage.py +0 -0
  20. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_edge_cases.py +0 -0
  21. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_issue_67_descriptor_name_fallback.py +0 -0
  22. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_mapping.py +0 -0
  23. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_merge.py +0 -0
  24. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_nd_range.py +0 -0
  25. {tensogram_xarray-0.20.0 → tensogram_xarray-0.21.0}/tests/test_remote.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tensogram-xarray
3
- Version: 0.20.0
3
+ Version: 0.21.0
4
4
  Summary: xarray backend engine for tensogram .tgm files
5
5
  Project-URL: Homepage, https://sites.ecmwf.int/docs/tensogram/main
6
6
  Project-URL: Repository, https://github.com/ecmwf/tensogram
@@ -14,7 +14,7 @@ Classifier: Topic :: Scientific/Engineering
14
14
  Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
15
15
  Requires-Python: >=3.11
16
16
  Requires-Dist: numpy
17
- Requires-Dist: tensogram<0.21,>=0.20.0
17
+ Requires-Dist: tensogram<0.22,>=0.21.0
18
18
  Requires-Dist: xarray>=2022.06
19
19
  Provides-Extra: dask
20
20
  Requires-Dist: dask[array]; extra == 'dask'
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tensogram-xarray"
7
- version = "0.20.0"
7
+ version = "0.21.0"
8
8
  description = "xarray backend engine for tensogram .tgm files"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -18,7 +18,7 @@ classifiers = [
18
18
  "Topic :: Scientific/Engineering :: Atmospheric Science",
19
19
  ]
20
20
  dependencies = [
21
- "tensogram>=0.20.0,<0.21",
21
+ "tensogram>=0.21.0,<0.22",
22
22
  "xarray>=2022.06",
23
23
  "numpy",
24
24
  ]
@@ -200,11 +200,11 @@ class TensogramBackendArray(BackendArray):
200
200
  dtype: np.dtype,
201
201
  supports_range: bool,
202
202
  *,
203
- verify_hash: bool = False,
204
203
  range_threshold: float = DEFAULT_RANGE_THRESHOLD,
205
204
  lock: threading.Lock | None = None,
206
205
  storage_options: dict[str, Any] | None = None,
207
206
  shared_file: Any | None = None,
207
+ verify_hash: bool = False,
208
208
  ):
209
209
  import tensogram
210
210
 
@@ -215,10 +215,10 @@ class TensogramBackendArray(BackendArray):
215
215
  self.shape = shape
216
216
  self.dtype = dtype
217
217
  self.supports_range = supports_range
218
- self.verify_hash = verify_hash
219
218
  self.range_threshold = range_threshold
220
219
  self.storage_options = storage_options
221
220
  self._shared_file = shared_file
221
+ self.verify_hash = verify_hash
222
222
 
223
223
  # -- pickle support (no open handles stored) ----------------------------
224
224
 
@@ -267,12 +267,25 @@ class TensogramBackendArray(BackendArray):
267
267
  total_elements = math.prod(self.shape)
268
268
 
269
269
  if total_elements > 0 and total_requested / total_elements <= self.range_threshold:
270
+ # Per the decode-time verification contract
271
+ # (PLAN_DECODE_HASH_VERIFICATION §6, Q6): the
272
+ # range-decode fast path does *not* verify
273
+ # hashes — verifying a whole-frame hash would
274
+ # require reading every byte the optimisation
275
+ # is designed to avoid. When `verify_hash` is
276
+ # True, callers who care about integrity should
277
+ # set ``range_threshold=0`` to force every read
278
+ # through the full-decode path below. We
279
+ # *do* allow this fast path even under
280
+ # ``verify_hash=True`` so the user keeps the
281
+ # remote-fetch cost characteristics they
282
+ # opted in to via ``range_threshold``; the
283
+ # verification simply does not apply.
270
284
  arr = f.file_decode_range(
271
285
  self.msg_index,
272
286
  obj_index=self.obj_index,
273
287
  ranges=flat_ranges,
274
288
  join=True,
275
- verify_hash=self.verify_hash,
276
289
  native_byte_order=True,
277
290
  )
278
291
  return np.asarray(arr).reshape(out_shape)
@@ -55,9 +55,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
55
55
  variable_key: str | None = None,
56
56
  message_index: int = 0,
57
57
  merge_objects: bool = False,
58
- verify_hash: bool = False,
59
58
  range_threshold: float = 0.5,
60
59
  storage_options: dict[str, Any] | None = None,
60
+ verify_hash: bool = False,
61
61
  ) -> xr.Dataset:
62
62
  """Open a single tensogram message as an :class:`xr.Dataset`.
63
63
 
@@ -79,8 +79,6 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
79
79
  If *True*, attempt to merge objects across messages by stacking
80
80
  along metadata dimensions that vary. When *False* (default),
81
81
  only the single message at *message_index* is opened.
82
- verify_hash
83
- Whether to verify xxh3 hashes during decode.
84
82
  range_threshold
85
83
  Maximum fraction of total array elements (0.0-1.0) for which
86
84
  partial ``decode_range()`` is used instead of a full
@@ -88,6 +86,26 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
88
86
  storage_options
89
87
  Key-value pairs forwarded to the object store backend when
90
88
  the path is a remote URL. Ignored for local files.
89
+ verify_hash
90
+ When *True*, every full ``decode_object`` (or
91
+ ``file_decode_object``) call materialised by this Dataset's
92
+ lazy backing arrays is verified against its inline xxh3
93
+ hash; ``MissingHashError`` / ``HashMismatchError`` from
94
+ the underlying tensogram bindings propagate to the
95
+ caller's first read.
96
+
97
+ **Caveat — partial-range fast path is unverified.** Per
98
+ the decode-time verification contract (see
99
+ ``plans/DESIGN.md`` §"Integrity Hashing" and
100
+ ``plans/WIRE_FORMAT.md`` §11.1), ``decode_range`` reads
101
+ only a slice of the encoded payload and cannot meaningfully
102
+ verify a whole-frame hash. When ``verify_hash=True`` and
103
+ the lazy reader chooses ``file_decode_range`` (because
104
+ the requested slice is below ``range_threshold``), no
105
+ verification happens for that read. Set
106
+ ``range_threshold=0`` to force every read through the
107
+ full-decode path if you need consistent integrity
108
+ coverage.
91
109
 
92
110
  Returns
93
111
  -------
@@ -107,9 +125,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
107
125
  file_path,
108
126
  dim_names=dim_names,
109
127
  variable_key=variable_key,
110
- verify_hash=verify_hash,
111
128
  range_threshold=range_threshold,
112
129
  storage_options=storage_options,
130
+ verify_hash=verify_hash,
113
131
  )
114
132
  if not datasets:
115
133
  return xr.Dataset()
@@ -120,9 +138,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
120
138
  msg_index=message_index,
121
139
  dim_names=dim_names,
122
140
  variable_key=variable_key,
123
- verify_hash=verify_hash,
124
141
  range_threshold=range_threshold,
125
142
  storage_options=storage_options,
143
+ verify_hash=verify_hash,
126
144
  )
127
145
 
128
146
  drop_set = set(drop_variables) if drop_variables else None
@@ -51,8 +51,8 @@ def open_datasets(
51
51
  *,
52
52
  dim_names: Sequence[str] | None = None,
53
53
  variable_key: str | None = None,
54
- verify_hash: bool = False,
55
54
  range_threshold: float = 0.5,
55
+ verify_hash: bool = False,
56
56
  storage_options: dict[str, Any] | None = None,
57
57
  ) -> list[xr.Dataset]:
58
58
  """Open a ``.tgm`` file, auto-grouping into compatible Datasets.
@@ -69,8 +69,6 @@ def open_datasets(
69
69
  Explicit dimension names for the innermost tensor axes.
70
70
  variable_key
71
71
  Dotted metadata key path for variable naming.
72
- verify_hash
73
- Whether to verify hashes on decode.
74
72
  range_threshold
75
73
  Maximum fraction of total array elements for which partial
76
74
  ``decode_range()`` is used. Default ``0.5``.
@@ -114,8 +112,8 @@ def open_datasets(
114
112
  shape=shape,
115
113
  dtype=np_dtype,
116
114
  supports_range=_supports_range_decode(obj.descriptor),
117
- verify_hash=verify_hash,
118
115
  range_threshold=range_threshold,
116
+ verify_hash=verify_hash,
119
117
  lock=lock,
120
118
  storage_options=storage_options,
121
119
  shared_file=shared_file,
@@ -479,8 +477,8 @@ def _single_object_dataset(
479
477
  shape=shape,
480
478
  dtype=np_dtype,
481
479
  supports_range=_supports_range_decode(obj.descriptor),
482
- verify_hash=verify_hash,
483
480
  range_threshold=range_threshold,
481
+ verify_hash=verify_hash,
484
482
  lock=lock,
485
483
  storage_options=storage_options,
486
484
  shared_file=shared_file,
@@ -532,8 +530,8 @@ def _flat_group_dataset(
532
530
  shape=obj.shape,
533
531
  dtype=np_dtype,
534
532
  supports_range=_supports_range_decode(obj.descriptor),
535
- verify_hash=verify_hash,
536
533
  range_threshold=range_threshold,
534
+ verify_hash=verify_hash,
537
535
  lock=lock,
538
536
  storage_options=storage_options,
539
537
  shared_file=shared_file,
@@ -696,8 +694,8 @@ def _build_multi_variable_dataset(
696
694
  shape=inner_shape,
697
695
  dtype=np_dtype,
698
696
  supports_range=_supports_range_decode(obj.descriptor),
699
- verify_hash=verify_hash,
700
697
  range_threshold=range_threshold,
698
+ verify_hash=verify_hash,
701
699
  lock=lock,
702
700
  storage_options=storage_options,
703
701
  shared_file=shared_file,
@@ -786,8 +784,8 @@ def _build_multi_variable_dataset(
786
784
  shape=inner_shape,
787
785
  dtype=np_dtype,
788
786
  supports_range=_supports_range_decode(obj.descriptor),
789
- verify_hash=verify_hash,
790
787
  range_threshold=range_threshold,
788
+ verify_hash=verify_hash,
791
789
  lock=lock,
792
790
  storage_options=storage_options,
793
791
  shared_file=shared_file,
@@ -816,8 +814,8 @@ def _build_multi_variable_dataset(
816
814
  shape=inner_shape,
817
815
  dtype=np_dtype,
818
816
  supports_range=_supports_range_decode(obj.descriptor),
819
- verify_hash=verify_hash,
820
817
  range_threshold=range_threshold,
818
+ verify_hash=verify_hash,
821
819
  lock=lock,
822
820
  storage_options=storage_options,
823
821
  shared_file=shared_file,
@@ -196,8 +196,6 @@ class TensogramDataStore:
196
196
  Optional user-specified dimension names for data variables.
197
197
  variable_key
198
198
  Optional dotted metadata path for variable naming.
199
- verify_hash
200
- Whether to verify object hashes on decode.
201
199
  range_threshold
202
200
  Maximum fraction of total array elements (0.0-1.0) for which
203
201
  partial ``decode_range()`` is used. Default ``0.5``.
@@ -214,9 +212,9 @@ class TensogramDataStore:
214
212
  msg_index: int = 0,
215
213
  dim_names: Sequence[str] | None = None,
216
214
  variable_key: str | None = None,
217
- verify_hash: bool = False,
218
215
  range_threshold: float = 0.5,
219
216
  storage_options: dict[str, Any] | None = None,
217
+ verify_hash: bool = False,
220
218
  ):
221
219
  import tensogram
222
220
 
@@ -225,9 +223,9 @@ class TensogramDataStore:
225
223
  self.msg_index = msg_index
226
224
  self.dim_names = dim_names
227
225
  self.variable_key = variable_key
228
- self.verify_hash = verify_hash
229
226
  self.range_threshold = range_threshold
230
227
  self.storage_options = storage_options
228
+ self.verify_hash = verify_hash
231
229
  self._lock = threading.Lock()
232
230
  self._backend_arrays: list[TensogramBackendArray] = []
233
231
 
@@ -347,11 +345,11 @@ class TensogramDataStore:
347
345
  shape=shape,
348
346
  dtype=np_dtype,
349
347
  supports_range=_supports_range_decode(desc),
350
- verify_hash=self.verify_hash,
351
348
  range_threshold=self.range_threshold,
352
349
  lock=self._lock,
353
350
  storage_options=self.storage_options,
354
351
  shared_file=self._file,
352
+ verify_hash=self.verify_hash,
355
353
  )
356
354
  self._backend_arrays.append(backend_array)
357
355
  lazy_data = indexing.LazilyIndexedArray(backend_array)
@@ -387,11 +385,11 @@ class TensogramDataStore:
387
385
  shape=shape,
388
386
  dtype=np_dtype,
389
387
  supports_range=_supports_range_decode(desc),
390
- verify_hash=self.verify_hash,
391
388
  range_threshold=self.range_threshold,
392
389
  lock=self._lock,
393
390
  storage_options=self.storage_options,
394
391
  shared_file=self._file,
392
+ verify_hash=self.verify_hash,
395
393
  )
396
394
  self._backend_arrays.append(backend_array)
397
395
 
@@ -0,0 +1,123 @@
1
+ # (C) Copyright 2026- ECMWF and individual contributors.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation nor
7
+ # does it submit to any jurisdiction.
8
+
9
+ """verify_hash threading through the xarray backend.
10
+
11
+ The xarray backend re-exposes the upstream
12
+ ``DecodeOptions::verify_hash`` flag via the ``open_dataset(...,
13
+ verify_hash=True)`` keyword. When the lazy backing arrays
14
+ materialise data via ``file_decode_object`` /
15
+ ``decode_object``, the kwarg propagates and integrity errors
16
+ (``MissingHashError`` / ``HashMismatchError``) bubble up to
17
+ the caller's first read.
18
+
19
+ Per Q6 in ``PLAN_DECODE_HASH_VERIFICATION.md``: the partial-
20
+ range fast path silently does *not* verify (range decode
21
+ does not accept ``verify_hash``). Set ``range_threshold=0``
22
+ to force every read through the full-decode path if you
23
+ need uniform coverage.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import numpy as np
29
+ import pytest
30
+ import tensogram
31
+ import xarray as xr
32
+
33
+
34
+ def _build_unhashed_message(tmp_path) -> str:
35
+ """Encode a 1-object f32 message with hashing off + write to disk.
36
+
37
+ Returns the file path. The unhashed encoding is what makes
38
+ cell C (`verify_hash=True` → MissingHashError) testable.
39
+ """
40
+ meta = {"version": 3}
41
+ desc = {
42
+ "type": "ntensor",
43
+ "ndim": 1,
44
+ "shape": [4],
45
+ "strides": [1],
46
+ "dtype": "float32",
47
+ "encoding": "none",
48
+ "filter": "none",
49
+ "compression": "none",
50
+ }
51
+ data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
52
+ msg = bytes(tensogram.encode(meta, [(desc, data)], hash=None))
53
+ path = tmp_path / "unhashed.tgm"
54
+ path.write_bytes(msg)
55
+ return str(path)
56
+
57
+
58
+ def _build_hashed_message(tmp_path) -> str:
59
+ """Encode a 1-object f32 message with hashing on + write to disk."""
60
+ meta = {"version": 3}
61
+ desc = {
62
+ "type": "ntensor",
63
+ "ndim": 1,
64
+ "shape": [4],
65
+ "strides": [1],
66
+ "dtype": "float32",
67
+ "encoding": "none",
68
+ "filter": "none",
69
+ "compression": "none",
70
+ }
71
+ data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
72
+ msg = bytes(tensogram.encode(meta, [(desc, data)], hash="xxh3"))
73
+ path = tmp_path / "hashed.tgm"
74
+ path.write_bytes(msg)
75
+ return str(path)
76
+
77
+
78
+ class TestOpenDatasetVerifyHash:
79
+ def test_verify_hash_default_is_false(self, tmp_path):
80
+ """Default ``verify_hash=False`` decodes both hashed and
81
+ unhashed fixtures cleanly."""
82
+ ds_hashed = xr.open_dataset(_build_hashed_message(tmp_path), engine="tensogram")
83
+ # Force materialisation.
84
+ np.asarray(ds_hashed[next(iter(ds_hashed.data_vars))].values)
85
+ ds_hashed.close()
86
+
87
+ ds_unhashed = xr.open_dataset(_build_unhashed_message(tmp_path), engine="tensogram")
88
+ np.asarray(ds_unhashed[next(iter(ds_unhashed.data_vars))].values)
89
+ ds_unhashed.close()
90
+
91
+ def test_verify_hash_true_succeeds_on_hashed_dataset(self, tmp_path):
92
+ """Cell B equivalent on the xarray surface: opening a
93
+ hashed file with ``verify_hash=True`` and pulling data
94
+ materialises cleanly."""
95
+ ds = xr.open_dataset(
96
+ _build_hashed_message(tmp_path),
97
+ engine="tensogram",
98
+ verify_hash=True,
99
+ # Force the full-decode path so the verification fires.
100
+ range_threshold=0.0,
101
+ )
102
+ arr = np.asarray(ds[next(iter(ds.data_vars))].values)
103
+ np.testing.assert_array_equal(arr, [1.0, 2.0, 3.0, 4.0])
104
+ ds.close()
105
+
106
+ def test_verify_hash_true_raises_missing_hash_on_unhashed(self, tmp_path):
107
+ """Cell C on xarray: open a hashless file with
108
+ ``verify_hash=True`` and the first read raises
109
+ ``MissingHashError`` from the underlying
110
+ :mod:`tensogram` bindings."""
111
+ ds = xr.open_dataset(
112
+ _build_unhashed_message(tmp_path),
113
+ engine="tensogram",
114
+ verify_hash=True,
115
+ # Force the full-decode path so the verification fires.
116
+ range_threshold=0.0,
117
+ )
118
+ try:
119
+ with pytest.raises(tensogram.MissingHashError) as excinfo:
120
+ _ = np.asarray(ds[next(iter(ds.data_vars))].values)
121
+ assert excinfo.value.object_index == 0
122
+ finally:
123
+ ds.close()