tensogram-xarray 0.20.0__tar.gz → 0.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/.gitignore +7 -0
  2. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/PKG-INFO +2 -2
  3. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/pyproject.toml +2 -2
  4. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/array.py +16 -3
  5. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/backend.py +23 -5
  6. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/merge.py +7 -9
  7. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/store.py +4 -6
  8. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_coverage.py +268 -0
  9. tensogram_xarray-0.22.0/tests/test_verify_hash.py +123 -0
  10. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/README.md +0 -0
  11. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/__init__.py +0 -0
  12. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/coords.py +0 -0
  13. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/mapping.py +0 -0
  14. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/scanner.py +0 -0
  15. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/__init__.py +0 -0
  16. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/conftest.py +0 -0
  17. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_array.py +0 -0
  18. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_backend.py +0 -0
  19. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_coords.py +0 -0
  20. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_edge_cases.py +0 -0
  21. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_issue_67_descriptor_name_fallback.py +0 -0
  22. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_mapping.py +0 -0
  23. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_merge.py +0 -0
  24. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_nd_range.py +0 -0
  25. {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_remote.py +0 -0
@@ -10,6 +10,8 @@
10
10
  **/build/
11
11
  docs/book/
12
12
  python/**/dist/
13
+ # Release wheel staging from `make python-dist-extras` / `make release-check`
14
+ /dist/
13
15
  python/bindings/Cargo.lock
14
16
  rust/tensogram-grib/Cargo.lock
15
17
  rust/tensogram-netcdf/Cargo.lock
@@ -18,6 +20,11 @@ rust/tensogram-wasm/Cargo.lock
18
20
  .venv/
19
21
  **/__pycache__/
20
22
  *.pyc
23
+
24
+ # cargo-mutants working directory + per-run logs
25
+ mutants.out/
26
+ mutants.out.*/
27
+ mutants*.log
21
28
  python/bindings/python/tensogram/tensogram*.so
22
29
  # TODO do we want to have uv.locks ignored?
23
30
  **/uv.lock
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tensogram-xarray
3
- Version: 0.20.0
3
+ Version: 0.22.0
4
4
  Summary: xarray backend engine for tensogram .tgm files
5
5
  Project-URL: Homepage, https://sites.ecmwf.int/docs/tensogram/main
6
6
  Project-URL: Repository, https://github.com/ecmwf/tensogram
@@ -14,7 +14,7 @@ Classifier: Topic :: Scientific/Engineering
14
14
  Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
15
15
  Requires-Python: >=3.11
16
16
  Requires-Dist: numpy
17
- Requires-Dist: tensogram<0.21,>=0.20.0
17
+ Requires-Dist: tensogram<0.23,>=0.22.0
18
18
  Requires-Dist: xarray>=2022.06
19
19
  Provides-Extra: dask
20
20
  Requires-Dist: dask[array]; extra == 'dask'
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tensogram-xarray"
7
- version = "0.20.0"
7
+ version = "0.22.0"
8
8
  description = "xarray backend engine for tensogram .tgm files"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -18,7 +18,7 @@ classifiers = [
18
18
  "Topic :: Scientific/Engineering :: Atmospheric Science",
19
19
  ]
20
20
  dependencies = [
21
- "tensogram>=0.20.0,<0.21",
21
+ "tensogram>=0.22.0,<0.23",
22
22
  "xarray>=2022.06",
23
23
  "numpy",
24
24
  ]
@@ -200,11 +200,11 @@ class TensogramBackendArray(BackendArray):
200
200
  dtype: np.dtype,
201
201
  supports_range: bool,
202
202
  *,
203
- verify_hash: bool = False,
204
203
  range_threshold: float = DEFAULT_RANGE_THRESHOLD,
205
204
  lock: threading.Lock | None = None,
206
205
  storage_options: dict[str, Any] | None = None,
207
206
  shared_file: Any | None = None,
207
+ verify_hash: bool = False,
208
208
  ):
209
209
  import tensogram
210
210
 
@@ -215,10 +215,10 @@ class TensogramBackendArray(BackendArray):
215
215
  self.shape = shape
216
216
  self.dtype = dtype
217
217
  self.supports_range = supports_range
218
- self.verify_hash = verify_hash
219
218
  self.range_threshold = range_threshold
220
219
  self.storage_options = storage_options
221
220
  self._shared_file = shared_file
221
+ self.verify_hash = verify_hash
222
222
 
223
223
  # -- pickle support (no open handles stored) ----------------------------
224
224
 
@@ -267,12 +267,25 @@ class TensogramBackendArray(BackendArray):
267
267
  total_elements = math.prod(self.shape)
268
268
 
269
269
  if total_elements > 0 and total_requested / total_elements <= self.range_threshold:
270
+ # Per the decode-time verification contract
271
+ # (PLAN_DECODE_HASH_VERIFICATION §6, Q6): the
272
+ # range-decode fast path does *not* verify
273
+ # hashes — verifying a whole-frame hash would
274
+ # require reading every byte the optimisation
275
+ # is designed to avoid. When `verify_hash` is
276
+ # True, callers who care about integrity should
277
+ # set ``range_threshold=0`` to force every read
278
+ # through the full-decode path below. We
279
+ # *do* allow this fast path even under
280
+ # ``verify_hash=True`` so the user keeps the
281
+ # remote-fetch cost characteristics they
282
+ # opted in to via ``range_threshold``; the
283
+ # verification simply does not apply.
270
284
  arr = f.file_decode_range(
271
285
  self.msg_index,
272
286
  obj_index=self.obj_index,
273
287
  ranges=flat_ranges,
274
288
  join=True,
275
- verify_hash=self.verify_hash,
276
289
  native_byte_order=True,
277
290
  )
278
291
  return np.asarray(arr).reshape(out_shape)
@@ -55,9 +55,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
55
55
  variable_key: str | None = None,
56
56
  message_index: int = 0,
57
57
  merge_objects: bool = False,
58
- verify_hash: bool = False,
59
58
  range_threshold: float = 0.5,
60
59
  storage_options: dict[str, Any] | None = None,
60
+ verify_hash: bool = False,
61
61
  ) -> xr.Dataset:
62
62
  """Open a single tensogram message as an :class:`xr.Dataset`.
63
63
 
@@ -79,8 +79,6 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
79
79
  If *True*, attempt to merge objects across messages by stacking
80
80
  along metadata dimensions that vary. When *False* (default),
81
81
  only the single message at *message_index* is opened.
82
- verify_hash
83
- Whether to verify xxh3 hashes during decode.
84
82
  range_threshold
85
83
  Maximum fraction of total array elements (0.0-1.0) for which
86
84
  partial ``decode_range()`` is used instead of a full
@@ -88,6 +86,26 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
88
86
  storage_options
89
87
  Key-value pairs forwarded to the object store backend when
90
88
  the path is a remote URL. Ignored for local files.
89
+ verify_hash
90
+ When *True*, every full ``decode_object`` (or
91
+ ``file_decode_object``) call materialised by this Dataset's
92
+ lazy backing arrays is verified against its inline xxh3
93
+ hash; ``MissingHashError`` / ``HashMismatchError`` from
94
+ the underlying tensogram bindings propagate to the
95
+ caller's first read.
96
+
97
+ **Caveat — partial-range fast path is unverified.** Per
98
+ the decode-time verification contract (see
99
+ ``plans/DESIGN.md`` §"Integrity Hashing" and
100
+ ``plans/WIRE_FORMAT.md`` §11.1), ``decode_range`` reads
101
+ only a slice of the encoded payload and cannot meaningfully
102
+ verify a whole-frame hash. When ``verify_hash=True`` and
103
+ the lazy reader chooses ``file_decode_range`` (because
104
+ the requested slice is below ``range_threshold``), no
105
+ verification happens for that read. Set
106
+ ``range_threshold=0`` to force every read through the
107
+ full-decode path if you need consistent integrity
108
+ coverage.
91
109
 
92
110
  Returns
93
111
  -------
@@ -107,9 +125,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
107
125
  file_path,
108
126
  dim_names=dim_names,
109
127
  variable_key=variable_key,
110
- verify_hash=verify_hash,
111
128
  range_threshold=range_threshold,
112
129
  storage_options=storage_options,
130
+ verify_hash=verify_hash,
113
131
  )
114
132
  if not datasets:
115
133
  return xr.Dataset()
@@ -120,9 +138,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
120
138
  msg_index=message_index,
121
139
  dim_names=dim_names,
122
140
  variable_key=variable_key,
123
- verify_hash=verify_hash,
124
141
  range_threshold=range_threshold,
125
142
  storage_options=storage_options,
143
+ verify_hash=verify_hash,
126
144
  )
127
145
 
128
146
  drop_set = set(drop_variables) if drop_variables else None
@@ -51,8 +51,8 @@ def open_datasets(
51
51
  *,
52
52
  dim_names: Sequence[str] | None = None,
53
53
  variable_key: str | None = None,
54
- verify_hash: bool = False,
55
54
  range_threshold: float = 0.5,
55
+ verify_hash: bool = False,
56
56
  storage_options: dict[str, Any] | None = None,
57
57
  ) -> list[xr.Dataset]:
58
58
  """Open a ``.tgm`` file, auto-grouping into compatible Datasets.
@@ -69,8 +69,6 @@ def open_datasets(
69
69
  Explicit dimension names for the innermost tensor axes.
70
70
  variable_key
71
71
  Dotted metadata key path for variable naming.
72
- verify_hash
73
- Whether to verify hashes on decode.
74
72
  range_threshold
75
73
  Maximum fraction of total array elements for which partial
76
74
  ``decode_range()`` is used. Default ``0.5``.
@@ -114,8 +112,8 @@ def open_datasets(
114
112
  shape=shape,
115
113
  dtype=np_dtype,
116
114
  supports_range=_supports_range_decode(obj.descriptor),
117
- verify_hash=verify_hash,
118
115
  range_threshold=range_threshold,
116
+ verify_hash=verify_hash,
119
117
  lock=lock,
120
118
  storage_options=storage_options,
121
119
  shared_file=shared_file,
@@ -479,8 +477,8 @@ def _single_object_dataset(
479
477
  shape=shape,
480
478
  dtype=np_dtype,
481
479
  supports_range=_supports_range_decode(obj.descriptor),
482
- verify_hash=verify_hash,
483
480
  range_threshold=range_threshold,
481
+ verify_hash=verify_hash,
484
482
  lock=lock,
485
483
  storage_options=storage_options,
486
484
  shared_file=shared_file,
@@ -532,8 +530,8 @@ def _flat_group_dataset(
532
530
  shape=obj.shape,
533
531
  dtype=np_dtype,
534
532
  supports_range=_supports_range_decode(obj.descriptor),
535
- verify_hash=verify_hash,
536
533
  range_threshold=range_threshold,
534
+ verify_hash=verify_hash,
537
535
  lock=lock,
538
536
  storage_options=storage_options,
539
537
  shared_file=shared_file,
@@ -696,8 +694,8 @@ def _build_multi_variable_dataset(
696
694
  shape=inner_shape,
697
695
  dtype=np_dtype,
698
696
  supports_range=_supports_range_decode(obj.descriptor),
699
- verify_hash=verify_hash,
700
697
  range_threshold=range_threshold,
698
+ verify_hash=verify_hash,
701
699
  lock=lock,
702
700
  storage_options=storage_options,
703
701
  shared_file=shared_file,
@@ -786,8 +784,8 @@ def _build_multi_variable_dataset(
786
784
  shape=inner_shape,
787
785
  dtype=np_dtype,
788
786
  supports_range=_supports_range_decode(obj.descriptor),
789
- verify_hash=verify_hash,
790
787
  range_threshold=range_threshold,
788
+ verify_hash=verify_hash,
791
789
  lock=lock,
792
790
  storage_options=storage_options,
793
791
  shared_file=shared_file,
@@ -816,8 +814,8 @@ def _build_multi_variable_dataset(
816
814
  shape=inner_shape,
817
815
  dtype=np_dtype,
818
816
  supports_range=_supports_range_decode(obj.descriptor),
819
- verify_hash=verify_hash,
820
817
  range_threshold=range_threshold,
818
+ verify_hash=verify_hash,
821
819
  lock=lock,
822
820
  storage_options=storage_options,
823
821
  shared_file=shared_file,
@@ -196,8 +196,6 @@ class TensogramDataStore:
196
196
  Optional user-specified dimension names for data variables.
197
197
  variable_key
198
198
  Optional dotted metadata path for variable naming.
199
- verify_hash
200
- Whether to verify object hashes on decode.
201
199
  range_threshold
202
200
  Maximum fraction of total array elements (0.0-1.0) for which
203
201
  partial ``decode_range()`` is used. Default ``0.5``.
@@ -214,9 +212,9 @@ class TensogramDataStore:
214
212
  msg_index: int = 0,
215
213
  dim_names: Sequence[str] | None = None,
216
214
  variable_key: str | None = None,
217
- verify_hash: bool = False,
218
215
  range_threshold: float = 0.5,
219
216
  storage_options: dict[str, Any] | None = None,
217
+ verify_hash: bool = False,
220
218
  ):
221
219
  import tensogram
222
220
 
@@ -225,9 +223,9 @@ class TensogramDataStore:
225
223
  self.msg_index = msg_index
226
224
  self.dim_names = dim_names
227
225
  self.variable_key = variable_key
228
- self.verify_hash = verify_hash
229
226
  self.range_threshold = range_threshold
230
227
  self.storage_options = storage_options
228
+ self.verify_hash = verify_hash
231
229
  self._lock = threading.Lock()
232
230
  self._backend_arrays: list[TensogramBackendArray] = []
233
231
 
@@ -347,11 +345,11 @@ class TensogramDataStore:
347
345
  shape=shape,
348
346
  dtype=np_dtype,
349
347
  supports_range=_supports_range_decode(desc),
350
- verify_hash=self.verify_hash,
351
348
  range_threshold=self.range_threshold,
352
349
  lock=self._lock,
353
350
  storage_options=self.storage_options,
354
351
  shared_file=self._file,
352
+ verify_hash=self.verify_hash,
355
353
  )
356
354
  self._backend_arrays.append(backend_array)
357
355
  lazy_data = indexing.LazilyIndexedArray(backend_array)
@@ -387,11 +385,11 @@ class TensogramDataStore:
387
385
  shape=shape,
388
386
  dtype=np_dtype,
389
387
  supports_range=_supports_range_decode(desc),
390
- verify_hash=self.verify_hash,
391
388
  range_threshold=self.range_threshold,
392
389
  lock=self._lock,
393
390
  storage_options=self.storage_options,
394
391
  shared_file=self._file,
392
+ verify_hash=self.verify_hash,
395
393
  )
396
394
  self._backend_arrays.append(backend_array)
397
395
 
@@ -1782,3 +1782,271 @@ class TestMergePathDimResolution:
1782
1782
  "merge path must not misread message-level _extra_['dim_names'] as a per-object hint"
1783
1783
  )
1784
1784
  assert datasets, "expected at least one Dataset despite inconsistent hints"
1785
+
1786
+
1787
+ # ---------------------------------------------------------------------------
1788
+ # Coverage pass 4: remaining audit gaps (deterministic, no-network)
1789
+ # ---------------------------------------------------------------------------
1790
+
1791
+
1792
+ def _serve_tgm_bytes(msg: bytes):
1793
+ """Serve raw .tgm *msg* bytes over a local loopback HTTP server.
1794
+
1795
+ Returns ``(url, shutdown)``. ``shutdown`` must be called to stop the
1796
+ server. Supports Range requests so the tensogram object-store backend
1797
+ can byte-range fetch descriptors/objects.
1798
+ """
1799
+ import http.server
1800
+ import threading
1801
+
1802
+ class Handler(http.server.BaseHTTPRequestHandler):
1803
+ def log_message(self, fmt, *args):
1804
+ pass
1805
+
1806
+ def do_HEAD(self):
1807
+ self.send_response(200)
1808
+ self.send_header("Content-Length", str(len(msg)))
1809
+ self.send_header("Accept-Ranges", "bytes")
1810
+ self.end_headers()
1811
+
1812
+ def do_GET(self):
1813
+ range_header = self.headers.get("Range")
1814
+ if range_header and range_header.startswith("bytes="):
1815
+ spec = range_header[6:]
1816
+ if spec.startswith("-"):
1817
+ n = int(spec[1:])
1818
+ s, e = max(0, len(msg) - n), len(msg)
1819
+ else:
1820
+ parts = spec.split("-")
1821
+ s = int(parts[0])
1822
+ e = int(parts[1]) + 1 if parts[1] else len(msg)
1823
+ e = min(e, len(msg))
1824
+ if s >= len(msg):
1825
+ self.send_response(416)
1826
+ self.end_headers()
1827
+ return
1828
+ chunk = msg[s:e]
1829
+ self.send_response(206)
1830
+ self.send_header("Content-Range", f"bytes {s}-{e - 1}/{len(msg)}")
1831
+ self.send_header("Content-Length", str(len(chunk)))
1832
+ self.end_headers()
1833
+ self.wfile.write(chunk)
1834
+ else:
1835
+ self.send_response(200)
1836
+ self.send_header("Content-Length", str(len(msg)))
1837
+ self.end_headers()
1838
+ self.wfile.write(msg)
1839
+
1840
+ server = http.server.ThreadingHTTPServer(("127.0.0.1", 0), Handler)
1841
+ port = server.server_address[1]
1842
+ thread = threading.Thread(target=server.serve_forever, daemon=True)
1843
+ thread.start()
1844
+
1845
+ def shutdown():
1846
+ server.shutdown()
1847
+ server.server_close()
1848
+ thread.join(timeout=5)
1849
+
1850
+ return f"http://127.0.0.1:{port}/test.tgm", shutdown
1851
+
1852
+
1853
+ class TestArrayGetFile:
1854
+ """Cover array.py ``_get_file`` branches."""
1855
+
1856
+ def test_get_file_returns_shared(self):
1857
+ """``_get_file`` short-circuits to the shared handle."""
1858
+ from tensogram_xarray.array import TensogramBackendArray
1859
+
1860
+ sentinel = object()
1861
+ ba = TensogramBackendArray(
1862
+ "/nowhere.tgm",
1863
+ 0,
1864
+ 0,
1865
+ (2,),
1866
+ np.dtype("float32"),
1867
+ supports_range=False,
1868
+ shared_file=sentinel,
1869
+ )
1870
+ assert ba._get_file() is sentinel
1871
+
1872
+ def test_get_file_remote_opens_remote(self):
1873
+ """``_get_file`` calls ``open_remote`` for a remote URL."""
1874
+ from unittest.mock import patch
1875
+
1876
+ from tensogram_xarray.array import TensogramBackendArray
1877
+
1878
+ with (
1879
+ patch("tensogram.is_remote_url", return_value=True),
1880
+ patch("tensogram.TensogramFile") as mock_file,
1881
+ ):
1882
+ ba = TensogramBackendArray(
1883
+ "s3://bucket/x.tgm",
1884
+ 0,
1885
+ 0,
1886
+ (2,),
1887
+ np.dtype("float32"),
1888
+ supports_range=False,
1889
+ storage_options={"region": "eu"},
1890
+ )
1891
+ ba._get_file()
1892
+ mock_file.open_remote.assert_called_once_with("s3://bucket/x.tgm", {"region": "eu"})
1893
+
1894
+
1895
+ class TestArrayDecodeRangeFallbackReal:
1896
+ """Cover array.py decode_range exception fallback."""
1897
+
1898
+ def test_file_decode_range_failure_falls_back(self, tmp_path: Path):
1899
+ """When ``f.file_decode_range`` raises, full decode is used instead."""
1900
+ from tensogram_xarray.array import TensogramBackendArray
1901
+
1902
+ data = np.arange(12, dtype=np.float32).reshape(3, 4)
1903
+ path = str(tmp_path / "range_fail.tgm")
1904
+ with tensogram.TensogramFile.create(path) as f:
1905
+ f.append({"version": 3}, [(_desc([3, 4]), data)])
1906
+
1907
+ class _FailingRange:
1908
+ """Wraps a real file but raises in the range-decode fast path."""
1909
+
1910
+ def __init__(self, inner):
1911
+ self._inner = inner
1912
+
1913
+ def file_decode_range(self, *args, **kwargs):
1914
+ raise RuntimeError("range decode unavailable")
1915
+
1916
+ def read_message(self, *args, **kwargs):
1917
+ return self._inner.read_message(*args, **kwargs)
1918
+
1919
+ with tensogram.TensogramFile.open(path) as inner:
1920
+ wrapper = _FailingRange(inner)
1921
+ ba = TensogramBackendArray(
1922
+ path,
1923
+ 0,
1924
+ 0,
1925
+ (3, 4),
1926
+ np.dtype("float32"),
1927
+ supports_range=True,
1928
+ range_threshold=1.0,
1929
+ shared_file=wrapper,
1930
+ )
1931
+ result = ba._raw_indexing_method((slice(0, 1), slice(0, 2)))
1932
+ np.testing.assert_array_equal(result, data[0:1, 0:2])
1933
+
1934
+
1935
+ class TestExpandKeyIntBranch:
1936
+ """Cover array.py ``_expand_key_to_indices`` int branch."""
1937
+
1938
+ def test_integer_key_wrapped_in_list(self):
1939
+ from tensogram_xarray.array import _expand_key_to_indices
1940
+
1941
+ result = _expand_key_to_indices((3, slice(1, 3)), (5, 10))
1942
+ assert result[0] == [3]
1943
+ assert result[1] == [1, 2]
1944
+
1945
+
1946
+ class TestMappingExtraHintErrorPaths:
1947
+ """Cover mapping.py ``parse_extra_dim_names_hint`` error branches."""
1948
+
1949
+ def test_list_element_str_raises_returns_empty(self):
1950
+ """A list whose elements fail ``str()`` yields ``{}``."""
1951
+ from tensogram_xarray.mapping import parse_extra_dim_names_hint
1952
+
1953
+ class _Unstringable:
1954
+ def __str__(self):
1955
+ raise ValueError("cannot stringify")
1956
+
1957
+ assert parse_extra_dim_names_hint(2, [_Unstringable(), _Unstringable()]) == {}
1958
+
1959
+ def test_dict_non_int_key_returns_empty(self):
1960
+ """A dict with a non-integer key yields ``{}``."""
1961
+ from tensogram_xarray.mapping import parse_extra_dim_names_hint
1962
+
1963
+ assert parse_extra_dim_names_hint(2, {"not_an_int": "values"}) == {}
1964
+
1965
+
1966
+ class TestScannerRemote:
1967
+ """Cover scanner.py remote scan path."""
1968
+
1969
+ def test_scan_file_remote(self):
1970
+ """``scan_file`` opens remote URLs and decodes descriptors lazily."""
1971
+ data = np.arange(6, dtype=np.float32)
1972
+ msg = bytes(tensogram.encode({"version": 3, "source": "remote"}, [(_desc([6]), data)]))
1973
+ url, shutdown = _serve_tgm_bytes(msg)
1974
+ try:
1975
+ index = scan_file(url)
1976
+ finally:
1977
+ shutdown()
1978
+ assert len(index.objects) == 1
1979
+ assert index.objects[0].shape == (6,)
1980
+ assert index.objects[0].common_meta.get("source") == "remote"
1981
+
1982
+
1983
+ class TestMergeRemote:
1984
+ """Cover merge.py remote shared-file open and close."""
1985
+
1986
+ def test_open_datasets_remote_and_close(self):
1987
+ """``open_datasets`` opens a remote shared file and closes cleanly."""
1988
+ data = np.arange(6, dtype=np.float32)
1989
+ msg = bytes(tensogram.encode({"version": 3}, [(_desc([6]), data)]))
1990
+ url, shutdown = _serve_tgm_bytes(msg)
1991
+ try:
1992
+ datasets = open_datasets(url)
1993
+ assert len(datasets) >= 1
1994
+ var = next(iter(datasets[0].data_vars.values()))
1995
+ np.testing.assert_array_equal(var.values, data)
1996
+ # Closing triggers the remote `_close_shared` callback.
1997
+ for ds in datasets:
1998
+ ds.close()
1999
+ finally:
2000
+ shutdown()
2001
+
2002
+
2003
+ class TestPartitionKeysUnhashable:
2004
+ """Cover merge.py ``_partition_keys`` unhashable fallback."""
2005
+
2006
+ def test_unhashable_values_treated_as_constant(self):
2007
+ """Values that remain unhashable after ``_make_hashable`` -> constant."""
2008
+ # ``set`` passes through ``_make_hashable`` unchanged and a set of
2009
+ # sets raises TypeError, exercising the except branch.
2010
+ kv = {"flags": [{1, 2}, {3, 4}]}
2011
+ const, vary = _partition_keys(kv)
2012
+ assert "flags" in const
2013
+ assert const["flags"] == {1, 2}
2014
+ assert vary == {}
2015
+
2016
+
2017
+ class TestHypercubeMissingEntry:
2018
+ """Cover merge.py missing-hypercube-entry guards."""
2019
+
2020
+ def test_missing_entry_without_variable_key(self, tmp_path: Path):
2021
+ """A duplicate + a missing grid corner raises in ``_hypercube_dataset``."""
2022
+ path = str(tmp_path / "missing_entry.tgm")
2023
+ # 4 objects: (a,1),(a,2),(b,1),(b,1) -> 2x2 grid count matches (4) yet
2024
+ # (b,2) is absent and (b,1) is duplicated.
2025
+ combos = [("a", "1"), ("a", "2"), ("b", "1"), ("b", "1")]
2026
+ with tensogram.TensogramFile.create(path) as f:
2027
+ for k1, k2 in combos:
2028
+ f.append(
2029
+ {"version": 3, "base": [{"k1": k1, "k2": k2}]},
2030
+ [(_desc([2, 3]), np.ones((2, 3), dtype=np.float32))],
2031
+ )
2032
+ with pytest.raises(ValueError, match="hypercube has a missing entry"):
2033
+ open_datasets(path)
2034
+
2035
+ def test_missing_entry_with_variable_key(self, tmp_path: Path):
2036
+ """Same defect inside a ``variable_key`` sub-group raises."""
2037
+ path = str(tmp_path / "missing_entry_var.tgm")
2038
+ rows = [
2039
+ ("2t", "d1", "L1"),
2040
+ ("2t", "d1", "L2"),
2041
+ ("2t", "d2", "L1"),
2042
+ ("2t", "d2", "L1"), # duplicate; (d2, L2) missing
2043
+ ("10u", "d1", "L1"),
2044
+ ]
2045
+ with tensogram.TensogramFile.create(path) as f:
2046
+ for param, date, level in rows:
2047
+ f.append(
2048
+ {"version": 3, "base": [{"param": param, "date": date, "level": level}]},
2049
+ [(_desc([2, 3]), np.ones((2, 3), dtype=np.float32))],
2050
+ )
2051
+ with pytest.raises(ValueError, match="hypercube has a missing entry"):
2052
+ open_datasets(path, variable_key="param")
@@ -0,0 +1,123 @@
1
+ # (C) Copyright 2026- ECMWF and individual contributors.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation nor
7
+ # does it submit to any jurisdiction.
8
+
9
+ """verify_hash threading through the xarray backend.
10
+
11
+ The xarray backend re-exposes the upstream
12
+ ``DecodeOptions::verify_hash`` flag via the ``open_dataset(...,
13
+ verify_hash=True)`` keyword. When the lazy backing arrays
14
+ materialise data via ``file_decode_object`` /
15
+ ``decode_object``, the kwarg propagates and integrity errors
16
+ (``MissingHashError`` / ``HashMismatchError``) bubble up to
17
+ the caller's first read.
18
+
19
+ Per Q6 in ``PLAN_DECODE_HASH_VERIFICATION.md``: the partial-
20
+ range fast path silently does *not* verify (range decode
21
+ does not accept ``verify_hash``). Set ``range_threshold=0``
22
+ to force every read through the full-decode path if you
23
+ need uniform coverage.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import numpy as np
29
+ import pytest
30
+ import tensogram
31
+ import xarray as xr
32
+
33
+
34
+ def _build_unhashed_message(tmp_path) -> str:
35
+ """Encode a 1-object f32 message with hashing off + write to disk.
36
+
37
+ Returns the file path. The unhashed encoding is what makes
38
+ cell C (`verify_hash=True` → MissingHashError) testable.
39
+ """
40
+ meta = {"version": 3}
41
+ desc = {
42
+ "type": "ntensor",
43
+ "ndim": 1,
44
+ "shape": [4],
45
+ "strides": [1],
46
+ "dtype": "float32",
47
+ "encoding": "none",
48
+ "filter": "none",
49
+ "compression": "none",
50
+ }
51
+ data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
52
+ msg = bytes(tensogram.encode(meta, [(desc, data)], hash=None))
53
+ path = tmp_path / "unhashed.tgm"
54
+ path.write_bytes(msg)
55
+ return str(path)
56
+
57
+
58
+ def _build_hashed_message(tmp_path) -> str:
59
+ """Encode a 1-object f32 message with hashing on + write to disk."""
60
+ meta = {"version": 3}
61
+ desc = {
62
+ "type": "ntensor",
63
+ "ndim": 1,
64
+ "shape": [4],
65
+ "strides": [1],
66
+ "dtype": "float32",
67
+ "encoding": "none",
68
+ "filter": "none",
69
+ "compression": "none",
70
+ }
71
+ data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
72
+ msg = bytes(tensogram.encode(meta, [(desc, data)], hash="xxh3"))
73
+ path = tmp_path / "hashed.tgm"
74
+ path.write_bytes(msg)
75
+ return str(path)
76
+
77
+
78
+ class TestOpenDatasetVerifyHash:
79
+ def test_verify_hash_default_is_false(self, tmp_path):
80
+ """Default ``verify_hash=False`` decodes both hashed and
81
+ unhashed fixtures cleanly."""
82
+ ds_hashed = xr.open_dataset(_build_hashed_message(tmp_path), engine="tensogram")
83
+ # Force materialisation.
84
+ np.asarray(ds_hashed[next(iter(ds_hashed.data_vars))].values)
85
+ ds_hashed.close()
86
+
87
+ ds_unhashed = xr.open_dataset(_build_unhashed_message(tmp_path), engine="tensogram")
88
+ np.asarray(ds_unhashed[next(iter(ds_unhashed.data_vars))].values)
89
+ ds_unhashed.close()
90
+
91
+ def test_verify_hash_true_succeeds_on_hashed_dataset(self, tmp_path):
92
+ """Cell B equivalent on the xarray surface: opening a
93
+ hashed file with ``verify_hash=True`` and pulling data
94
+ materialises cleanly."""
95
+ ds = xr.open_dataset(
96
+ _build_hashed_message(tmp_path),
97
+ engine="tensogram",
98
+ verify_hash=True,
99
+ # Force the full-decode path so the verification fires.
100
+ range_threshold=0.0,
101
+ )
102
+ arr = np.asarray(ds[next(iter(ds.data_vars))].values)
103
+ np.testing.assert_array_equal(arr, [1.0, 2.0, 3.0, 4.0])
104
+ ds.close()
105
+
106
+ def test_verify_hash_true_raises_missing_hash_on_unhashed(self, tmp_path):
107
+ """Cell C on xarray: open a hashless file with
108
+ ``verify_hash=True`` and the first read raises
109
+ ``MissingHashError`` from the underlying
110
+ :mod:`tensogram` bindings."""
111
+ ds = xr.open_dataset(
112
+ _build_unhashed_message(tmp_path),
113
+ engine="tensogram",
114
+ verify_hash=True,
115
+ # Force the full-decode path so the verification fires.
116
+ range_threshold=0.0,
117
+ )
118
+ try:
119
+ with pytest.raises(tensogram.MissingHashError) as excinfo:
120
+ _ = np.asarray(ds[next(iter(ds.data_vars))].values)
121
+ assert excinfo.value.object_index == 0
122
+ finally:
123
+ ds.close()