tensogram-xarray 0.20.0__tar.gz → 0.22.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/.gitignore +7 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/PKG-INFO +2 -2
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/pyproject.toml +2 -2
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/array.py +16 -3
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/backend.py +23 -5
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/merge.py +7 -9
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/store.py +4 -6
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_coverage.py +268 -0
- tensogram_xarray-0.22.0/tests/test_verify_hash.py +123 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/README.md +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/__init__.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/coords.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/mapping.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/src/tensogram_xarray/scanner.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/__init__.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/conftest.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_array.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_backend.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_coords.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_edge_cases.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_issue_67_descriptor_name_fallback.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_mapping.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_merge.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_nd_range.py +0 -0
- {tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_remote.py +0 -0
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
**/build/
|
|
11
11
|
docs/book/
|
|
12
12
|
python/**/dist/
|
|
13
|
+
# Release wheel staging from `make python-dist-extras` / `make release-check`
|
|
14
|
+
/dist/
|
|
13
15
|
python/bindings/Cargo.lock
|
|
14
16
|
rust/tensogram-grib/Cargo.lock
|
|
15
17
|
rust/tensogram-netcdf/Cargo.lock
|
|
@@ -18,6 +20,11 @@ rust/tensogram-wasm/Cargo.lock
|
|
|
18
20
|
.venv/
|
|
19
21
|
**/__pycache__/
|
|
20
22
|
*.pyc
|
|
23
|
+
|
|
24
|
+
# cargo-mutants working directory + per-run logs
|
|
25
|
+
mutants.out/
|
|
26
|
+
mutants.out.*/
|
|
27
|
+
mutants*.log
|
|
21
28
|
python/bindings/python/tensogram/tensogram*.so
|
|
22
29
|
# TODO do we want to have uv.locks ignored?
|
|
23
30
|
**/uv.lock
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tensogram-xarray
|
|
3
|
-
Version: 0.20.0
|
|
3
|
+
Version: 0.22.0
|
|
4
4
|
Summary: xarray backend engine for tensogram .tgm files
|
|
5
5
|
Project-URL: Homepage, https://sites.ecmwf.int/docs/tensogram/main
|
|
6
6
|
Project-URL: Repository, https://github.com/ecmwf/tensogram
|
|
@@ -14,7 +14,7 @@ Classifier: Topic :: Scientific/Engineering
|
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
|
|
15
15
|
Requires-Python: >=3.11
|
|
16
16
|
Requires-Dist: numpy
|
|
17
|
-
Requires-Dist: tensogram<0.21,>=0.20.0
|
|
17
|
+
Requires-Dist: tensogram<0.23,>=0.22.0
|
|
18
18
|
Requires-Dist: xarray>=2022.06
|
|
19
19
|
Provides-Extra: dask
|
|
20
20
|
Requires-Dist: dask[array]; extra == 'dask'
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tensogram-xarray"
|
|
7
|
-
version = "0.20.0"
|
|
7
|
+
version = "0.22.0"
|
|
8
8
|
description = "xarray backend engine for tensogram .tgm files"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -18,7 +18,7 @@ classifiers = [
|
|
|
18
18
|
"Topic :: Scientific/Engineering :: Atmospheric Science",
|
|
19
19
|
]
|
|
20
20
|
dependencies = [
|
|
21
|
-
"tensogram>=0.20.0,<0.21",
|
|
21
|
+
"tensogram>=0.22.0,<0.23",
|
|
22
22
|
"xarray>=2022.06",
|
|
23
23
|
"numpy",
|
|
24
24
|
]
|
|
@@ -200,11 +200,11 @@ class TensogramBackendArray(BackendArray):
|
|
|
200
200
|
dtype: np.dtype,
|
|
201
201
|
supports_range: bool,
|
|
202
202
|
*,
|
|
203
|
-
verify_hash: bool = False,
|
|
204
203
|
range_threshold: float = DEFAULT_RANGE_THRESHOLD,
|
|
205
204
|
lock: threading.Lock | None = None,
|
|
206
205
|
storage_options: dict[str, Any] | None = None,
|
|
207
206
|
shared_file: Any | None = None,
|
|
207
|
+
verify_hash: bool = False,
|
|
208
208
|
):
|
|
209
209
|
import tensogram
|
|
210
210
|
|
|
@@ -215,10 +215,10 @@ class TensogramBackendArray(BackendArray):
|
|
|
215
215
|
self.shape = shape
|
|
216
216
|
self.dtype = dtype
|
|
217
217
|
self.supports_range = supports_range
|
|
218
|
-
self.verify_hash = verify_hash
|
|
219
218
|
self.range_threshold = range_threshold
|
|
220
219
|
self.storage_options = storage_options
|
|
221
220
|
self._shared_file = shared_file
|
|
221
|
+
self.verify_hash = verify_hash
|
|
222
222
|
|
|
223
223
|
# -- pickle support (no open handles stored) ----------------------------
|
|
224
224
|
|
|
@@ -267,12 +267,25 @@ class TensogramBackendArray(BackendArray):
|
|
|
267
267
|
total_elements = math.prod(self.shape)
|
|
268
268
|
|
|
269
269
|
if total_elements > 0 and total_requested / total_elements <= self.range_threshold:
|
|
270
|
+
# Per the decode-time verification contract
|
|
271
|
+
# (PLAN_DECODE_HASH_VERIFICATION §6, Q6): the
|
|
272
|
+
# range-decode fast path does *not* verify
|
|
273
|
+
# hashes — verifying a whole-frame hash would
|
|
274
|
+
# require reading every byte the optimisation
|
|
275
|
+
# is designed to avoid. When `verify_hash` is
|
|
276
|
+
# True, callers who care about integrity should
|
|
277
|
+
# set ``range_threshold=0`` to force every read
|
|
278
|
+
# through the full-decode path below. We
|
|
279
|
+
# *do* allow this fast path even under
|
|
280
|
+
# ``verify_hash=True`` so the user keeps the
|
|
281
|
+
# remote-fetch cost characteristics they
|
|
282
|
+
# opted in to via ``range_threshold``; the
|
|
283
|
+
# verification simply does not apply.
|
|
270
284
|
arr = f.file_decode_range(
|
|
271
285
|
self.msg_index,
|
|
272
286
|
obj_index=self.obj_index,
|
|
273
287
|
ranges=flat_ranges,
|
|
274
288
|
join=True,
|
|
275
|
-
verify_hash=self.verify_hash,
|
|
276
289
|
native_byte_order=True,
|
|
277
290
|
)
|
|
278
291
|
return np.asarray(arr).reshape(out_shape)
|
|
@@ -55,9 +55,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
|
|
|
55
55
|
variable_key: str | None = None,
|
|
56
56
|
message_index: int = 0,
|
|
57
57
|
merge_objects: bool = False,
|
|
58
|
-
verify_hash: bool = False,
|
|
59
58
|
range_threshold: float = 0.5,
|
|
60
59
|
storage_options: dict[str, Any] | None = None,
|
|
60
|
+
verify_hash: bool = False,
|
|
61
61
|
) -> xr.Dataset:
|
|
62
62
|
"""Open a single tensogram message as an :class:`xr.Dataset`.
|
|
63
63
|
|
|
@@ -79,8 +79,6 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
|
|
|
79
79
|
If *True*, attempt to merge objects across messages by stacking
|
|
80
80
|
along metadata dimensions that vary. When *False* (default),
|
|
81
81
|
only the single message at *message_index* is opened.
|
|
82
|
-
verify_hash
|
|
83
|
-
Whether to verify xxh3 hashes during decode.
|
|
84
82
|
range_threshold
|
|
85
83
|
Maximum fraction of total array elements (0.0-1.0) for which
|
|
86
84
|
partial ``decode_range()`` is used instead of a full
|
|
@@ -88,6 +86,26 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
|
|
|
88
86
|
storage_options
|
|
89
87
|
Key-value pairs forwarded to the object store backend when
|
|
90
88
|
the path is a remote URL. Ignored for local files.
|
|
89
|
+
verify_hash
|
|
90
|
+
When *True*, every full ``decode_object`` (or
|
|
91
|
+
``file_decode_object``) call materialised by this Dataset's
|
|
92
|
+
lazy backing arrays is verified against its inline xxh3
|
|
93
|
+
hash; ``MissingHashError`` / ``HashMismatchError`` from
|
|
94
|
+
the underlying tensogram bindings propagate to the
|
|
95
|
+
caller's first read.
|
|
96
|
+
|
|
97
|
+
**Caveat — partial-range fast path is unverified.** Per
|
|
98
|
+
the decode-time verification contract (see
|
|
99
|
+
``plans/DESIGN.md`` §"Integrity Hashing" and
|
|
100
|
+
``plans/WIRE_FORMAT.md`` §11.1), ``decode_range`` reads
|
|
101
|
+
only a slice of the encoded payload and cannot meaningfully
|
|
102
|
+
verify a whole-frame hash. When ``verify_hash=True`` and
|
|
103
|
+
the lazy reader chooses ``file_decode_range`` (because
|
|
104
|
+
the requested slice is below ``range_threshold``), no
|
|
105
|
+
verification happens for that read. Set
|
|
106
|
+
``range_threshold=0`` to force every read through the
|
|
107
|
+
full-decode path if you need consistent integrity
|
|
108
|
+
coverage.
|
|
91
109
|
|
|
92
110
|
Returns
|
|
93
111
|
-------
|
|
@@ -107,9 +125,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
|
|
|
107
125
|
file_path,
|
|
108
126
|
dim_names=dim_names,
|
|
109
127
|
variable_key=variable_key,
|
|
110
|
-
verify_hash=verify_hash,
|
|
111
128
|
range_threshold=range_threshold,
|
|
112
129
|
storage_options=storage_options,
|
|
130
|
+
verify_hash=verify_hash,
|
|
113
131
|
)
|
|
114
132
|
if not datasets:
|
|
115
133
|
return xr.Dataset()
|
|
@@ -120,9 +138,9 @@ class TensogramBackendEntrypoint(BackendEntrypoint):
|
|
|
120
138
|
msg_index=message_index,
|
|
121
139
|
dim_names=dim_names,
|
|
122
140
|
variable_key=variable_key,
|
|
123
|
-
verify_hash=verify_hash,
|
|
124
141
|
range_threshold=range_threshold,
|
|
125
142
|
storage_options=storage_options,
|
|
143
|
+
verify_hash=verify_hash,
|
|
126
144
|
)
|
|
127
145
|
|
|
128
146
|
drop_set = set(drop_variables) if drop_variables else None
|
|
@@ -51,8 +51,8 @@ def open_datasets(
|
|
|
51
51
|
*,
|
|
52
52
|
dim_names: Sequence[str] | None = None,
|
|
53
53
|
variable_key: str | None = None,
|
|
54
|
-
verify_hash: bool = False,
|
|
55
54
|
range_threshold: float = 0.5,
|
|
55
|
+
verify_hash: bool = False,
|
|
56
56
|
storage_options: dict[str, Any] | None = None,
|
|
57
57
|
) -> list[xr.Dataset]:
|
|
58
58
|
"""Open a ``.tgm`` file, auto-grouping into compatible Datasets.
|
|
@@ -69,8 +69,6 @@ def open_datasets(
|
|
|
69
69
|
Explicit dimension names for the innermost tensor axes.
|
|
70
70
|
variable_key
|
|
71
71
|
Dotted metadata key path for variable naming.
|
|
72
|
-
verify_hash
|
|
73
|
-
Whether to verify hashes on decode.
|
|
74
72
|
range_threshold
|
|
75
73
|
Maximum fraction of total array elements for which partial
|
|
76
74
|
``decode_range()`` is used. Default ``0.5``.
|
|
@@ -114,8 +112,8 @@ def open_datasets(
|
|
|
114
112
|
shape=shape,
|
|
115
113
|
dtype=np_dtype,
|
|
116
114
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
117
|
-
verify_hash=verify_hash,
|
|
118
115
|
range_threshold=range_threshold,
|
|
116
|
+
verify_hash=verify_hash,
|
|
119
117
|
lock=lock,
|
|
120
118
|
storage_options=storage_options,
|
|
121
119
|
shared_file=shared_file,
|
|
@@ -479,8 +477,8 @@ def _single_object_dataset(
|
|
|
479
477
|
shape=shape,
|
|
480
478
|
dtype=np_dtype,
|
|
481
479
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
482
|
-
verify_hash=verify_hash,
|
|
483
480
|
range_threshold=range_threshold,
|
|
481
|
+
verify_hash=verify_hash,
|
|
484
482
|
lock=lock,
|
|
485
483
|
storage_options=storage_options,
|
|
486
484
|
shared_file=shared_file,
|
|
@@ -532,8 +530,8 @@ def _flat_group_dataset(
|
|
|
532
530
|
shape=obj.shape,
|
|
533
531
|
dtype=np_dtype,
|
|
534
532
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
535
|
-
verify_hash=verify_hash,
|
|
536
533
|
range_threshold=range_threshold,
|
|
534
|
+
verify_hash=verify_hash,
|
|
537
535
|
lock=lock,
|
|
538
536
|
storage_options=storage_options,
|
|
539
537
|
shared_file=shared_file,
|
|
@@ -696,8 +694,8 @@ def _build_multi_variable_dataset(
|
|
|
696
694
|
shape=inner_shape,
|
|
697
695
|
dtype=np_dtype,
|
|
698
696
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
699
|
-
verify_hash=verify_hash,
|
|
700
697
|
range_threshold=range_threshold,
|
|
698
|
+
verify_hash=verify_hash,
|
|
701
699
|
lock=lock,
|
|
702
700
|
storage_options=storage_options,
|
|
703
701
|
shared_file=shared_file,
|
|
@@ -786,8 +784,8 @@ def _build_multi_variable_dataset(
|
|
|
786
784
|
shape=inner_shape,
|
|
787
785
|
dtype=np_dtype,
|
|
788
786
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
789
|
-
verify_hash=verify_hash,
|
|
790
787
|
range_threshold=range_threshold,
|
|
788
|
+
verify_hash=verify_hash,
|
|
791
789
|
lock=lock,
|
|
792
790
|
storage_options=storage_options,
|
|
793
791
|
shared_file=shared_file,
|
|
@@ -816,8 +814,8 @@ def _build_multi_variable_dataset(
|
|
|
816
814
|
shape=inner_shape,
|
|
817
815
|
dtype=np_dtype,
|
|
818
816
|
supports_range=_supports_range_decode(obj.descriptor),
|
|
819
|
-
verify_hash=verify_hash,
|
|
820
817
|
range_threshold=range_threshold,
|
|
818
|
+
verify_hash=verify_hash,
|
|
821
819
|
lock=lock,
|
|
822
820
|
storage_options=storage_options,
|
|
823
821
|
shared_file=shared_file,
|
|
@@ -196,8 +196,6 @@ class TensogramDataStore:
|
|
|
196
196
|
Optional user-specified dimension names for data variables.
|
|
197
197
|
variable_key
|
|
198
198
|
Optional dotted metadata path for variable naming.
|
|
199
|
-
verify_hash
|
|
200
|
-
Whether to verify object hashes on decode.
|
|
201
199
|
range_threshold
|
|
202
200
|
Maximum fraction of total array elements (0.0-1.0) for which
|
|
203
201
|
partial ``decode_range()`` is used. Default ``0.5``.
|
|
@@ -214,9 +212,9 @@ class TensogramDataStore:
|
|
|
214
212
|
msg_index: int = 0,
|
|
215
213
|
dim_names: Sequence[str] | None = None,
|
|
216
214
|
variable_key: str | None = None,
|
|
217
|
-
verify_hash: bool = False,
|
|
218
215
|
range_threshold: float = 0.5,
|
|
219
216
|
storage_options: dict[str, Any] | None = None,
|
|
217
|
+
verify_hash: bool = False,
|
|
220
218
|
):
|
|
221
219
|
import tensogram
|
|
222
220
|
|
|
@@ -225,9 +223,9 @@ class TensogramDataStore:
|
|
|
225
223
|
self.msg_index = msg_index
|
|
226
224
|
self.dim_names = dim_names
|
|
227
225
|
self.variable_key = variable_key
|
|
228
|
-
self.verify_hash = verify_hash
|
|
229
226
|
self.range_threshold = range_threshold
|
|
230
227
|
self.storage_options = storage_options
|
|
228
|
+
self.verify_hash = verify_hash
|
|
231
229
|
self._lock = threading.Lock()
|
|
232
230
|
self._backend_arrays: list[TensogramBackendArray] = []
|
|
233
231
|
|
|
@@ -347,11 +345,11 @@ class TensogramDataStore:
|
|
|
347
345
|
shape=shape,
|
|
348
346
|
dtype=np_dtype,
|
|
349
347
|
supports_range=_supports_range_decode(desc),
|
|
350
|
-
verify_hash=self.verify_hash,
|
|
351
348
|
range_threshold=self.range_threshold,
|
|
352
349
|
lock=self._lock,
|
|
353
350
|
storage_options=self.storage_options,
|
|
354
351
|
shared_file=self._file,
|
|
352
|
+
verify_hash=self.verify_hash,
|
|
355
353
|
)
|
|
356
354
|
self._backend_arrays.append(backend_array)
|
|
357
355
|
lazy_data = indexing.LazilyIndexedArray(backend_array)
|
|
@@ -387,11 +385,11 @@ class TensogramDataStore:
|
|
|
387
385
|
shape=shape,
|
|
388
386
|
dtype=np_dtype,
|
|
389
387
|
supports_range=_supports_range_decode(desc),
|
|
390
|
-
verify_hash=self.verify_hash,
|
|
391
388
|
range_threshold=self.range_threshold,
|
|
392
389
|
lock=self._lock,
|
|
393
390
|
storage_options=self.storage_options,
|
|
394
391
|
shared_file=self._file,
|
|
392
|
+
verify_hash=self.verify_hash,
|
|
395
393
|
)
|
|
396
394
|
self._backend_arrays.append(backend_array)
|
|
397
395
|
|
|
@@ -1782,3 +1782,271 @@ class TestMergePathDimResolution:
|
|
|
1782
1782
|
"merge path must not misread message-level _extra_['dim_names'] as a per-object hint"
|
|
1783
1783
|
)
|
|
1784
1784
|
assert datasets, "expected at least one Dataset despite inconsistent hints"
|
|
1785
|
+
|
|
1786
|
+
|
|
1787
|
+
# ---------------------------------------------------------------------------
|
|
1788
|
+
# Coverage pass 4: remaining audit gaps (deterministic, no-network)
|
|
1789
|
+
# ---------------------------------------------------------------------------
|
|
1790
|
+
|
|
1791
|
+
|
|
1792
|
+
def _serve_tgm_bytes(msg: bytes):
|
|
1793
|
+
"""Serve raw .tgm *msg* bytes over a local loopback HTTP server.
|
|
1794
|
+
|
|
1795
|
+
Returns ``(url, shutdown)``. ``shutdown`` must be called to stop the
|
|
1796
|
+
server. Supports Range requests so the tensogram object-store backend
|
|
1797
|
+
can byte-range fetch descriptors/objects.
|
|
1798
|
+
"""
|
|
1799
|
+
import http.server
|
|
1800
|
+
import threading
|
|
1801
|
+
|
|
1802
|
+
class Handler(http.server.BaseHTTPRequestHandler):
|
|
1803
|
+
def log_message(self, fmt, *args):
|
|
1804
|
+
pass
|
|
1805
|
+
|
|
1806
|
+
def do_HEAD(self):
|
|
1807
|
+
self.send_response(200)
|
|
1808
|
+
self.send_header("Content-Length", str(len(msg)))
|
|
1809
|
+
self.send_header("Accept-Ranges", "bytes")
|
|
1810
|
+
self.end_headers()
|
|
1811
|
+
|
|
1812
|
+
def do_GET(self):
|
|
1813
|
+
range_header = self.headers.get("Range")
|
|
1814
|
+
if range_header and range_header.startswith("bytes="):
|
|
1815
|
+
spec = range_header[6:]
|
|
1816
|
+
if spec.startswith("-"):
|
|
1817
|
+
n = int(spec[1:])
|
|
1818
|
+
s, e = max(0, len(msg) - n), len(msg)
|
|
1819
|
+
else:
|
|
1820
|
+
parts = spec.split("-")
|
|
1821
|
+
s = int(parts[0])
|
|
1822
|
+
e = int(parts[1]) + 1 if parts[1] else len(msg)
|
|
1823
|
+
e = min(e, len(msg))
|
|
1824
|
+
if s >= len(msg):
|
|
1825
|
+
self.send_response(416)
|
|
1826
|
+
self.end_headers()
|
|
1827
|
+
return
|
|
1828
|
+
chunk = msg[s:e]
|
|
1829
|
+
self.send_response(206)
|
|
1830
|
+
self.send_header("Content-Range", f"bytes {s}-{e - 1}/{len(msg)}")
|
|
1831
|
+
self.send_header("Content-Length", str(len(chunk)))
|
|
1832
|
+
self.end_headers()
|
|
1833
|
+
self.wfile.write(chunk)
|
|
1834
|
+
else:
|
|
1835
|
+
self.send_response(200)
|
|
1836
|
+
self.send_header("Content-Length", str(len(msg)))
|
|
1837
|
+
self.end_headers()
|
|
1838
|
+
self.wfile.write(msg)
|
|
1839
|
+
|
|
1840
|
+
server = http.server.ThreadingHTTPServer(("127.0.0.1", 0), Handler)
|
|
1841
|
+
port = server.server_address[1]
|
|
1842
|
+
thread = threading.Thread(target=server.serve_forever, daemon=True)
|
|
1843
|
+
thread.start()
|
|
1844
|
+
|
|
1845
|
+
def shutdown():
|
|
1846
|
+
server.shutdown()
|
|
1847
|
+
server.server_close()
|
|
1848
|
+
thread.join(timeout=5)
|
|
1849
|
+
|
|
1850
|
+
return f"http://127.0.0.1:{port}/test.tgm", shutdown
|
|
1851
|
+
|
|
1852
|
+
|
|
1853
|
+
class TestArrayGetFile:
|
|
1854
|
+
"""Cover array.py ``_get_file`` branches."""
|
|
1855
|
+
|
|
1856
|
+
def test_get_file_returns_shared(self):
|
|
1857
|
+
"""``_get_file`` short-circuits to the shared handle."""
|
|
1858
|
+
from tensogram_xarray.array import TensogramBackendArray
|
|
1859
|
+
|
|
1860
|
+
sentinel = object()
|
|
1861
|
+
ba = TensogramBackendArray(
|
|
1862
|
+
"/nowhere.tgm",
|
|
1863
|
+
0,
|
|
1864
|
+
0,
|
|
1865
|
+
(2,),
|
|
1866
|
+
np.dtype("float32"),
|
|
1867
|
+
supports_range=False,
|
|
1868
|
+
shared_file=sentinel,
|
|
1869
|
+
)
|
|
1870
|
+
assert ba._get_file() is sentinel
|
|
1871
|
+
|
|
1872
|
+
def test_get_file_remote_opens_remote(self):
|
|
1873
|
+
"""``_get_file`` calls ``open_remote`` for a remote URL."""
|
|
1874
|
+
from unittest.mock import patch
|
|
1875
|
+
|
|
1876
|
+
from tensogram_xarray.array import TensogramBackendArray
|
|
1877
|
+
|
|
1878
|
+
with (
|
|
1879
|
+
patch("tensogram.is_remote_url", return_value=True),
|
|
1880
|
+
patch("tensogram.TensogramFile") as mock_file,
|
|
1881
|
+
):
|
|
1882
|
+
ba = TensogramBackendArray(
|
|
1883
|
+
"s3://bucket/x.tgm",
|
|
1884
|
+
0,
|
|
1885
|
+
0,
|
|
1886
|
+
(2,),
|
|
1887
|
+
np.dtype("float32"),
|
|
1888
|
+
supports_range=False,
|
|
1889
|
+
storage_options={"region": "eu"},
|
|
1890
|
+
)
|
|
1891
|
+
ba._get_file()
|
|
1892
|
+
mock_file.open_remote.assert_called_once_with("s3://bucket/x.tgm", {"region": "eu"})
|
|
1893
|
+
|
|
1894
|
+
|
|
1895
|
+
class TestArrayDecodeRangeFallbackReal:
|
|
1896
|
+
"""Cover array.py decode_range exception fallback."""
|
|
1897
|
+
|
|
1898
|
+
def test_file_decode_range_failure_falls_back(self, tmp_path: Path):
|
|
1899
|
+
"""When ``f.file_decode_range`` raises, full decode is used instead."""
|
|
1900
|
+
from tensogram_xarray.array import TensogramBackendArray
|
|
1901
|
+
|
|
1902
|
+
data = np.arange(12, dtype=np.float32).reshape(3, 4)
|
|
1903
|
+
path = str(tmp_path / "range_fail.tgm")
|
|
1904
|
+
with tensogram.TensogramFile.create(path) as f:
|
|
1905
|
+
f.append({"version": 3}, [(_desc([3, 4]), data)])
|
|
1906
|
+
|
|
1907
|
+
class _FailingRange:
|
|
1908
|
+
"""Wraps a real file but raises in the range-decode fast path."""
|
|
1909
|
+
|
|
1910
|
+
def __init__(self, inner):
|
|
1911
|
+
self._inner = inner
|
|
1912
|
+
|
|
1913
|
+
def file_decode_range(self, *args, **kwargs):
|
|
1914
|
+
raise RuntimeError("range decode unavailable")
|
|
1915
|
+
|
|
1916
|
+
def read_message(self, *args, **kwargs):
|
|
1917
|
+
return self._inner.read_message(*args, **kwargs)
|
|
1918
|
+
|
|
1919
|
+
with tensogram.TensogramFile.open(path) as inner:
|
|
1920
|
+
wrapper = _FailingRange(inner)
|
|
1921
|
+
ba = TensogramBackendArray(
|
|
1922
|
+
path,
|
|
1923
|
+
0,
|
|
1924
|
+
0,
|
|
1925
|
+
(3, 4),
|
|
1926
|
+
np.dtype("float32"),
|
|
1927
|
+
supports_range=True,
|
|
1928
|
+
range_threshold=1.0,
|
|
1929
|
+
shared_file=wrapper,
|
|
1930
|
+
)
|
|
1931
|
+
result = ba._raw_indexing_method((slice(0, 1), slice(0, 2)))
|
|
1932
|
+
np.testing.assert_array_equal(result, data[0:1, 0:2])
|
|
1933
|
+
|
|
1934
|
+
|
|
1935
|
+
class TestExpandKeyIntBranch:
|
|
1936
|
+
"""Cover array.py ``_expand_key_to_indices`` int branch."""
|
|
1937
|
+
|
|
1938
|
+
def test_integer_key_wrapped_in_list(self):
|
|
1939
|
+
from tensogram_xarray.array import _expand_key_to_indices
|
|
1940
|
+
|
|
1941
|
+
result = _expand_key_to_indices((3, slice(1, 3)), (5, 10))
|
|
1942
|
+
assert result[0] == [3]
|
|
1943
|
+
assert result[1] == [1, 2]
|
|
1944
|
+
|
|
1945
|
+
|
|
1946
|
+
class TestMappingExtraHintErrorPaths:
|
|
1947
|
+
"""Cover mapping.py ``parse_extra_dim_names_hint`` error branches."""
|
|
1948
|
+
|
|
1949
|
+
def test_list_element_str_raises_returns_empty(self):
|
|
1950
|
+
"""A list whose elements fail ``str()`` yields ``{}``."""
|
|
1951
|
+
from tensogram_xarray.mapping import parse_extra_dim_names_hint
|
|
1952
|
+
|
|
1953
|
+
class _Unstringable:
|
|
1954
|
+
def __str__(self):
|
|
1955
|
+
raise ValueError("cannot stringify")
|
|
1956
|
+
|
|
1957
|
+
assert parse_extra_dim_names_hint(2, [_Unstringable(), _Unstringable()]) == {}
|
|
1958
|
+
|
|
1959
|
+
def test_dict_non_int_key_returns_empty(self):
|
|
1960
|
+
"""A dict with a non-integer key yields ``{}``."""
|
|
1961
|
+
from tensogram_xarray.mapping import parse_extra_dim_names_hint
|
|
1962
|
+
|
|
1963
|
+
assert parse_extra_dim_names_hint(2, {"not_an_int": "values"}) == {}
|
|
1964
|
+
|
|
1965
|
+
|
|
1966
|
+
class TestScannerRemote:
|
|
1967
|
+
"""Cover scanner.py remote scan path."""
|
|
1968
|
+
|
|
1969
|
+
def test_scan_file_remote(self):
|
|
1970
|
+
"""``scan_file`` opens remote URLs and decodes descriptors lazily."""
|
|
1971
|
+
data = np.arange(6, dtype=np.float32)
|
|
1972
|
+
msg = bytes(tensogram.encode({"version": 3, "source": "remote"}, [(_desc([6]), data)]))
|
|
1973
|
+
url, shutdown = _serve_tgm_bytes(msg)
|
|
1974
|
+
try:
|
|
1975
|
+
index = scan_file(url)
|
|
1976
|
+
finally:
|
|
1977
|
+
shutdown()
|
|
1978
|
+
assert len(index.objects) == 1
|
|
1979
|
+
assert index.objects[0].shape == (6,)
|
|
1980
|
+
assert index.objects[0].common_meta.get("source") == "remote"
|
|
1981
|
+
|
|
1982
|
+
|
|
1983
|
+
class TestMergeRemote:
|
|
1984
|
+
"""Cover merge.py remote shared-file open and close."""
|
|
1985
|
+
|
|
1986
|
+
def test_open_datasets_remote_and_close(self):
|
|
1987
|
+
"""``open_datasets`` opens a remote shared file and closes cleanly."""
|
|
1988
|
+
data = np.arange(6, dtype=np.float32)
|
|
1989
|
+
msg = bytes(tensogram.encode({"version": 3}, [(_desc([6]), data)]))
|
|
1990
|
+
url, shutdown = _serve_tgm_bytes(msg)
|
|
1991
|
+
try:
|
|
1992
|
+
datasets = open_datasets(url)
|
|
1993
|
+
assert len(datasets) >= 1
|
|
1994
|
+
var = next(iter(datasets[0].data_vars.values()))
|
|
1995
|
+
np.testing.assert_array_equal(var.values, data)
|
|
1996
|
+
# Closing triggers the remote `_close_shared` callback.
|
|
1997
|
+
for ds in datasets:
|
|
1998
|
+
ds.close()
|
|
1999
|
+
finally:
|
|
2000
|
+
shutdown()
|
|
2001
|
+
|
|
2002
|
+
|
|
2003
|
+
class TestPartitionKeysUnhashable:
|
|
2004
|
+
"""Cover merge.py ``_partition_keys`` unhashable fallback."""
|
|
2005
|
+
|
|
2006
|
+
def test_unhashable_values_treated_as_constant(self):
|
|
2007
|
+
"""Values that remain unhashable after ``_make_hashable`` -> constant."""
|
|
2008
|
+
# ``set`` passes through ``_make_hashable`` unchanged and a set of
|
|
2009
|
+
# sets raises TypeError, exercising the except branch.
|
|
2010
|
+
kv = {"flags": [{1, 2}, {3, 4}]}
|
|
2011
|
+
const, vary = _partition_keys(kv)
|
|
2012
|
+
assert "flags" in const
|
|
2013
|
+
assert const["flags"] == {1, 2}
|
|
2014
|
+
assert vary == {}
|
|
2015
|
+
|
|
2016
|
+
|
|
2017
|
+
class TestHypercubeMissingEntry:
|
|
2018
|
+
"""Cover merge.py missing-hypercube-entry guards."""
|
|
2019
|
+
|
|
2020
|
+
def test_missing_entry_without_variable_key(self, tmp_path: Path):
|
|
2021
|
+
"""A duplicate + a missing grid corner raises in ``_hypercube_dataset``."""
|
|
2022
|
+
path = str(tmp_path / "missing_entry.tgm")
|
|
2023
|
+
# 4 objects: (a,1),(a,2),(b,1),(b,1) -> 2x2 grid count matches (4) yet
|
|
2024
|
+
# (b,2) is absent and (b,1) is duplicated.
|
|
2025
|
+
combos = [("a", "1"), ("a", "2"), ("b", "1"), ("b", "1")]
|
|
2026
|
+
with tensogram.TensogramFile.create(path) as f:
|
|
2027
|
+
for k1, k2 in combos:
|
|
2028
|
+
f.append(
|
|
2029
|
+
{"version": 3, "base": [{"k1": k1, "k2": k2}]},
|
|
2030
|
+
[(_desc([2, 3]), np.ones((2, 3), dtype=np.float32))],
|
|
2031
|
+
)
|
|
2032
|
+
with pytest.raises(ValueError, match="hypercube has a missing entry"):
|
|
2033
|
+
open_datasets(path)
|
|
2034
|
+
|
|
2035
|
+
def test_missing_entry_with_variable_key(self, tmp_path: Path):
|
|
2036
|
+
"""Same defect inside a ``variable_key`` sub-group raises."""
|
|
2037
|
+
path = str(tmp_path / "missing_entry_var.tgm")
|
|
2038
|
+
rows = [
|
|
2039
|
+
("2t", "d1", "L1"),
|
|
2040
|
+
("2t", "d1", "L2"),
|
|
2041
|
+
("2t", "d2", "L1"),
|
|
2042
|
+
("2t", "d2", "L1"), # duplicate; (d2, L2) missing
|
|
2043
|
+
("10u", "d1", "L1"),
|
|
2044
|
+
]
|
|
2045
|
+
with tensogram.TensogramFile.create(path) as f:
|
|
2046
|
+
for param, date, level in rows:
|
|
2047
|
+
f.append(
|
|
2048
|
+
{"version": 3, "base": [{"param": param, "date": date, "level": level}]},
|
|
2049
|
+
[(_desc([2, 3]), np.ones((2, 3), dtype=np.float32))],
|
|
2050
|
+
)
|
|
2051
|
+
with pytest.raises(ValueError, match="hypercube has a missing entry"):
|
|
2052
|
+
open_datasets(path, variable_key="param")
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# (C) Copyright 2026- ECMWF and individual contributors.
|
|
2
|
+
#
|
|
3
|
+
# This software is licensed under the terms of the Apache Licence Version 2.0
|
|
4
|
+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
|
5
|
+
# In applying this licence, ECMWF does not waive the privileges and immunities
|
|
6
|
+
# granted to it by virtue of its status as an intergovernmental organisation nor
|
|
7
|
+
# does it submit to any jurisdiction.
|
|
8
|
+
|
|
9
|
+
"""verify_hash threading through the xarray backend.
|
|
10
|
+
|
|
11
|
+
The xarray backend re-exposes the upstream
|
|
12
|
+
``DecodeOptions::verify_hash`` flag via the ``open_dataset(...,
|
|
13
|
+
verify_hash=True)`` keyword. When the lazy backing arrays
|
|
14
|
+
materialise data via ``file_decode_object`` /
|
|
15
|
+
``decode_object``, the kwarg propagates and integrity errors
|
|
16
|
+
(``MissingHashError`` / ``HashMismatchError``) bubble up to
|
|
17
|
+
the caller's first read.
|
|
18
|
+
|
|
19
|
+
Per Q6 in ``PLAN_DECODE_HASH_VERIFICATION.md``: the partial-
|
|
20
|
+
range fast path silently does *not* verify (range decode
|
|
21
|
+
does not accept ``verify_hash``). Set ``range_threshold=0``
|
|
22
|
+
to force every read through the full-decode path if you
|
|
23
|
+
need uniform coverage.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pytest
|
|
30
|
+
import tensogram
|
|
31
|
+
import xarray as xr
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _build_unhashed_message(tmp_path) -> str:
|
|
35
|
+
"""Encode a 1-object f32 message with hashing off + write to disk.
|
|
36
|
+
|
|
37
|
+
Returns the file path. The unhashed encoding is what makes
|
|
38
|
+
cell C (`verify_hash=True` → MissingHashError) testable.
|
|
39
|
+
"""
|
|
40
|
+
meta = {"version": 3}
|
|
41
|
+
desc = {
|
|
42
|
+
"type": "ntensor",
|
|
43
|
+
"ndim": 1,
|
|
44
|
+
"shape": [4],
|
|
45
|
+
"strides": [1],
|
|
46
|
+
"dtype": "float32",
|
|
47
|
+
"encoding": "none",
|
|
48
|
+
"filter": "none",
|
|
49
|
+
"compression": "none",
|
|
50
|
+
}
|
|
51
|
+
data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
|
|
52
|
+
msg = bytes(tensogram.encode(meta, [(desc, data)], hash=None))
|
|
53
|
+
path = tmp_path / "unhashed.tgm"
|
|
54
|
+
path.write_bytes(msg)
|
|
55
|
+
return str(path)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _build_hashed_message(tmp_path) -> str:
|
|
59
|
+
"""Encode a 1-object f32 message with hashing on + write to disk."""
|
|
60
|
+
meta = {"version": 3}
|
|
61
|
+
desc = {
|
|
62
|
+
"type": "ntensor",
|
|
63
|
+
"ndim": 1,
|
|
64
|
+
"shape": [4],
|
|
65
|
+
"strides": [1],
|
|
66
|
+
"dtype": "float32",
|
|
67
|
+
"encoding": "none",
|
|
68
|
+
"filter": "none",
|
|
69
|
+
"compression": "none",
|
|
70
|
+
}
|
|
71
|
+
data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
|
|
72
|
+
msg = bytes(tensogram.encode(meta, [(desc, data)], hash="xxh3"))
|
|
73
|
+
path = tmp_path / "hashed.tgm"
|
|
74
|
+
path.write_bytes(msg)
|
|
75
|
+
return str(path)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class TestOpenDatasetVerifyHash:
|
|
79
|
+
def test_verify_hash_default_is_false(self, tmp_path):
|
|
80
|
+
"""Default ``verify_hash=False`` decodes both hashed and
|
|
81
|
+
unhashed fixtures cleanly."""
|
|
82
|
+
ds_hashed = xr.open_dataset(_build_hashed_message(tmp_path), engine="tensogram")
|
|
83
|
+
# Force materialisation.
|
|
84
|
+
np.asarray(ds_hashed[next(iter(ds_hashed.data_vars))].values)
|
|
85
|
+
ds_hashed.close()
|
|
86
|
+
|
|
87
|
+
ds_unhashed = xr.open_dataset(_build_unhashed_message(tmp_path), engine="tensogram")
|
|
88
|
+
np.asarray(ds_unhashed[next(iter(ds_unhashed.data_vars))].values)
|
|
89
|
+
ds_unhashed.close()
|
|
90
|
+
|
|
91
|
+
def test_verify_hash_true_succeeds_on_hashed_dataset(self, tmp_path):
|
|
92
|
+
"""Cell B equivalent on the xarray surface: opening a
|
|
93
|
+
hashed file with ``verify_hash=True`` and pulling data
|
|
94
|
+
materialises cleanly."""
|
|
95
|
+
ds = xr.open_dataset(
|
|
96
|
+
_build_hashed_message(tmp_path),
|
|
97
|
+
engine="tensogram",
|
|
98
|
+
verify_hash=True,
|
|
99
|
+
# Force the full-decode path so the verification fires.
|
|
100
|
+
range_threshold=0.0,
|
|
101
|
+
)
|
|
102
|
+
arr = np.asarray(ds[next(iter(ds.data_vars))].values)
|
|
103
|
+
np.testing.assert_array_equal(arr, [1.0, 2.0, 3.0, 4.0])
|
|
104
|
+
ds.close()
|
|
105
|
+
|
|
106
|
+
def test_verify_hash_true_raises_missing_hash_on_unhashed(self, tmp_path):
|
|
107
|
+
"""Cell C on xarray: open a hashless file with
|
|
108
|
+
``verify_hash=True`` and the first read raises
|
|
109
|
+
``MissingHashError`` from the underlying
|
|
110
|
+
:mod:`tensogram` bindings."""
|
|
111
|
+
ds = xr.open_dataset(
|
|
112
|
+
_build_unhashed_message(tmp_path),
|
|
113
|
+
engine="tensogram",
|
|
114
|
+
verify_hash=True,
|
|
115
|
+
# Force the full-decode path so the verification fires.
|
|
116
|
+
range_threshold=0.0,
|
|
117
|
+
)
|
|
118
|
+
try:
|
|
119
|
+
with pytest.raises(tensogram.MissingHashError) as excinfo:
|
|
120
|
+
_ = np.asarray(ds[next(iter(ds.data_vars))].values)
|
|
121
|
+
assert excinfo.value.object_index == 0
|
|
122
|
+
finally:
|
|
123
|
+
ds.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{tensogram_xarray-0.20.0 → tensogram_xarray-0.22.0}/tests/test_issue_67_descriptor_name_fallback.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|