tensogram-xarray 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensogram_xarray/__init__.py +24 -0
- tensogram_xarray/array.py +408 -0
- tensogram_xarray/backend.py +139 -0
- tensogram_xarray/coords.py +113 -0
- tensogram_xarray/mapping.py +91 -0
- tensogram_xarray/merge.py +832 -0
- tensogram_xarray/scanner.py +196 -0
- tensogram_xarray/store.py +383 -0
- tensogram_xarray-0.14.0.dist-info/METADATA +23 -0
- tensogram_xarray-0.14.0.dist-info/RECORD +12 -0
- tensogram_xarray-0.14.0.dist-info/WHEEL +4 -0
- tensogram_xarray-0.14.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,832 @@
|
|
|
1
|
+
# (C) Copyright 2026- ECMWF and individual contributors.
|
|
2
|
+
#
|
|
3
|
+
# This software is licensed under the terms of the Apache Licence Version 2.0
|
|
4
|
+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
|
5
|
+
# In applying this licence, ECMWF does not waive the privileges and immunities
|
|
6
|
+
# granted to it by virtue of its status as an intergovernmental organisation nor
|
|
7
|
+
# does it submit to any jurisdiction.
|
|
8
|
+
|
|
9
|
+
"""Auto-merge and auto-split for multi-message tensogram files.
|
|
10
|
+
|
|
11
|
+
``open_datasets()`` scans all messages in a ``.tgm`` file, groups compatible
|
|
12
|
+
data objects (same shape, dtype, metadata structure) into hypercubes, and
|
|
13
|
+
returns a list of :class:`xr.Dataset` instances. Incompatible objects are
|
|
14
|
+
automatically split into separate Datasets.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import itertools
|
|
20
|
+
import logging
|
|
21
|
+
import threading
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
from collections.abc import Sequence
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
import xarray as xr
|
|
27
|
+
from xarray.core import indexing
|
|
28
|
+
|
|
29
|
+
from tensogram_xarray.array import (
|
|
30
|
+
StackedBackendArray,
|
|
31
|
+
TensogramBackendArray,
|
|
32
|
+
_supports_range_decode,
|
|
33
|
+
)
|
|
34
|
+
from tensogram_xarray.coords import detect_coords
|
|
35
|
+
from tensogram_xarray.mapping import resolve_dim_names, resolve_variable_name
|
|
36
|
+
from tensogram_xarray.scanner import ObjectInfo, scan_file
|
|
37
|
+
from tensogram_xarray.store import _to_numpy_dtype
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def open_datasets(
    path: str,
    *,
    dim_names: Sequence[str] | None = None,
    variable_key: str | None = None,
    verify_hash: bool = False,
    range_threshold: float = 0.5,
    storage_options: dict[str, Any] | None = None,
) -> list[xr.Dataset]:
    """Scan a ``.tgm`` file and return one Dataset per compatible group.

    Objects reported as coordinates by ``detect_coords`` become coordinate
    variables shared by every returned Dataset.  The remaining objects are
    bucketed by ``(shape, dtype)`` and each bucket is handed to
    ``_build_dataset_from_group``, which stacks objects along new outer
    dimensions when their varying metadata allows it.

    Parameters
    ----------
    path
        Path or remote URL (S3, GCS, Azure, HTTP) to the ``.tgm`` file.
    dim_names
        Explicit dimension names for the innermost tensor axes.
    variable_key
        Dotted metadata key path for variable naming.
    verify_hash
        Whether to verify hashes on decode.
    range_threshold
        Maximum fraction of total array elements for which partial
        ``decode_range()`` is used.  Default ``0.5``.
    storage_options
        Key-value pairs forwarded to the object store backend for
        remote URLs.  Ignored for local files.

    Returns
    -------
    list[xr.Dataset]
        One Dataset per compatible group.  An empty list when the file holds
        no objects; a single coordinate-only Dataset (``attrs["source"]`` set
        to *path*) when no group produced a Dataset.

    Raises
    ------
    ValueError
        If two coordinate objects map to the same dimension name but have
        different shapes.
    """
    file_index = scan_file(path, storage_options=storage_options)
    if not file_index.objects:
        return []

    # Imported here rather than at module import time, as in the original.
    import tensogram

    # For remote URLs a single file handle is opened once and handed to every
    # backend array created below.
    shared_file = None
    if tensogram.is_remote_url(path):
        shared_file = tensogram.TensogramFile.open_remote(path, storage_options or {})

    coord_indices, var_indices, coord_dim_map = detect_coords(
        [info.merged_meta for info in file_index.objects]
    )

    lock = threading.Lock()
    all_backend_arrays: list[TensogramBackendArray] = []

    # Options common to every backend array created for this file.
    decode_options: dict[str, Any] = {
        "verify_hash": verify_hash,
        "range_threshold": range_threshold,
        "lock": lock,
        "storage_options": storage_options,
        "shared_file": shared_file,
    }

    coord_vars: dict[str, xr.Variable] = {}
    for coord_index in coord_indices:
        info = file_index.objects[coord_index]
        coord_name = coord_dim_map[coord_index]
        coord_dtype = _to_numpy_dtype(info.dtype)
        coord_shape = info.shape

        coord_backend = TensogramBackendArray(
            file_path=path,
            msg_index=info.msg_index,
            obj_index=info.obj_index,
            shape=coord_shape,
            dtype=coord_dtype,
            supports_range=_supports_range_decode(info.descriptor),
            **decode_options,
        )
        all_backend_arrays.append(coord_backend)
        lazy_coord = indexing.LazilyIndexedArray(coord_backend)

        previous = coord_vars.get(coord_name)
        if previous is None:
            coord_vars[coord_name] = xr.Variable(
                (coord_name,), lazy_coord, dict(info.per_object_meta)
            )
        elif previous.shape != coord_shape:
            msg = (
                f"coordinate {coord_name!r} has conflicting shapes: "
                f"existing {previous.shape} vs new {coord_shape} "
                f"(msg_index={info.msg_index}, obj_index={info.obj_index})"
            )
            raise ValueError(msg)
        # Otherwise: duplicate with a matching shape -- the first one is kept.

    # Group data objects by structural compatibility and build the Datasets.
    data_objects = [file_index.objects[i] for i in var_indices]
    datasets: list[xr.Dataset] = []
    for group in _group_by_structure(data_objects):
        built = _build_dataset_from_group(
            group,
            file_path=path,
            coord_vars=coord_vars,
            dim_names=dim_names,
            variable_key=variable_key,
            backend_arrays=all_backend_arrays,
            **decode_options,
        )
        if built is not None:
            datasets.append(built)

    if not datasets:
        datasets = [xr.Dataset(coords=coord_vars, attrs={"source": path})]

    if shared_file is not None:

        def _close_shared():
            # Drop every reference to the shared remote handle: the one held
            # by each registered backend array and the one in this closure.
            nonlocal shared_file
            for arr in all_backend_arrays:
                arr._shared_file = None
            all_backend_arrays.clear()
            shared_file = None

        # The same callback is installed on every Dataset, so closing any one
        # of them releases the handle for all of them.
        for dataset in datasets:
            dataset.set_close(_close_shared)

    return datasets
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
# Grouping
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
_StructureKey = tuple[tuple[int, ...], str]  # (shape, dtype)


def _group_by_structure(
    objects: list[ObjectInfo],
) -> list[list[ObjectInfo]]:
    """Bucket objects that share the same ``(shape, dtype)`` pair.

    Buckets are returned in the order their key was first encountered, and
    objects keep their input order inside each bucket.
    """
    grouped: dict[_StructureKey, list[ObjectInfo]] = {}
    for info in objects:
        signature: _StructureKey = (info.shape, info.dtype)
        grouped.setdefault(signature, []).append(info)
    return list(grouped.values())
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
# Hypercube construction
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _extract_meta_keys(objects: list[ObjectInfo]) -> dict[str, list[Any]]:
    """Collect, per metadata key, the value each object holds for it.

    The result maps every key found in any object's ``merged_meta`` (in
    sorted order) to a list aligned with *objects*; an object that lacks the
    key contributes ``None`` at its position.
    """
    names: set[str] = set().union(*(member.merged_meta.keys() for member in objects))
    return {
        name: [member.merged_meta.get(name) for member in objects]
        for name in sorted(names)
    }
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _partition_keys(
    key_values: dict[str, list[Any]],
) -> tuple[dict[str, Any], dict[str, list[Any]]]:
    """Separate metadata keys into constant ones and varying ones.

    Returns
    -------
    constant
        Keys whose values collapse to a single distinct value, mapped to the
        first value -> Dataset attributes.  Keys whose values cannot be
        hashed even after ``_make_hashable`` also land here.
    varying
        Keys with any other number of distinct values, mapped to the full
        per-object value list -> candidate outer dimensions.
    """
    constant: dict[str, Any] = {}
    varying: dict[str, list[Any]] = {}

    for name, column in key_values.items():
        try:
            distinct = {_make_hashable(item) for item in column}
        except TypeError:
            # Could not be made hashable -> keep the first value as an attribute.
            constant[name] = column[0]
            continue

        if len(distinct) != 1:
            varying[name] = column
        else:
            constant[name] = column[0]

    return constant, varying
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _make_hashable(val: Any) -> Any:
    """Return a hashable stand-in for *val*, for use in sets and dict keys.

    Dicts become a tuple of ``(key, converted value)`` pairs sorted by pair,
    lists become a tuple of converted items, and both conversions recurse.
    Every other value is returned unchanged, so it is only hashable if it
    already was.
    """
    if isinstance(val, dict):
        pairs = [(name, _make_hashable(item)) for name, item in val.items()]
        pairs.sort()
        return tuple(pairs)
    if isinstance(val, list):
        return tuple(map(_make_hashable, val))
    return val
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _unique_values(values: list[Any]) -> list[Any]:
    """Drop repeated values while keeping first-occurrence order.

    Values are compared through ``_make_hashable``, so dicts and lists can
    be deduplicated; the returned list holds the original values, not their
    hashable stand-ins.
    """
    first_seen: dict[Any, Any] = {}
    for item in values:
        # setdefault keeps the first value recorded for each hashable form.
        first_seen.setdefault(_make_hashable(item), item)
    return list(first_seen.values())
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _try_hypercube(
    objects: list[ObjectInfo],
    varying: dict[str, list[Any]],
) -> bool:
    """Check whether the varying keys form a complete hypercube.

    A complete hypercube means every combination of unique values across
    all varying keys has exactly one corresponding object.

    Parameters
    ----------
    objects
        The candidate objects; only their count is used.
    varying
        Varying metadata keys mapped to per-object value lists, aligned
        with *objects*.

    Returns
    -------
    bool
        ``True`` when *varying* is empty, or when the number of objects
        equals the product of the per-key unique value counts *and* no two
        objects share the same combination of values.
    """
    if not varying:
        return True

    # Hashable form of every value, one column per varying key.
    hashed_columns = [
        [_make_hashable(value) for value in values] for values in varying.values()
    ]

    # Total expected combinations.
    expected = 1
    for column in hashed_columns:
        expected *= len(set(column))

    if len(objects) != expected:
        return False

    # A matching count is not sufficient: if two objects share a combination,
    # some other combination must be absent and stacking would later fail.
    combinations = set(zip(*hashed_columns))
    return len(combinations) == expected
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _split_by_key(
    objects: list[ObjectInfo],
    key: str,
) -> list[list[ObjectInfo]]:
    """Partition *objects* by the value their ``merged_meta`` holds for *key*.

    Objects lacking *key* are grouped together under ``None``.  Sub-groups
    are returned in the order their value was first encountered.
    """
    partitions: dict[Any, list[ObjectInfo]] = {}
    for member in objects:
        marker = _make_hashable(member.merged_meta.get(key))
        partitions.setdefault(marker, []).append(member)
    return list(partitions.values())
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ---------------------------------------------------------------------------
|
|
310
|
+
# Dataset construction from a compatible group
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _build_dataset_from_group(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset | None:
    """Build a Dataset from a group of structurally compatible objects.

    Dispatch order:

    1. An empty group yields ``None``.
    2. A single object yields a simple one-variable Dataset.
    3. If *variable_key* is given and the objects resolve to more than one
       variable name, the group is split per variable.
    4. If metadata varies and forms a complete hypercube, the objects are
       stacked along new outer dimensions.
    5. Otherwise (no varying metadata, or an incomplete hypercube) each
       object becomes its own variable.

    Parameters
    ----------
    group
        Objects sharing the same shape and dtype.
    file_path, coord_vars, dim_names, variable_key, lock, range_threshold,
    verify_hash, storage_options, shared_file
        Forwarded unchanged to the selected builder.
    backend_arrays
        Optional list that every builder appends its newly created backend
        arrays to, so the caller can later detach them from *shared_file*.

    Returns
    -------
    xr.Dataset or None
        The Dataset for this group, or ``None`` for an empty group.
    """
    if not group:
        return None

    # Keyword options shared by every builder.  Keeping them in one place
    # guarantees that ``backend_arrays`` reaches all of them, including the
    # incomplete-hypercube fallback, which previously did not receive it.
    builder_options: dict[str, Any] = {
        "range_threshold": range_threshold,
        "verify_hash": verify_hash,
        "storage_options": storage_options,
        "shared_file": shared_file,
        "backend_arrays": backend_arrays,
    }

    # Single object -> simple Dataset.
    if len(group) == 1:
        return _single_object_dataset(
            group[0],
            file_path,
            coord_vars,
            dim_names,
            variable_key,
            lock,
            **builder_options,
        )

    # Multiple objects -> inspect which metadata keys vary.
    constant, varying = _partition_keys(_extract_meta_keys(group))

    # If variable_key is specified, split by it first (each unique value
    # becomes a separate variable in the Dataset).  Use resolved variable
    # names rather than membership in ``varying`` so dotted keys such as
    # "mars.param" are handled correctly.
    if variable_key is not None:
        variable_names = {
            resolve_variable_name(obj.obj_index, obj.per_object_meta, variable_key)
            for obj in group
        }
        if len(variable_names) > 1:
            return _build_multi_variable_dataset(
                group,
                file_path,
                coord_vars,
                dim_names,
                variable_key,
                constant,
                varying,
                lock,
                **builder_options,
            )

    # ``_try_hypercube`` reports True for empty ``varying``, hence the
    # explicit guard: identical metadata cannot define outer dimensions.
    if varying and _try_hypercube(group, varying):
        return _hypercube_dataset(
            group,
            file_path,
            coord_vars,
            dim_names,
            variable_key,
            constant,
            varying,
            lock,
            **builder_options,
        )

    # Metadata identical or hypercube incomplete -> separate variables.
    return _flat_group_dataset(
        group,
        file_path,
        coord_vars,
        dim_names,
        variable_key,
        constant,
        lock,
        **builder_options,
    )
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _single_object_dataset(
    obj: ObjectInfo,
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Wrap one object as a single-variable Dataset.

    The variable is named from the object's ``merged_meta`` and carries that
    metadata as attributes; the Dataset attributes come from the object's
    ``common_meta``.  The backend array is appended to *backend_arrays* when
    a list is supplied.
    """
    tensor_shape = obj.shape
    tensor_dtype = _to_numpy_dtype(obj.dtype)

    name = resolve_variable_name(obj.obj_index, obj.merged_meta, variable_key)
    dims = _resolve_dims(tensor_shape, dim_names, coord_vars)

    backend = TensogramBackendArray(
        file_path=file_path,
        msg_index=obj.msg_index,
        obj_index=obj.obj_index,
        shape=tensor_shape,
        dtype=tensor_dtype,
        supports_range=_supports_range_decode(obj.descriptor),
        verify_hash=verify_hash,
        range_threshold=range_threshold,
        lock=lock,
        storage_options=storage_options,
        shared_file=shared_file,
    )
    if backend_arrays is not None:
        backend_arrays.append(backend)

    variable = xr.Variable(
        dims,
        indexing.LazilyIndexedArray(backend),
        dict(obj.merged_meta),
    )
    return xr.Dataset(
        {name: variable},
        coords=coord_vars,
        attrs=dict(obj.common_meta),
    )
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def _flat_group_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    constant: dict[str, Any],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Build a Dataset with one variable per object (no stacking).

    Each object is named via ``resolve_variable_name`` on its
    ``per_object_meta`` and carries its ``merged_meta`` as attributes.  When
    two objects resolve to the same name, the later one is stored under
    ``"<name>_<n>"`` (smallest free ``n >= 1``) and a warning is logged,
    instead of silently replacing the earlier variable.

    Parameters
    ----------
    group
        Objects to expose; each becomes its own variable.
    constant
        Metadata shared by the whole group, used as Dataset attributes.
    backend_arrays
        Optional list that receives every backend array created here.
    file_path, coord_vars, dim_names, variable_key, lock, range_threshold,
    verify_hash, storage_options, shared_file
        Used to name the variables, resolve their dimensions and configure
        the backend arrays.

    Returns
    -------
    xr.Dataset
        Dataset holding one variable per object in *group*.
    """
    data_vars: dict[str, xr.Variable] = {}

    for obj in group:
        np_dtype = _to_numpy_dtype(obj.dtype)
        shape = obj.shape
        var_name = resolve_variable_name(obj.obj_index, obj.per_object_meta, variable_key)
        dims = _resolve_dims(shape, dim_names, coord_vars)

        if var_name in data_vars:
            # Assigning to the same key again would drop the earlier object.
            clashing_name = var_name
            serial = 1
            while f"{clashing_name}_{serial}" in data_vars:
                serial += 1
            var_name = f"{clashing_name}_{serial}"
            logger.warning(
                "variable name %r already used in this group; exposing object "
                "(msg_index=%s, obj_index=%s) as %r",
                clashing_name,
                obj.msg_index,
                obj.obj_index,
                var_name,
            )

        backend_array = TensogramBackendArray(
            file_path=file_path,
            msg_index=obj.msg_index,
            obj_index=obj.obj_index,
            shape=shape,
            dtype=np_dtype,
            supports_range=_supports_range_decode(obj.descriptor),
            verify_hash=verify_hash,
            range_threshold=range_threshold,
            lock=lock,
            storage_options=storage_options,
            shared_file=shared_file,
        )
        if backend_arrays is not None:
            backend_arrays.append(backend_array)
        lazy_data = indexing.LazilyIndexedArray(backend_array)
        data_vars[var_name] = xr.Variable(dims, lazy_data, dict(obj.merged_meta))

    return xr.Dataset(data_vars, coords=coord_vars, attrs=dict(constant))
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _hypercube_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    constant: dict[str, Any],
    varying: dict[str, list[Any]],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Stack a group into one variable with outer dims from varying keys.

    The shape and dtype of the first object are used for every member of
    *group*.  The sorted keys of *varying* become outer dimensions, each
    with a coordinate made of that key's unique values in first-seen order.
    The variable is named from the first object's ``merged_meta``; both the
    variable and the Dataset carry *constant* as attributes.

    Raises
    ------
    ValueError
        If some combination of outer coordinate values has no object.
    """
    first = group[0]
    inner_shape = first.shape
    np_dtype = _to_numpy_dtype(first.dtype)
    inner_dims = _resolve_dims(inner_shape, dim_names, coord_vars)

    # Outer axes: one per varying key, in sorted key order.
    outer_keys = sorted(varying.keys())
    outer_coords: dict[str, list] = {
        key: _unique_values(varying[key]) for key in outer_keys
    }

    # (value for key0, value for key1, ...) -> object.  If several objects
    # share a combination, the last one wins.
    obj_by_coord: dict[tuple, ObjectInfo] = {
        tuple(_make_hashable(varying[key][row]) for key in outer_keys): member
        for row, member in enumerate(group)
    }

    outer_shape = tuple(len(outer_coords[key]) for key in outer_keys)
    full_dims = tuple(outer_keys) + inner_dims

    # One lazy backing array per outer-grid position, in row-major order.
    # No payload data is decoded here.
    backing_arrays: list[TensogramBackendArray] = []
    for idx_tuple in itertools.product(*map(range, outer_shape)):
        coord_key = tuple(
            _make_hashable(outer_coords[key][position])
            for key, position in zip(outer_keys, idx_tuple)
        )
        member = obj_by_coord.get(coord_key)
        if member is None:
            msg = (
                f"hypercube has a missing entry at {dict(zip(outer_keys, idx_tuple))} "
                f"in {file_path}"
            )
            raise ValueError(msg)
        backing_arrays.append(
            TensogramBackendArray(
                file_path=file_path,
                msg_index=member.msg_index,
                obj_index=member.obj_index,
                shape=inner_shape,
                dtype=np_dtype,
                supports_range=_supports_range_decode(member.descriptor),
                range_threshold=range_threshold,
                verify_hash=verify_hash,
                lock=lock,
                storage_options=storage_options,
                shared_file=shared_file,
            )
        )

    # Register only after the whole grid was built successfully.
    if backend_arrays is not None:
        backend_arrays.extend(backing_arrays)
    stacked = StackedBackendArray(backing_arrays, outer_shape, inner_shape, np_dtype)
    lazy_data = indexing.LazilyIndexedArray(stacked)

    var_name = resolve_variable_name(first.obj_index, first.merged_meta, variable_key)

    # Shared coordinates plus one coordinate per outer dimension.
    merged_coords = dict(coord_vars)
    merged_coords.update(
        (key, xr.Variable((key,), outer_coords[key])) for key in outer_keys
    )

    stacked_var = xr.Variable(full_dims, lazy_data, dict(constant))
    return xr.Dataset({var_name: stacked_var}, coords=merged_coords, attrs=dict(constant))
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _build_multi_variable_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str,
    constant: dict[str, Any],
    varying: dict[str, list[Any]],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Split *group* per resolved variable name and build one variable each.

    For every variable name:

    * a single object becomes a plain variable with no outer dimensions;
    * several objects whose own varying metadata forms a complete hypercube
      are stacked, with the varying keys as outer dimensions;
    * otherwise only the first object is used and a warning is logged.

    The shape and dtype of the first object in *group* are used for every
    variable.  Outer coordinates are written into one shared mapping, so a
    later variable's coordinate for a given key replaces an earlier one.

    Raises
    ------
    ValueError
        If a stacked variable lacks an object for some coordinate combination.
    """
    # Bucket the objects by the variable name they resolve to.
    members_by_name: dict[str, list[ObjectInfo]] = {}
    for member in group:
        resolved = resolve_variable_name(member.obj_index, member.per_object_meta, variable_key)
        members_by_name.setdefault(resolved, []).append(member)

    # Is anything varying besides the variable key itself?
    has_other_varying = any(key != variable_key for key in varying)

    data_vars: dict[str, xr.Variable] = {}
    merged_coords = dict(coord_vars)
    inner_shape = group[0].shape
    np_dtype = _to_numpy_dtype(group[0].dtype)
    inner_dims = _resolve_dims(inner_shape, dim_names, coord_vars)

    def new_backend(source: ObjectInfo) -> TensogramBackendArray:
        # Lazy backend array for one object; not registered anywhere yet.
        return TensogramBackendArray(
            file_path=file_path,
            msg_index=source.msg_index,
            obj_index=source.obj_index,
            shape=inner_shape,
            dtype=np_dtype,
            supports_range=_supports_range_decode(source.descriptor),
            verify_hash=verify_hash,
            range_threshold=range_threshold,
            lock=lock,
            storage_options=storage_options,
            shared_file=shared_file,
        )

    def unstacked_variable(source: ObjectInfo) -> xr.Variable:
        # Variable backed by exactly one object, without outer dimensions.
        backend = new_backend(source)
        if backend_arrays is not None:
            backend_arrays.append(backend)
        return xr.Variable(
            inner_dims,
            indexing.LazilyIndexedArray(backend),
            dict(source.merged_meta),
        )

    for var_name, members in members_by_name.items():
        if len(members) == 1:
            # Single object for this variable -> no outer dims.
            data_vars[var_name] = unstacked_variable(members[0])
            continue

        if not has_other_varying:
            # Several objects but nothing to tell them apart -> use the first.
            logger.warning(
                "variable %r: %d duplicate objects with no distinguishing "
                "metadata, using only the first (dropping %d)",
                var_name,
                len(members),
                len(members) - 1,
            )
            data_vars[var_name] = unstacked_variable(members[0])
            continue

        # Re-derive constant/varying keys for just this variable's objects.
        sub_const, sub_vary = _partition_keys(_extract_meta_keys(members))

        if not sub_vary or not _try_hypercube(members, sub_vary):
            # Can't form hypercube -> use first object only.
            logger.warning(
                "variable %r: %d objects cannot form a hypercube, "
                "using only the first object (dropping %d)",
                var_name,
                len(members),
                len(members) - 1,
            )
            data_vars[var_name] = unstacked_variable(members[0])
            continue

        # Build the stacked variable.
        outer_keys = sorted(sub_vary.keys())
        axis_values: dict[str, list] = {
            key: _unique_values(sub_vary[key]) for key in outer_keys
        }
        outer_shape = tuple(len(axis_values[key]) for key in outer_keys)
        full_dims = tuple(outer_keys) + inner_dims

        # Coordinate combination -> object (last one wins on repeats).
        member_at: dict[tuple, ObjectInfo] = {
            tuple(_make_hashable(sub_vary[key][row]) for key in outer_keys): candidate
            for row, candidate in enumerate(members)
        }

        # Lazy backing arrays in row-major grid order (no payload decode here).
        backing: list[TensogramBackendArray] = []
        for idx_tuple in itertools.product(*map(range, outer_shape)):
            coord_key = tuple(
                _make_hashable(axis_values[key][position])
                for key, position in zip(outer_keys, idx_tuple)
            )
            found = member_at.get(coord_key)
            if found is None:
                msg = (
                    f"hypercube has a missing entry at "
                    f"{dict(zip(outer_keys, idx_tuple))} in {file_path}"
                )
                raise ValueError(msg)
            backing.append(new_backend(found))

        # Register only after the whole grid was built successfully.
        if backend_arrays is not None:
            backend_arrays.extend(backing)
        stacked = StackedBackendArray(backing, outer_shape, inner_shape, np_dtype)
        lazy_data = indexing.LazilyIndexedArray(stacked)

        for key in outer_keys:
            merged_coords[key] = xr.Variable((key,), axis_values[key])
        data_vars[var_name] = xr.Variable(full_dims, lazy_data, dict(sub_const))

    return xr.Dataset(data_vars, coords=merged_coords, attrs=dict(constant))
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
# ---------------------------------------------------------------------------
|
|
794
|
+
# Helpers
|
|
795
|
+
# ---------------------------------------------------------------------------
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def _resolve_dims(
    shape: tuple[int, ...],
    dim_names: Sequence[str] | None,
    coord_vars: dict[str, xr.Variable],
) -> tuple[str, ...]:
    """Choose a dimension name for every axis of *shape*.

    Same strategy as ``TensogramDataStore._resolve_dims_for_var``.

    When *dim_names* is given, naming is delegated to ``resolve_dim_names``.
    Otherwise each axis takes the first not-yet-used coordinate whose first
    dimension has the same length as the axis, and falls back to
    ``dim_<axis position>`` when no such coordinate is left.
    """
    rank = len(shape)

    if dim_names is not None:
        return tuple(resolve_dim_names(rank, dim_names))

    # Coordinate names indexed by their length, in coord_vars order.
    names_by_length: dict[int, list[str]] = {}
    for coord_name, coord in coord_vars.items():
        names_by_length.setdefault(coord.shape[0], []).append(coord_name)

    taken: set[str] = set()
    resolved: list[str] = []
    for position, extent in enumerate(shape):
        for candidate in names_by_length.get(extent, ()):
            if candidate not in taken:
                # Each coordinate may name at most one axis.
                taken.add(candidate)
                resolved.append(candidate)
                break
        else:
            resolved.append(f"dim_{position}")

    return tuple(resolved)
|