tensogram-xarray 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,832 @@
1
+ # (C) Copyright 2026- ECMWF and individual contributors.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation nor
7
+ # does it submit to any jurisdiction.
8
+
9
+ """Auto-merge and auto-split for multi-message tensogram files.
10
+
11
+ ``open_datasets()`` scans all messages in a ``.tgm`` file, groups compatible
12
+ data objects (same shape, dtype, metadata structure) into hypercubes, and
13
+ returns a list of :class:`xr.Dataset` instances. Incompatible objects are
14
+ automatically split into separate Datasets.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import itertools
20
+ import logging
21
+ import threading
22
+ from collections import defaultdict
23
+ from collections.abc import Sequence
24
+ from typing import Any
25
+
26
+ import xarray as xr
27
+ from xarray.core import indexing
28
+
29
+ from tensogram_xarray.array import (
30
+ StackedBackendArray,
31
+ TensogramBackendArray,
32
+ _supports_range_decode,
33
+ )
34
+ from tensogram_xarray.coords import detect_coords
35
+ from tensogram_xarray.mapping import resolve_dim_names, resolve_variable_name
36
+ from tensogram_xarray.scanner import ObjectInfo, scan_file
37
+ from tensogram_xarray.store import _to_numpy_dtype
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
def open_datasets(
    path: str,
    *,
    dim_names: Sequence[str] | None = None,
    variable_key: str | None = None,
    verify_hash: bool = False,
    range_threshold: float = 0.5,
    storage_options: dict[str, Any] | None = None,
) -> list[xr.Dataset]:
    """Open a ``.tgm`` file, auto-grouping into compatible Datasets.

    Each returned Dataset represents a group of data objects that share
    compatible shapes and metadata structure.  Objects whose metadata varies
    on certain keys are stacked along new outer dimensions.

    Parameters
    ----------
    path
        Path or remote URL (S3, GCS, Azure, HTTP) to the ``.tgm`` file.
    dim_names
        Explicit dimension names for the innermost tensor axes.
    variable_key
        Dotted metadata key path for variable naming.
    verify_hash
        Whether to verify hashes on decode.
    range_threshold
        Maximum fraction of total array elements for which partial
        ``decode_range()`` is used.  Default ``0.5``.
    storage_options
        Key-value pairs forwarded to the object store backend for
        remote URLs.  Ignored for local files.

    Returns
    -------
    list[xr.Dataset]
        One Dataset per compatible group.  An empty list when the file holds
        no objects; a single coordinates-only Dataset when every object was
        classified as a coordinate.

    Raises
    ------
    ValueError
        If two coordinate objects map to the same dimension name but have
        different shapes.
    """
    file_index = scan_file(path, storage_options=storage_options)

    if not file_index.objects:
        return []

    import tensogram

    # For remote URLs a single handle is opened once and handed to every
    # backend array created below.
    shared_file = None
    if tensogram.is_remote_url(path):
        shared_file = tensogram.TensogramFile.open_remote(path, storage_options or {})

    all_metas = [o.merged_meta for o in file_index.objects]
    coord_indices, var_indices, coord_dim_map = detect_coords(all_metas)

    lock = threading.Lock()
    # Every backend array created for this call is tracked here so the close
    # hook can detach the shared handle from all of them.
    all_backend_arrays: list[TensogramBackendArray] = []
    coord_vars: dict[str, xr.Variable] = {}
    for ci in coord_indices:
        obj = file_index.objects[ci]
        dim_name = coord_dim_map[ci]
        shape = obj.shape

        # Resolve duplicates *before* building a backend array, so a skipped
        # duplicate does not leave an unused array registered.
        existing = coord_vars.get(dim_name)
        if existing is not None:
            if existing.shape != shape:
                msg = (
                    f"coordinate {dim_name!r} has conflicting shapes: "
                    f"existing {existing.shape} vs new {shape} "
                    f"(msg_index={obj.msg_index}, obj_index={obj.obj_index})"
                )
                raise ValueError(msg)
            # Duplicate with matching shape -- skip (keep the first).
            continue

        backend_array = TensogramBackendArray(
            file_path=path,
            msg_index=obj.msg_index,
            obj_index=obj.obj_index,
            shape=shape,
            dtype=_to_numpy_dtype(obj.dtype),
            supports_range=_supports_range_decode(obj.descriptor),
            verify_hash=verify_hash,
            range_threshold=range_threshold,
            lock=lock,
            storage_options=storage_options,
            shared_file=shared_file,
        )
        all_backend_arrays.append(backend_array)
        lazy_data = indexing.LazilyIndexedArray(backend_array)
        coord_vars[dim_name] = xr.Variable((dim_name,), lazy_data, dict(obj.per_object_meta))

    # Group data objects by structural compatibility.
    data_objects = [file_index.objects[i] for i in var_indices]
    groups = _group_by_structure(data_objects)

    datasets: list[xr.Dataset] = []
    for group in groups:
        ds = _build_dataset_from_group(
            group,
            file_path=path,
            coord_vars=coord_vars,
            dim_names=dim_names,
            variable_key=variable_key,
            lock=lock,
            range_threshold=range_threshold,
            verify_hash=verify_hash,
            storage_options=storage_options,
            shared_file=shared_file,
            backend_arrays=all_backend_arrays,
        )
        if ds is not None:
            datasets.append(ds)

    if not datasets:
        # Only coordinate objects were found: expose them on their own.
        datasets = [xr.Dataset(coords=coord_vars, attrs={"source": path})]

    if shared_file is not None:

        def _close_shared():
            # Drop every reference to the shared remote handle.  The same
            # callback is installed on all returned Datasets, so closing any
            # one of them detaches the handle from the arrays of all.
            nonlocal shared_file
            for arr in all_backend_arrays:
                arr._shared_file = None
            all_backend_arrays.clear()
            shared_file = None

        for ds in datasets:
            ds.set_close(_close_shared)

    return datasets
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # Grouping
175
+ # ---------------------------------------------------------------------------
176
+
177
+ _StructureKey = tuple[tuple[int, ...], str] # (shape, dtype)
178
+
179
+
180
+ def _group_by_structure(
181
+ objects: list[ObjectInfo],
182
+ ) -> list[list[ObjectInfo]]:
183
+ """Group objects by (shape, dtype) -- structural compatibility."""
184
+ buckets: dict[_StructureKey, list[ObjectInfo]] = defaultdict(list)
185
+ for obj in objects:
186
+ key: _StructureKey = (obj.shape, obj.dtype)
187
+ buckets[key].append(obj)
188
+ return list(buckets.values())
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Hypercube construction
193
+ # ---------------------------------------------------------------------------
194
+
195
+
196
+ def _extract_meta_keys(objects: list[ObjectInfo]) -> dict[str, list[Any]]:
197
+ """For each metadata key, collect values across all objects."""
198
+ all_keys: set[str] = set()
199
+ for obj in objects:
200
+ all_keys.update(obj.merged_meta.keys())
201
+
202
+ key_values: dict[str, list[Any]] = {}
203
+ for k in sorted(all_keys):
204
+ values = []
205
+ for obj in objects:
206
+ values.append(obj.merged_meta.get(k))
207
+ key_values[k] = values
208
+ return key_values
209
+
210
+
211
def _partition_keys(
    key_values: dict[str, list[Any]],
) -> tuple[dict[str, Any], dict[str, list[Any]]]:
    """Separate metadata keys into constants and candidate dimensions.

    Returns
    -------
    constant
        Keys whose values collapse to one distinct value, mapped to the
        first value -> Dataset attributes.  Keys whose values cannot be made
        hashable also land here, represented by their first value.
    varying
        All other keys, mapped to their full per-object value list
        -> candidate outer dimensions.
    """
    constant: dict[str, Any] = {}
    varying: dict[str, list[Any]] = {}

    for name, column in key_values.items():
        try:
            # Hashable stand-ins let a set count the distinct values.
            distinct = {_make_hashable(item) for item in column}
        except TypeError:
            # Still unhashable after conversion -> treat as attribute.
            constant[name] = column[0]
            continue

        if len(distinct) != 1:
            varying[name] = column
        else:
            constant[name] = column[0]

    return constant, varying
241
+
242
+
243
+ def _make_hashable(val: Any) -> Any:
244
+ """Convert a value to a hashable form for set operations."""
245
+ if isinstance(val, dict):
246
+ return tuple(sorted((k, _make_hashable(v)) for k, v in val.items()))
247
+ if isinstance(val, list):
248
+ return tuple(_make_hashable(v) for v in val)
249
+ return val
250
+
251
+
252
+ def _unique_values(values: list[Any]) -> list[Any]:
253
+ """Return unique values preserving order, handling unhashable types."""
254
+ seen: set[Any] = set()
255
+ result: list[Any] = []
256
+ for v in values:
257
+ h = _make_hashable(v)
258
+ if h not in seen:
259
+ seen.add(h)
260
+ result.append(v)
261
+ return result
262
+
263
+
264
+ def _try_hypercube(
265
+ objects: list[ObjectInfo],
266
+ varying: dict[str, list[Any]],
267
+ ) -> bool:
268
+ """Check whether the varying keys form a complete hypercube.
269
+
270
+ A complete hypercube means every combination of unique values across
271
+ all varying keys has exactly one corresponding object.
272
+ """
273
+ if not varying:
274
+ return True
275
+
276
+ # Unique values per key (use _make_hashable for unhashable types like dicts).
277
+ unique_per_key: dict[str, list[Any]] = {}
278
+ for k, v in varying.items():
279
+ seen: set[Any] = set()
280
+ unique: list[Any] = []
281
+ for val in v:
282
+ h = _make_hashable(val)
283
+ if h not in seen:
284
+ seen.add(h)
285
+ unique.append(val)
286
+ unique_per_key[k] = unique
287
+
288
+ # Total expected combinations.
289
+ expected = 1
290
+ for vals in unique_per_key.values():
291
+ expected *= len(vals)
292
+
293
+ return len(objects) == expected
294
+
295
+
296
def _split_by_key(
    objects: list[ObjectInfo],
    key: str,
) -> list[list[ObjectInfo]]:
    """Partition *objects* by their ``merged_meta`` value for *key*.

    Objects missing the key are grouped under ``None``.  Sub-groups come back
    in first-seen order of their value, with input order kept inside each.
    """
    partitions: dict[Any, list[ObjectInfo]] = {}
    for member in objects:
        # Values may be dicts/lists, so bucket on their hashable stand-in.
        bucket_id = _make_hashable(member.merged_meta.get(key))
        partitions.setdefault(bucket_id, []).append(member)
    return [*partitions.values()]
307
+
308
+
309
+ # ---------------------------------------------------------------------------
310
+ # Dataset construction from a compatible group
311
+ # ---------------------------------------------------------------------------
312
+
313
+
314
def _build_dataset_from_group(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset | None:
    """Build a Dataset from a group of structurally compatible objects.

    Dispatch order:

    1. an empty group yields ``None``;
    2. a single object yields a simple one-variable Dataset;
    3. when *variable_key* resolves to more than one variable name, the
       group is split into one variable per name;
    4. when the varying metadata keys form a complete hypercube, the objects
       are stacked along those keys;
    5. otherwise (nothing varies, or the hypercube is incomplete) each
       object becomes its own variable.

    Parameters
    ----------
    group
        Objects sharing shape and dtype.
    file_path, coord_vars, dim_names, variable_key, lock
        Passed through to the selected builder.
    range_threshold, verify_hash, storage_options, shared_file
        Decode options passed through to the selected builder.
    backend_arrays
        Optional list the builder appends every created backend array to.

    Returns
    -------
    xr.Dataset | None
        The assembled Dataset, or ``None`` for an empty group.
    """
    if not group:
        return None

    # One shared keyword set so every builder receives the same options --
    # in particular ``backend_arrays``, which the incomplete-hypercube
    # fallback previously did not receive.
    passthrough: dict[str, Any] = {
        "range_threshold": range_threshold,
        "verify_hash": verify_hash,
        "storage_options": storage_options,
        "shared_file": shared_file,
        "backend_arrays": backend_arrays,
    }

    # Single object -> simple Dataset.
    if len(group) == 1:
        return _single_object_dataset(
            group[0],
            file_path,
            coord_vars,
            dim_names,
            variable_key,
            lock,
            **passthrough,
        )

    # Multiple objects -> try hypercube merge.
    key_values = _extract_meta_keys(group)
    constant, varying = _partition_keys(key_values)

    # If variable_key is specified, split by it first (each unique value
    # becomes a separate variable in the Dataset).  Use resolved variable
    # names rather than membership in ``varying`` so dotted keys such as
    # "mars.param" are handled correctly.
    if variable_key is not None:
        variable_names = {
            resolve_variable_name(obj.obj_index, obj.per_object_meta, variable_key)
            for obj in group
        }
        if len(variable_names) > 1:
            return _build_multi_variable_dataset(
                group,
                file_path,
                coord_vars,
                dim_names,
                variable_key,
                constant,
                varying,
                lock,
                **passthrough,
            )

    if varying and _try_hypercube(group, varying):
        return _hypercube_dataset(
            group,
            file_path,
            coord_vars,
            dim_names,
            variable_key,
            constant,
            varying,
            lock,
            **passthrough,
        )

    # Either all metadata is identical (objects cannot be distinguished) or
    # the hypercube is incomplete -> return the objects as separate variables.
    return _flat_group_dataset(
        group,
        file_path,
        coord_vars,
        dim_names,
        variable_key,
        constant,
        lock,
        **passthrough,
    )
433
+
434
+
435
def _single_object_dataset(
    obj: ObjectInfo,
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Wrap one data object as a single-variable Dataset.

    The variable is backed lazily by a ``TensogramBackendArray``; its
    attributes are a copy of the object's ``merged_meta`` and the Dataset
    attributes are a copy of its ``common_meta``.  When *backend_arrays* is
    given, the created backend array is appended to it.
    """
    element_dtype = _to_numpy_dtype(obj.dtype)
    extent = obj.shape

    name = resolve_variable_name(obj.obj_index, obj.merged_meta, variable_key)
    axis_names = _resolve_dims(extent, dim_names, coord_vars)

    source = TensogramBackendArray(
        file_path=file_path,
        msg_index=obj.msg_index,
        obj_index=obj.obj_index,
        shape=extent,
        dtype=element_dtype,
        supports_range=_supports_range_decode(obj.descriptor),
        verify_hash=verify_hash,
        range_threshold=range_threshold,
        lock=lock,
        storage_options=storage_options,
        shared_file=shared_file,
    )
    if backend_arrays is not None:
        backend_arrays.append(source)

    variable = xr.Variable(
        axis_names,
        indexing.LazilyIndexedArray(source),
        dict(obj.merged_meta),
    )
    return xr.Dataset({name: variable}, coords=coord_vars, attrs=dict(obj.common_meta))
477
+
478
+
479
def _flat_group_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    constant: dict[str, Any],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Build a Dataset with one variable per object (no stacking).

    Each object gets its own lazily backed variable, named by
    ``resolve_variable_name`` and carrying a copy of the object's
    ``merged_meta`` as attributes.  Dataset attributes are a copy of
    *constant*.

    When two objects resolve to the same variable name the later object
    replaces the earlier one.  That outcome is kept, but it is now reported
    through a single warning instead of happening silently.

    Parameters
    ----------
    group
        Objects to expose, one variable each.
    constant
        Metadata shared by the whole group, used as Dataset attributes.
    backend_arrays
        Optional list every created backend array is appended to.
    """
    data_vars: dict[str, xr.Variable] = {}
    replaced: list[str] = []

    for obj in group:
        np_dtype = _to_numpy_dtype(obj.dtype)
        shape = obj.shape
        var_name = resolve_variable_name(obj.obj_index, obj.per_object_meta, variable_key)
        dims = _resolve_dims(shape, dim_names, coord_vars)

        backend_array = TensogramBackendArray(
            file_path=file_path,
            msg_index=obj.msg_index,
            obj_index=obj.obj_index,
            shape=shape,
            dtype=np_dtype,
            supports_range=_supports_range_decode(obj.descriptor),
            verify_hash=verify_hash,
            range_threshold=range_threshold,
            lock=lock,
            storage_options=storage_options,
            shared_file=shared_file,
        )
        if backend_arrays is not None:
            backend_arrays.append(backend_array)
        lazy_data = indexing.LazilyIndexedArray(backend_array)

        if var_name in data_vars:
            replaced.append(var_name)
        data_vars[var_name] = xr.Variable(dims, lazy_data, dict(obj.merged_meta))

    if replaced:
        # Surface the otherwise invisible loss of earlier objects.
        logger.warning(
            "%d object(s) in %s resolved to an already-used variable name and "
            "replaced the earlier variable: %s",
            len(replaced),
            file_path,
            list(dict.fromkeys(replaced)),
        )

    return xr.Dataset(data_vars, coords=coord_vars, attrs=dict(constant))
524
+
525
+
526
def _hypercube_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str | None,
    constant: dict[str, Any],
    varying: dict[str, list[Any]],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Stack a group into one variable with an outer axis per varying key.

    The inner shape and dtype are taken from the first object of *group*.
    Varying keys, in sorted order, become the leading dimensions; each one
    gets a coordinate holding its unique values in first-seen order.  The
    variable is named from the first object and carries *constant* as
    attributes, as does the Dataset.

    Raises
    ------
    ValueError
        If some cell of the outer grid has no matching object.
    """
    first = group[0]
    inner_shape = first.shape
    np_dtype = _to_numpy_dtype(first.dtype)
    inner_dims = _resolve_dims(inner_shape, dim_names, coord_vars)

    # Outer axes: one per varying key, labelled by that key's unique values.
    outer_keys = sorted(varying)
    axis_values: dict[str, list] = {key: _unique_values(varying[key]) for key in outer_keys}

    # Map each object's tuple of (hashable) key values to the object itself.
    by_cell: dict[tuple, ObjectInfo] = {
        tuple(_make_hashable(varying[key][pos]) for key in outer_keys): member
        for pos, member in enumerate(group)
    }

    outer_shape = tuple(len(axis_values[key]) for key in outer_keys)
    full_dims = tuple(outer_keys) + inner_dims

    # Walk the outer grid in row-major order, creating one lazy backing
    # array per cell.  No payload data is decoded here.
    cells: list[TensogramBackendArray] = []
    for idx_tuple in itertools.product(*map(range, outer_shape)):
        cell_id = tuple(
            _make_hashable(axis_values[key][pos]) for key, pos in zip(outer_keys, idx_tuple)
        )
        obj = by_cell.get(cell_id)
        if obj is None:
            raise ValueError(
                f"hypercube has a missing entry at {dict(zip(outer_keys, idx_tuple))} "
                f"in {file_path}"
            )
        cells.append(
            TensogramBackendArray(
                file_path=file_path,
                msg_index=obj.msg_index,
                obj_index=obj.obj_index,
                shape=inner_shape,
                dtype=np_dtype,
                supports_range=_supports_range_decode(obj.descriptor),
                range_threshold=range_threshold,
                verify_hash=verify_hash,
                lock=lock,
                storage_options=storage_options,
                shared_file=shared_file,
            )
        )

    # Register only once the whole grid was built successfully.
    if backend_arrays is not None:
        backend_arrays.extend(cells)
    lazy_data = indexing.LazilyIndexedArray(
        StackedBackendArray(cells, outer_shape, inner_shape, np_dtype)
    )

    var_name = resolve_variable_name(first.obj_index, first.merged_meta, variable_key)

    # Existing coordinates plus one coordinate per outer axis.
    merged_coords = dict(coord_vars)
    merged_coords.update((key, xr.Variable((key,), axis_values[key])) for key in outer_keys)

    stacked_var = xr.Variable(full_dims, lazy_data, dict(constant))
    return xr.Dataset({var_name: stacked_var}, coords=merged_coords, attrs=dict(constant))
615
+
616
+
617
def _build_multi_variable_dataset(
    group: list[ObjectInfo],
    file_path: str,
    coord_vars: dict[str, xr.Variable],
    dim_names: Sequence[str] | None,
    variable_key: str,
    constant: dict[str, Any],
    varying: dict[str, list[Any]],
    lock: threading.Lock,
    range_threshold: float = 0.5,
    verify_hash: bool = False,
    storage_options: dict[str, Any] | None = None,
    *,
    shared_file: Any = None,
    backend_arrays: list | None = None,
) -> xr.Dataset:
    """Build one Dataset variable per resolved variable name.

    Objects are bucketed by the name ``resolve_variable_name`` yields for
    *variable_key*.  For every bucket:

    * one object -> a plain variable on the inner dimensions;
    * several objects whose own varying keys form a complete hypercube
      -> one stacked variable with those keys as outer dimensions;
    * several objects otherwise -> only the first object is used and a
      warning is logged for the dropped ones.

    Inner shape and dtype are taken from the first object of *group*.
    Dataset attributes are a copy of *constant*.

    Raises
    ------
    ValueError
        If a stacked bucket has no object for some cell of its outer grid.
    """
    # Bucket objects by resolved variable name (first-seen order).
    sub_groups: dict[str, list[ObjectInfo]] = defaultdict(list)
    for member in group:
        resolved = resolve_variable_name(member.obj_index, member.per_object_meta, variable_key)
        sub_groups[resolved].append(member)

    # Group-level varying keys other than variable_key; only used to decide
    # whether stacking is attempted at all.
    remaining_varying = {k: v for k, v in varying.items() if k != variable_key}

    data_vars: dict[str, xr.Variable] = {}
    # NOTE(review): outer coordinates of all stacked variables are written
    # into this one dict, so for a shared key a later variable's values
    # replace an earlier variable's.
    merged_coords = dict(coord_vars)
    inner_shape = group[0].shape
    np_dtype = _to_numpy_dtype(group[0].dtype)
    inner_dims = _resolve_dims(inner_shape, dim_names, coord_vars)

    def _backend_for(source: ObjectInfo) -> TensogramBackendArray:
        # Lazy handle on one object's payload; nothing is decoded here.
        return TensogramBackendArray(
            file_path=file_path,
            msg_index=source.msg_index,
            obj_index=source.obj_index,
            shape=inner_shape,
            dtype=np_dtype,
            supports_range=_supports_range_decode(source.descriptor),
            verify_hash=verify_hash,
            range_threshold=range_threshold,
            lock=lock,
            storage_options=storage_options,
            shared_file=shared_file,
        )

    def _plain_variable(source: ObjectInfo) -> xr.Variable:
        # Un-stacked variable for a single object, registered for cleanup.
        handle = _backend_for(source)
        if backend_arrays is not None:
            backend_arrays.append(handle)
        return xr.Variable(
            inner_dims,
            indexing.LazilyIndexedArray(handle),
            dict(source.merged_meta),
        )

    for var_name, sub_group in sub_groups.items():
        if len(sub_group) == 1:
            # Single object for this variable -> no outer dims.
            data_vars[var_name] = _plain_variable(sub_group[0])
            continue

        if not remaining_varying:
            # No remaining varying keys -> use first object.
            if len(sub_group) > 1:
                logger.warning(
                    "variable %r: %d duplicate objects with no distinguishing "
                    "metadata, using only the first (dropping %d)",
                    var_name,
                    len(sub_group),
                    len(sub_group) - 1,
                )
            data_vars[var_name] = _plain_variable(sub_group[0])
            continue

        # Re-derive constant/varying keys for just this bucket.
        sub_const, sub_vary = _partition_keys(_extract_meta_keys(sub_group))

        if not (sub_vary and _try_hypercube(sub_group, sub_vary)):
            # Can't form hypercube -> use first object only.
            logger.warning(
                "variable %r: %d objects cannot form a hypercube, "
                "using only the first object (dropping %d)",
                var_name,
                len(sub_group),
                len(sub_group) - 1,
            )
            data_vars[var_name] = _plain_variable(sub_group[0])
            continue

        # Stacked variable: one outer axis per varying key of this bucket.
        outer_keys = sorted(sub_vary)
        axis_values: dict[str, list] = {key: _unique_values(sub_vary[key]) for key in outer_keys}
        outer_shape = tuple(len(axis_values[key]) for key in outer_keys)
        full_dims = tuple(outer_keys) + inner_dims

        by_cell: dict[tuple, ObjectInfo] = {
            tuple(_make_hashable(sub_vary[key][pos]) for key in outer_keys): candidate
            for pos, candidate in enumerate(sub_group)
        }

        # Row-major walk over the outer grid (no payload decode here).
        backing: list[TensogramBackendArray] = []
        for idx_tuple in itertools.product(*map(range, outer_shape)):
            cell_id = tuple(
                _make_hashable(axis_values[key][pos]) for key, pos in zip(outer_keys, idx_tuple)
            )
            obj = by_cell.get(cell_id)
            if obj is None:
                raise ValueError(
                    f"hypercube has a missing entry at "
                    f"{dict(zip(outer_keys, idx_tuple))} in {file_path}"
                )
            backing.append(_backend_for(obj))

        # Register only once the whole grid was built successfully.
        if backend_arrays is not None:
            backend_arrays.extend(backing)
        lazy_data = indexing.LazilyIndexedArray(
            StackedBackendArray(backing, outer_shape, inner_shape, np_dtype)
        )

        for key in outer_keys:
            merged_coords[key] = xr.Variable((key,), axis_values[key])
        data_vars[var_name] = xr.Variable(full_dims, lazy_data, dict(sub_const))

    return xr.Dataset(data_vars, coords=merged_coords, attrs=dict(constant))
791
+
792
+
793
+ # ---------------------------------------------------------------------------
794
+ # Helpers
795
+ # ---------------------------------------------------------------------------
796
+
797
+
798
+ def _resolve_dims(
799
+ shape: tuple[int, ...],
800
+ dim_names: Sequence[str] | None,
801
+ coord_vars: dict[str, xr.Variable],
802
+ ) -> tuple[str, ...]:
803
+ """Resolve dimension names for a tensor shape.
804
+
805
+ Same strategy as ``TensogramDataStore._resolve_dims_for_var``.
806
+ """
807
+ ndim = len(shape)
808
+
809
+ if dim_names is not None:
810
+ return tuple(resolve_dim_names(ndim, dim_names))
811
+
812
+ # Match by size against known coordinates.
813
+ size_to_coord: dict[int, list[str]] = {}
814
+ for cname, cvar in coord_vars.items():
815
+ csize = cvar.shape[0]
816
+ size_to_coord.setdefault(csize, []).append(cname)
817
+
818
+ dims: list[str] = []
819
+ used: set[str] = set()
820
+ for axis_size in shape:
821
+ matched = False
822
+ if axis_size in size_to_coord:
823
+ for cname in size_to_coord[axis_size]:
824
+ if cname not in used:
825
+ dims.append(cname)
826
+ used.add(cname)
827
+ matched = True
828
+ break
829
+ if not matched:
830
+ dims.append(f"dim_{len(dims)}")
831
+
832
+ return tuple(dims)