tensogram-xarray 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensogram_xarray/__init__.py +24 -0
- tensogram_xarray/array.py +408 -0
- tensogram_xarray/backend.py +139 -0
- tensogram_xarray/coords.py +113 -0
- tensogram_xarray/mapping.py +91 -0
- tensogram_xarray/merge.py +832 -0
- tensogram_xarray/scanner.py +196 -0
- tensogram_xarray/store.py +383 -0
- tensogram_xarray-0.14.0.dist-info/METADATA +23 -0
- tensogram_xarray-0.14.0.dist-info/RECORD +12 -0
- tensogram_xarray-0.14.0.dist-info/WHEEL +4 -0
- tensogram_xarray-0.14.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# (C) Copyright 2026- ECMWF and individual contributors.
|
|
2
|
+
#
|
|
3
|
+
# This software is licensed under the terms of the Apache Licence Version 2.0
|
|
4
|
+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
|
5
|
+
# In applying this licence, ECMWF does not waive the privileges and immunities
|
|
6
|
+
# granted to it by virtue of its status as an intergovernmental organisation nor
|
|
7
|
+
# does it submit to any jurisdiction.
|
|
8
|
+
|
|
9
|
+
"""File scanner: extract metadata from all messages/objects without decoding payloads.
|
|
10
|
+
|
|
11
|
+
The scanner opens a ``.tgm`` file via ``tensogram.TensogramFile``, reads each
|
|
12
|
+
message's metadata, and builds an index of per-object metadata dicts that
|
|
13
|
+
downstream merge/split logic can consume.
|
|
14
|
+
|
|
15
|
+
Per-object metadata is read from ``meta.base[i]`` (with ``_reserved_``
|
|
16
|
+
filtered out) and supplemented by ``desc.params`` as fallback.
|
|
17
|
+
Message-level metadata comes from ``meta.extra``.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
# The ``_reserved_`` key in base entries is populated by the encoder
# with tensor info (ndim, shape, strides, dtype).  It must be excluded
# from application-level metadata used for grouping and variable naming.
# Kept as a module-level constant so that every place that filters base
# entries compares against the same literal.
RESERVED_KEY = "_reserved_"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ObjectInfo:
    """Everything the scanner records about one data object of a message."""

    # Position of the owning message (``scan_message`` always stores 0).
    msg_index: int
    # Position of the object inside its message.
    obj_index: int
    # Rank and shape as reported by the descriptor.
    ndim: int
    shape: tuple[int, ...]
    # Dtype string exactly as reported by the descriptor.
    dtype: str
    # The descriptor object itself (tensogram.DataObjectDescriptor).
    descriptor: Any
    # Metadata from ``meta.base[i]`` (reserved key removed) plus descriptor params.
    per_object_meta: dict[str, Any]
    # Message-level metadata from ``meta.extra``.
    common_meta: dict[str, Any]

    @property
    def merged_meta(self) -> dict[str, Any]:
        """Return a new dict of message-level and per-object metadata.

        When both sources define the same key, the per-object value is kept.
        """
        return {**self.common_meta, **self.per_object_meta}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class FileIndex:
    """Flat index of every object found in a ``.tgm`` file."""

    # Path or URL the index was built from.
    file_path: str
    # One entry per scanned object, in the order they were appended.
    objects: list[ObjectInfo] = field(default_factory=list)

    @property
    def message_count(self) -> int:
        """Return the highest recorded message index plus one (0 when empty).

        The value is derived from the indexed objects only, so messages that
        contributed no objects after the last indexed one are not counted.
        """
        return max((obj.msg_index for obj in self.objects), default=-1) + 1
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _desc_params(desc: Any) -> dict[str, Any]:
    """Return a shallow copy of ``desc.params`` when it is a non-empty dict.

    Any other situation (attribute missing, empty, or not a dict) yields an
    empty dict.
    """
    candidate = getattr(desc, "params", None)
    if not candidate or not isinstance(candidate, dict):
        return {}
    return dict(candidate)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _base_entry_from_meta(meta: Any, obj_index: int) -> dict[str, Any]:
    """Return the application-level part of ``meta.base[obj_index]``.

    The ``_reserved_`` key is dropped because it carries encoder-populated
    tensor info (ndim, shape, strides, dtype).  An empty dict is returned when
    ``meta.base`` is missing, empty or not a list, when ``obj_index`` is past
    the end of the list, or when the selected entry is not a dict.
    """
    entries = getattr(meta, "base", None)
    if not entries or not isinstance(entries, list):
        return {}
    if obj_index >= len(entries):
        return {}

    candidate = entries[obj_index]
    if not isinstance(candidate, dict):
        return {}
    return {key: value for key, value in candidate.items() if key != RESERVED_KEY}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _merge_per_object_meta(meta: Any, obj_index: int, desc: Any) -> dict[str, Any]:
    """Combine the filtered base entry with the descriptor params.

    Keys already present in the base entry keep their value; descriptor
    params only contribute keys the base entry does not define.
    """
    combined = _base_entry_from_meta(meta, obj_index)
    for key, value in _desc_params(desc).items():
        # setdefault leaves an existing (base-entry) value untouched.
        combined.setdefault(key, value)
    return combined
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _extra_from_meta(meta: Any) -> dict[str, Any]:
    """Return a shallow copy of ``meta.extra`` (message-level metadata).

    An empty dict is returned when the attribute is missing, empty, or not
    a dict.
    """
    extra = getattr(meta, "extra", None)
    if not extra or not isinstance(extra, dict):
        return {}
    return dict(extra)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def scan_file(
    file_path: str,
    storage_options: dict[str, Any] | None = None,
) -> FileIndex:
    """Scan a ``.tgm`` file and return a :class:`FileIndex`.

    Every message is visited in order.  For each one the metadata and the
    object descriptors are obtained, and one :class:`ObjectInfo` is recorded
    per descriptor, carrying the per-object metadata (``meta.base`` plus
    ``desc.params``) and a copy of the message-level ``meta.extra``.

    Parameters
    ----------
    file_path
        Path or remote URL to the ``.tgm`` file.  Local paths are made
        absolute; remote URLs are used unchanged.
    storage_options
        Key-value pairs forwarded to the object store backend for
        remote URLs (an empty dict is passed when ``None``).  Ignored for
        local files.

    Returns
    -------
    FileIndex
        Index whose ``file_path`` is the resolved location and whose
        ``objects`` are ordered by message, then by object.
    """
    # Imported lazily, inside the function, as in the rest of this module.
    import tensogram

    remote = tensogram.is_remote_url(file_path)
    location = file_path if remote else os.path.abspath(file_path)
    index = FileIndex(file_path=location)

    handle = (
        tensogram.TensogramFile.open_remote(location, storage_options or {})
        if remote
        else tensogram.TensogramFile.open(location)
    )

    with handle:
        for msg_idx in range(len(handle)):
            if remote:
                # Remote files expose metadata and descriptors in one call.
                decoded = handle.file_decode_descriptors(msg_idx)
                meta = decoded["metadata"]
                descriptors = decoded["descriptors"]
            else:
                raw = handle.read_message(msg_idx)
                meta = tensogram.decode_metadata(raw)
                _, descriptors = tensogram.decode_descriptors(raw)

            extra = _extra_from_meta(meta)

            for obj_idx, desc in enumerate(descriptors):
                object_meta = _merge_per_object_meta(meta, obj_idx, desc)
                dims = tuple(desc.shape)
                index.objects.append(
                    ObjectInfo(
                        msg_index=msg_idx,
                        obj_index=obj_idx,
                        ndim=desc.ndim,
                        shape=dims,
                        dtype=desc.dtype,
                        descriptor=desc,
                        per_object_meta=object_meta,
                        # Each object gets its own copy of the shared metadata.
                        common_meta=dict(extra),
                    )
                )

    return index
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def scan_message(raw_msg: bytes) -> list[ObjectInfo]:
    """Scan one in-memory message and describe each of its objects.

    Parameters
    ----------
    raw_msg
        The encoded message bytes.

    Returns
    -------
    list[ObjectInfo]
        One entry per descriptor, in descriptor order.  ``msg_index`` is
        always ``0`` because the message is scanned outside any file.
    """
    import tensogram

    meta = tensogram.decode_metadata(raw_msg)
    extra = _extra_from_meta(meta)

    _, descriptors = tensogram.decode_descriptors(raw_msg)

    return [
        ObjectInfo(
            per_object_meta=_merge_per_object_meta(meta, obj_idx, desc),
            shape=tuple(desc.shape),
            msg_index=0,
            obj_index=obj_idx,
            ndim=desc.ndim,
            dtype=desc.dtype,
            descriptor=desc,
            # Each object gets its own copy of the shared metadata.
            common_meta=dict(extra),
        )
        for obj_idx, desc in enumerate(descriptors)
    ]
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
# (C) Copyright 2026- ECMWF and individual contributors.
|
|
2
|
+
#
|
|
3
|
+
# This software is licensed under the terms of the Apache Licence Version 2.0
|
|
4
|
+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
|
5
|
+
# In applying this licence, ECMWF does not waive the privileges and immunities
|
|
6
|
+
# granted to it by virtue of its status as an intergovernmental organisation nor
|
|
7
|
+
# does it submit to any jurisdiction.
|
|
8
|
+
|
|
9
|
+
"""Data store: bridge between tensogram messages and xarray Variables.
|
|
10
|
+
|
|
11
|
+
``TensogramDataStore`` reads a single tensogram message (identified by file
|
|
12
|
+
path and message index) and produces :class:`xarray.Variable` objects with
|
|
13
|
+
lazy-loaded data backed by :class:`TensogramBackendArray`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import threading
|
|
21
|
+
from collections.abc import Sequence
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import xarray as xr
|
|
26
|
+
from xarray.core import indexing
|
|
27
|
+
|
|
28
|
+
from tensogram_xarray.array import TensogramBackendArray, _supports_range_decode
|
|
29
|
+
from tensogram_xarray.coords import detect_coords
|
|
30
|
+
from tensogram_xarray.mapping import resolve_dim_names, resolve_variable_name
|
|
31
|
+
from tensogram_xarray.scanner import RESERVED_KEY
|
|
32
|
+
|
|
33
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
# ``ml_dtypes`` is optional: when it is importable it supplies a genuine
# bfloat16 scalar type, otherwise bfloat16 is exposed as raw 2-byte words.
try:
    import ml_dtypes
except ImportError:
    _BFLOAT16_DTYPE = np.dtype("uint16")
else:
    _BFLOAT16_DTYPE = ml_dtypes.bfloat16

# Lookup table from (lower-case) tensogram dtype strings to numpy dtypes.
_DTYPE_MAP: dict[str, np.dtype] = {
    # Floating point
    "float16": np.dtype(np.float16),
    "bfloat16": np.dtype(_BFLOAT16_DTYPE),
    "float32": np.dtype(np.float32),
    "float64": np.dtype(np.float64),
    # Complex
    "complex64": np.dtype(np.complex64),
    "complex128": np.dtype(np.complex128),
    # Signed integers
    "int8": np.dtype(np.int8),
    "int16": np.dtype(np.int16),
    "int32": np.dtype(np.int32),
    "int64": np.dtype(np.int64),
    # Unsigned integers
    "uint8": np.dtype(np.uint8),
    "uint16": np.dtype(np.uint16),
    "uint32": np.dtype(np.uint32),
    "uint64": np.dtype(np.uint64),
    # Packed bits are surfaced as raw bytes.
    "bitmask": np.dtype(np.uint8),
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _to_numpy_dtype(tgm_dtype: str) -> np.dtype:
    """Translate a tensogram dtype string into a numpy dtype.

    The lower-cased name is looked up in ``_DTYPE_MAP`` first; names not in
    the table are handed to ``np.dtype`` unchanged.

    Raises
    ------
    TypeError
        If numpy rejects the name as well.  The message names the offending
        dtype and the original error is chained as the cause.
    """
    known = _DTYPE_MAP.get(tgm_dtype.lower())
    if known is not None:
        return known

    # Not in the table: let numpy try to interpret the original string.
    try:
        return np.dtype(tgm_dtype)
    except TypeError as exc:
        msg = f"unsupported tensogram dtype {tgm_dtype!r}: {exc}"
        raise TypeError(msg) from exc
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class TensogramDataStore:
    """Read-only data store for a single tensogram message.

    Constructing the store opens the file and reads the metadata and object
    descriptors of the selected message.  :meth:`build_dataset` then wraps
    every object in a lazily indexed :class:`TensogramBackendArray`.

    Parameters
    ----------
    file_path
        Path or remote URL to the ``.tgm`` file.
    msg_index
        Index of the message within the file.
    dim_names
        Optional user-specified dimension names for data variables.
    variable_key
        Optional dotted metadata path for variable naming.
    verify_hash
        Whether to verify object hashes on decode.
    range_threshold
        Maximum fraction of total array elements (0.0-1.0) for which
        partial ``decode_range()`` is used.  Default ``0.5``.
    storage_options
        Key-value pairs forwarded to the object store backend when
        ``file_path`` is a remote URL (S3, GCS, Azure, HTTP).  Used
        for credentials, region, endpoint overrides, etc.  Ignored
        for local files.
    """

    def __init__(
        self,
        file_path: str,
        msg_index: int = 0,
        dim_names: Sequence[str] | None = None,
        variable_key: str | None = None,
        verify_hash: bool = False,
        range_threshold: float = 0.5,
        storage_options: dict[str, Any] | None = None,
    ):
        import tensogram

        self._is_remote = tensogram.is_remote_url(file_path)
        # Local paths are normalised to absolute; remote URLs stay untouched.
        self.file_path = file_path if self._is_remote else os.path.abspath(file_path)
        self.msg_index = msg_index
        self.dim_names = dim_names
        self.variable_key = variable_key
        self.verify_hash = verify_hash
        self.range_threshold = range_threshold
        self.storage_options = storage_options
        # One lock and one file handle are handed to every backend array
        # created by this store.
        self._lock = threading.Lock()
        self._backend_arrays: list[TensogramBackendArray] = []

        self._file = self._open_file()
        self._meta, self._descriptors = self._read_metadata()

    def _open_file(self) -> Any:
        """Open ``self.file_path`` with the remote or local opener."""
        import tensogram

        if self._is_remote:
            return tensogram.TensogramFile.open_remote(self.file_path, self.storage_options or {})
        return tensogram.TensogramFile.open(self.file_path)

    def _read_metadata(self) -> tuple[Any, list]:
        """Return ``(metadata, descriptors)`` for the selected message."""
        import tensogram

        if self._is_remote:
            # Remote files expose metadata and descriptors in one call.
            result = self._file.file_decode_descriptors(self.msg_index)
            return result["metadata"], result["descriptors"]

        raw = self._file.read_message(self.msg_index)
        meta = tensogram.decode_metadata(raw)
        _, descriptors = tensogram.decode_descriptors(raw)
        return meta, descriptors

    def _get_common_meta(self) -> dict[str, Any]:
        """Extract message-level metadata for Dataset attributes.

        Reads from ``meta.extra`` (message-level annotations) and always sets
        ``tensogram_version`` from ``meta.version`` (``2`` when the attribute
        is missing), overriding any key of that name found in ``extra``.
        """
        attrs: dict[str, Any] = {}
        extra = getattr(self._meta, "extra", None)
        if extra and isinstance(extra, dict):
            attrs.update(extra)
        attrs["tensogram_version"] = getattr(self._meta, "version", 2)
        return attrs

    def _get_per_object_meta(self, obj_index: int, desc: Any) -> dict[str, Any]:
        """Extract per-object metadata.

        Reads from ``meta.base[obj_index]`` (primary), filtering out the
        ``_reserved_`` key (encoder-populated tensor info).  Then merges in
        ``desc.params`` (fallback for extra keys in the descriptor dict);
        keys already taken from the base entry are not overwritten.

        If ``obj_index`` is out of range (fewer base entries than objects),
        a warning is logged and the base entry is treated as empty.
        """
        meta: dict[str, Any] = {}

        # Primary source: meta.base[i]
        base = getattr(self._meta, "base", None)
        if isinstance(base, list):
            if obj_index < len(base):
                entry = base[obj_index]
                if isinstance(entry, dict):
                    meta.update((k, v) for k, v in entry.items() if k != RESERVED_KEY)
            else:
                logger.warning(
                    "meta.base has %d entries but object index %d requested; "
                    "per-object metadata will be empty for this object",
                    len(base),
                    obj_index,
                )

        # Fallback/supplement: desc.params (extra descriptor keys)
        params = getattr(desc, "params", None)
        if params and isinstance(params, dict):
            for k, v in params.items():
                meta.setdefault(k, v)
        return meta

    def _make_lazy_array(
        self,
        obj_index: int,
        desc: Any,
        shape: tuple[int, ...],
        np_dtype: np.dtype,
    ) -> Any:
        """Wrap one object in a lazily indexed backend array.

        The backend array shares this store's lock and open file handle and
        is remembered in ``self._backend_arrays`` so :meth:`close` can detach
        it from the file later.
        """
        backend_array = TensogramBackendArray(
            file_path=self.file_path,
            msg_index=self.msg_index,
            obj_index=obj_index,
            shape=shape,
            dtype=np_dtype,
            supports_range=_supports_range_decode(desc),
            verify_hash=self.verify_hash,
            range_threshold=self.range_threshold,
            lock=self._lock,
            storage_options=self.storage_options,
            shared_file=self._file,
        )
        self._backend_arrays.append(backend_array)
        return indexing.LazilyIndexedArray(backend_array)

    def build_dataset(
        self,
        drop_variables: set[str] | None = None,
    ) -> xr.Dataset:
        """Construct an :class:`xr.Dataset` from this message.

        Objects are split into coordinates and data variables by
        ``detect_coords``.  Both kinds become lazy-loaded variables whose
        attributes are the per-object metadata; message-level metadata
        becomes the Dataset attributes.

        Parameters
        ----------
        drop_variables
            Names of data variables to leave out.  Only data variables are
            checked against this set; detected coordinates are always built.
        """
        dataset_attrs = self._get_common_meta()

        # Gather per-object metadata from payload + descriptor params.
        obj_metas = [self._get_per_object_meta(i, d) for i, d in enumerate(self._descriptors)]

        # Detect coordinates vs data variables.
        coord_indices, var_indices, coord_dim_names = detect_coords(obj_metas)

        # Build coordinate variables; each one is named after its own
        # single dimension.
        coord_vars: dict[str, xr.Variable] = {}
        for ci in coord_indices:
            desc = self._descriptors[ci]
            dim_name = coord_dim_names[ci]
            np_dtype = _to_numpy_dtype(desc.dtype)
            shape = tuple(desc.shape)

            lazy_data = self._make_lazy_array(ci, desc, shape, np_dtype)
            coord_vars[dim_name] = xr.Variable((dim_name,), lazy_data, dict(obj_metas[ci]))

        data_vars: dict[str, xr.Variable] = {}
        for vi in var_indices:
            desc = self._descriptors[vi]
            var_name = resolve_variable_name(vi, obj_metas[vi], self.variable_key)
            if drop_variables and var_name in drop_variables:
                continue

            np_dtype = _to_numpy_dtype(desc.dtype)
            shape = tuple(desc.shape)
            dims = self._resolve_dims_for_var(shape, coord_vars)

            lazy_data = self._make_lazy_array(vi, desc, shape, np_dtype)
            data_vars[var_name] = xr.Variable(dims, lazy_data, dict(obj_metas[vi]))

        # Assemble Dataset.
        return xr.Dataset(data_vars, coords=coord_vars, attrs=dataset_attrs)

    def _get_meta_dim_names(self, ndim: int) -> list[str] | dict[int, str]:
        """Return dimension name hints from ``_extra_["dim_names"]``.

        Any writer can embed hints at ``_extra_["dim_names"]`` to provide
        meaningful dimension names without the reader passing ``dim_names``
        explicitly.  The value is read from ``self._meta.extra``.  Two
        formats are accepted:

        **List (preferred)** — axis-ordered names, one per dimension::

            "_extra_": {"dim_names": ["values", "level"]}

        This handles axes with identical sizes correctly because names
        are assigned by position.  The list is only honoured when its
        length equals ``ndim``.

        **Dict (legacy)** — size-to-name mapping::

            "_extra_": {"dim_names": {"1000": "values", "50": "level"}}

        A dict cannot disambiguate axes with the same size; only the
        first matching axis receives the hint.

        Returns an empty dict when the key is absent or malformed (including
        a list of the wrong length or a dict key that is not an integer).
        """
        try:
            raw = self._meta.extra["dim_names"]
        except (AttributeError, KeyError, TypeError):
            return {}

        # List format: axis-ordered names.
        if isinstance(raw, list):
            try:
                names = [str(n) for n in raw]
            except (TypeError, ValueError):
                return {}
            # Length mismatch — ignore the hint rather than crash.
            return names if len(names) == ndim else {}

        # Dict format: size -> name (legacy).
        if isinstance(raw, dict):
            try:
                return {int(k): str(v) for k, v in raw.items()}
            except (TypeError, ValueError):
                return {}

        return {}

    def _resolve_dims_for_var(
        self,
        shape: tuple[int, ...],
        coord_vars: dict[str, xr.Variable],
    ) -> tuple[str, ...]:
        """Assign dimension names for a data variable.

        Strategy, in order of precedence:

        1. If the user provided ``dim_names``, use them directly.
        2. If the producer embedded an axis-ordered **list** at
           ``_extra_["dim_names"]`` whose length matches the rank, use it
           as-is (coordinates are not consulted).
        3. Otherwise, per axis: take the first not-yet-used detected
           coordinate of the same size; failing that, the producer's
           size-to-name **dict** hint if that name is not yet used.
        4. Fall back to ``dim_0``, ``dim_1``, ... for axes still unnamed.
        """
        ndim = len(shape)

        # (1) Explicit user mapping.
        if self.dim_names is not None:
            return tuple(resolve_dim_names(ndim, self.dim_names))

        # (2) Axis-ordered list supplied by the producer wins outright.
        meta_hints = self._get_meta_dim_names(ndim)
        if isinstance(meta_hints, list):
            return tuple(meta_hints)

        # (3) Match by size.  Build a map: size -> coord dim names of that size.
        size_to_coord: dict[int, list[str]] = {}
        for cname, cvar in coord_vars.items():
            size_to_coord.setdefault(cvar.shape[0], []).append(cname)

        dims: list[str] = []
        used: set[str] = set()
        for axis, axis_size in enumerate(shape):
            # Prefer a detected coordinate dimension that is still free.
            name = next(
                (c for c in size_to_coord.get(axis_size, ()) if c not in used),
                None,
            )
            # Fall back to producer hint (dict), unless already taken.
            if name is None and axis_size in meta_hints:
                hint = meta_hints[axis_size]
                if hint not in used:
                    name = hint

            if name is None:
                # (4) Final fallback; positional names are not tracked in
                # ``used``.
                dims.append(f"dim_{axis}")
            else:
                dims.append(name)
                used.add(name)

        return tuple(dims)

    def close(self) -> None:
        """Detach every backend array from the shared file and drop it.

        NOTE(review): the file handle is only dereferenced here, not closed
        explicitly, so its release is left to the handle's own finaliser.
        Confirm that is intended before relying on prompt release.
        """
        for arr in self._backend_arrays:
            arr._shared_file = None
        self._backend_arrays.clear()
        self._file = None
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tensogram-xarray
|
|
3
|
+
Version: 0.14.0
|
|
4
|
+
Summary: xarray backend engine for tensogram .tgm files
|
|
5
|
+
Project-URL: Homepage, https://sites.ecmwf.int/docs/tensogram/main
|
|
6
|
+
Project-URL: Repository, https://github.com/ecmwf/tensogram
|
|
7
|
+
Project-URL: Documentation, https://sites.ecmwf.int/docs/tensogram/main
|
|
8
|
+
Author-email: ECMWF <software@ecmwf.int>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: tensogram<0.15,>=0.14.0
|
|
18
|
+
Requires-Dist: xarray>=2022.06
|
|
19
|
+
Provides-Extra: dask
|
|
20
|
+
Requires-Dist: dask[array]; extra == 'dask'
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
tensogram_xarray/__init__.py,sha256=UmIUhCzEZyCRC9zueuwHITobJcMXp34qS7QCjX9NAIw,895
|
|
2
|
+
tensogram_xarray/array.py,sha256=MFb2m3JAfo2hQVnABq1zcdC9mMa55X6r0uGn0Te3eXY,14658
|
|
3
|
+
tensogram_xarray/backend.py,sha256=JIrxmYXNNWHGn_kHtRUA_VngvtC9gzA2Z0LWdodzHhg,4970
|
|
4
|
+
tensogram_xarray/coords.py,sha256=Vk7k7oF-GgS6EE5BwMsHanzWlXp4ersBuuuXVvyjayM,3398
|
|
5
|
+
tensogram_xarray/mapping.py,sha256=i763IpvWxky1G9U27e9SWMPanCHLOlUl9DyHrfX2H54,3019
|
|
6
|
+
tensogram_xarray/merge.py,sha256=zFr_AXwqtgz1R2Ag8RDws-kdfKkiuktTOh5XIPnJF4s,29206
|
|
7
|
+
tensogram_xarray/scanner.py,sha256=uCeV-B_WBegNPP7qAuZPhUcgGUtQ8bEJfzMB5GOcKVQ,6439
|
|
8
|
+
tensogram_xarray/store.py,sha256=-XfEnCyScpE0aibzoGQka1oZ7yay8PUnN3V1XqZBj1k,14083
|
|
9
|
+
tensogram_xarray-0.14.0.dist-info/METADATA,sha256=Eb4lYe2yFFTif90BAIbSxBgGPumro-oxvU_Ef90adcw,936
|
|
10
|
+
tensogram_xarray-0.14.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
11
|
+
tensogram_xarray-0.14.0.dist-info/entry_points.txt,sha256=l5kLmFeoDJheSrXui7OlHa3WeSkDu5y5lmUoC7kU0Rc,82
|
|
12
|
+
tensogram_xarray-0.14.0.dist-info/RECORD,,
|