ocf-data-sampler 0.5.3__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocf_data_sampler/load/nwp/providers/utils.py +1 -2
- ocf_data_sampler/load/open_xarray_tensorstore.py +167 -0
- ocf_data_sampler/load/satellite.py +1 -3
- {ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/METADATA +10 -3
- {ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/RECORD +7 -7
- ocf_data_sampler/load/open_tensorstore_zarrs.py +0 -92
- {ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/WHEEL +0 -0
- {ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/top_level.txt +0 -0
ocf_data_sampler/load/nwp/providers/utils.py
```diff
@@ -3,9 +3,8 @@
 from glob import glob
 
 import xarray as xr
-from xarray_tensorstore import open_zarr
 
-from ocf_data_sampler.load.
+from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs
 
 
 def open_zarr_paths(
```
ocf_data_sampler/load/open_xarray_tensorstore.py
```diff
@@ -0,0 +1,167 @@
+"""Utilities for loading TensorStore data into Xarray.
+
+This module uses and adapts internal functions from the Google xarray-tensorstore project [1],
+licensed under the Apache License, Version 2.0. See [2] for details.
+
+Modifications copyright 2025 Open Climate Fix. Licensed under the MIT License.
+
+Modifications from the original include:
+- Adding support for opening multiple zarr files as a single xarray object
+- Support for zarr 3 -> https://github.com/google/xarray-tensorstore/pull/22
+
+References:
+    [1] https://github.com/google-research/tensorstore/blob/main/tensorstore/xarray.py
+    [2] https://www.apache.org/licenses/LICENSE-2.0
+"""
+
+import os.path
+import re
+
+import tensorstore as ts
+import xarray as xr
+import zarr
+from xarray_tensorstore import (
+    _DEFAULT_STORAGE_DRIVER,
+    _raise_if_mask_and_scale_used_for_data_vars,
+    _TensorStoreAdapter,
+)
+
+
+def _zarr_spec_from_path(path: str, zarr_format: int) -> ...:
+    if re.match(r"\w+\://", path):  # path is a URI
+        kv_store = path
+    else:
+        kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path}
+    return {"driver": f"zarr{zarr_format}", "kvstore": kv_store}
+
+
+def _get_data_variable_array_futures(
+    path: str,
+    context: ts.Context | None,
+    variables: list[str],
+) -> dict[str, ts.Future]:
+    """Open all data variables in a zarr group and return futures.
+
+    Args:
+        path: path or URI to zarr group to open.
+        context: TensorStore configuration options to use when opening arrays.
+        variables: The variables in the zarr group to open.
+    """
+    zarr_format = zarr.open(path).metadata.zarr_format
+    specs = {k: _zarr_spec_from_path(os.path.join(path, k), zarr_format) for k in variables}
+    return {k: ts.open(spec, read=True, write=False, context=context) for k, spec in specs.items()}
+
+
+def _tensorstore_open_zarrs(
+    paths: list[str],
+    data_vars: list[str],
+    concat_axes: list[int],
+    context: ts.Context,
+) -> dict[str, ts.TensorStore]:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        data_vars: List of data variable names to open.
+        concat_axes: List of axes along which to concatenate the data variables.
+        context: TensorStore context.
+    """
+    # Open all the variables from all the datasets - returned as futures
+    arrays_list: list[dict[str, ts.Future]] = []
+    for path in paths:
+        arrays_list.append(_get_data_variable_array_futures(path, context, data_vars))
+
+    # Wait for the async open operations
+    arrays_list = [{k: v.result() for k, v in arrays.items()} for arrays in arrays_list]
+
+    # Concatenate each of the variables along the required axis
+    arrays = {}
+    for k, axis in zip(data_vars, concat_axes, strict=True):
+        variable_arrays = [d[k] for d in arrays_list]
+        arrays[k] = ts.concat(variable_arrays, axis=axis)
+
+    return arrays
+
+
+def open_zarr(
+    path: str,
+    context: ts.Context | None = None,
+    mask_and_scale: bool = True,
+) -> xr.Dataset:
+    """Open an xarray.Dataset from zarr using TensorStore.
+
+    Args:
+        path: path or URI to zarr group to open.
+        context: TensorStore configuration options to use when opening arrays.
+        mask_and_scale: if True (default), attempt to apply masking and scaling like
+            xarray.open_zarr(). This is only supported for coordinate variables and
+            otherwise will raise an error.
+
+    Returns:
+        Dataset with all data variables opened via TensorStore.
+    """
+    if context is None:
+        context = ts.Context()
+
+    # Avoid using dask by setting `chunks=None`
+    ds = xr.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale)
+
+    if mask_and_scale:
+        _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+    # Open all data variables using tensorstore - returned as futures
+    data_vars = list(ds.data_vars)
+    arrays = _get_data_variable_array_futures(path, context, data_vars)
+
+    # Wait for the async open operations
+    arrays = {k: v.result() for k, v in arrays.items()}
+
+    # Adapt the tensorstore arrays and plug them into the xarray object
+    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+    return ds.copy(data=new_data)
+
+
+def open_zarrs(
+    paths: list[str],
+    concat_dim: str,
+    context: ts.Context | None = None,
+    mask_and_scale: bool = True,
+) -> xr.Dataset:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        concat_dim: Dimension along which to concatenate the data variables.
+        context: TensorStore context.
+        mask_and_scale: Whether to mask and scale the data.
+
+    Returns:
+        Concatenated Dataset with all data variables opened via TensorStore.
+    """
+    if context is None:
+        context = ts.Context()
+
+    ds_list = [xr.open_zarr(p, mask_and_scale=mask_and_scale, decode_timedelta=True) for p in paths]
+    ds = xr.concat(
+        ds_list,
+        dim=concat_dim,
+        data_vars="minimal",
+        compat="equals",
+        combine_attrs="no_conflicts",
+    )
+
+    if mask_and_scale:
+        _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+    # Find the axis along which each data array must be concatenated
+    data_vars = list(ds.data_vars)
+    concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
+
+    # Open and concat all zarrs so each variable is a single TensorStore array
+    arrays = _tensorstore_open_zarrs(paths, data_vars, concat_axes, context)
+
+    # Plug the arrays into the xarray object
+    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+    return ds.copy(data=new_data)
```
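For orientation, here is a minimal usage sketch of the two public functions the new module exposes. The store paths and the "time" dimension are illustrative assumptions, not values from the package.

```python
# Hedged usage sketch of open_xarray_tensorstore; "sat_2023.zarr",
# "sat_2024.zarr" and the "time" dimension are illustrative assumptions.
from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs

# Single store: data variables are backed by lazy TensorStore arrays
ds = open_zarr("sat_2023.zarr")

# Multiple stores concatenated along one dimension into a single Dataset
ds_all = open_zarrs(["sat_2023.zarr", "sat_2024.zarr"], concat_dim="time")

# Indexing stays lazy; values are only read from storage when materialised
# (assuming the dataset has a "time" dimension)
first_steps = ds_all.isel(time=slice(0, 4)).compute()
```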
ocf_data_sampler/load/satellite.py
```diff
@@ -1,16 +1,14 @@
 """Satellite loader."""
 import numpy as np
 import xarray as xr
-from xarray_tensorstore import open_zarr
 
+from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs
 from ocf_data_sampler.load.utils import (
     check_time_unique_increasing,
     get_xr_data_array_from_xr_dataset,
     make_spatial_coords_increasing,
 )
 
-from .open_tensorstore_zarrs import open_zarrs
-
 
 def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray:
     """Lazily opens the zarr store and validates data types.
```
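Given the `open_sat_data(zarr_path: str | list[str])` signature above, the loader can now route a single path through `open_zarr` and a list of paths through `open_zarrs`. A hedged sketch of that dispatch follows; the helper name and the "time" dimension are assumptions, since this hunk only shows the import change.

```python
import xarray as xr

from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs


def _open_single_or_multi(zarr_path: str | list[str]) -> xr.Dataset:
    """Hypothetical helper: dispatch on path type as open_sat_data presumably does."""
    if isinstance(zarr_path, list):
        # Several stores are stitched into one dataset along the time axis
        return open_zarrs(zarr_path, concat_dim="time")
    # A single store is opened directly
    return open_zarr(zarr_path)
```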
{ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/METADATA
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.3
+Version: 0.5.6
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
@@ -28,14 +28,14 @@ License: MIT License
 Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: torch
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: xarray
 Requires-Dist: zarr
-Requires-Dist: numcodecs
+Requires-Dist: numcodecs
 Requires-Dist: dask
 Requires-Dist: matplotlib
 Requires-Dist: pvlib
@@ -45,6 +45,7 @@ Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
 Requires-Dist: xarray-tensorstore==0.1.5
+Requires-Dist: zarr>=3
 
 # ocf-data-sampler
 
@@ -62,6 +63,12 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
 > [!Note]
 > This repository is still in early development and large changes to the user facing functions may still occur.
 
+## Licence
+
+This project is primarily licensed under the MIT License (see LICENSE).
+
+It includes and adapts internal functions from the Google xarray-tensorstore project, licensed under the Apache License, Version 2.0.
+
 ## Documentation
 
 **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
```
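The new `zarr>=3` pin lines up with the format detection in the new module, which reads each store's zarr format and interpolates it into the TensorStore driver name (`f"zarr{zarr_format}"`). A small sketch of that check, with an illustrative store path:

```python
import zarr

# zarr >= 3 exposes .metadata.zarr_format on opened groups, which the new
# module uses to pick the matching TensorStore zarr driver
group = zarr.open("sat_2023.zarr")  # illustrative path
print(group.metadata.zarr_format)   # 2 or 3, depending on the store
```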
{ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/RECORD
```diff
@@ -9,8 +9,8 @@ ocf_data_sampler/data/uk_gsp_locations_20250109.csv,sha256=XZISFatnbpO9j8LwaxNKF
 ocf_data_sampler/load/__init__.py,sha256=-vQP9g0UOWdVbjEGyVX_ipa7R1btmiETIKAf6aw4d78,201
 ocf_data_sampler/load/gsp.py,sha256=d30jQWnwFaLj6rKNMHdz1qD8fzF8q--RNnEXT7bGiX0,2981
 ocf_data_sampler/load/load_dataset.py,sha256=K8rWykjII-3g127If7WRRFivzHNx3SshCvZj4uQlf28,2089
-ocf_data_sampler/load/
-ocf_data_sampler/load/satellite.py,sha256=
+ocf_data_sampler/load/open_xarray_tensorstore.py,sha256=i2IWd-uNctP4TGc5NXDlMd2Or7tmLQduPxwrBfDx-7g,5618
+ocf_data_sampler/load/satellite.py,sha256=B-m0_Py_D0GwzwX5o-ixyeXntV5Z4k4MbmMBHZLUWMM,1831
 ocf_data_sampler/load/site.py,sha256=WtOy20VMHJIY0IwEemCdcecSDUGcVaLUown-4ixJw90,2147
 ocf_data_sampler/load/utils.py,sha256=AGL0aOOQPrgqNBTjlBtR7Qg1PyQov3DFJo-y198u8pY,2044
 ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
@@ -21,7 +21,7 @@ ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=P7JqfssmQq8eHKKXaBexsxts325A
 ocf_data_sampler/load/nwp/providers/gfs.py,sha256=h6vm-Rfz1JGOE4P_fP1_XQJ3bugNbeNAIyt56N8B1Dc,1066
 ocf_data_sampler/load/nwp/providers/icon.py,sha256=iVZwLKRr_D74_kAu5MHir6pRKEfbTmIxFRZAxzmiYdI,1257
 ocf_data_sampler/load/nwp/providers/ukv.py,sha256=2i32VM9gnmWUpbL0qBSp_AKzuyKucXZPS8yklbcGlbc,1039
-ocf_data_sampler/load/nwp/providers/utils.py,sha256=
+ocf_data_sampler/load/nwp/providers/utils.py,sha256=IjJ3w7zDgXNFaVa4TMk8yVCvdzfrIRu5tn1OaaQ7Zso,2304
 ocf_data_sampler/numpy_sample/__init__.py,sha256=5bdpzM8hMAEe0XRSZ9AZFQdqEeBsEPhaF79Y8bDx3GQ,407
 ocf_data_sampler/numpy_sample/collate.py,sha256=hoxIc5SoHoIs3Nx37aRZzWChpswjy9lHUgaKgHIoo80,2039
 ocf_data_sampler/numpy_sample/common_types.py,sha256=9CjYHkUTx0ObduWh43fhsybZCTXvexql7qC2ptMDoek,377
@@ -56,7 +56,7 @@ ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul
 scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
 scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
 utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
-ocf_data_sampler-0.5.
-ocf_data_sampler-0.5.
-ocf_data_sampler-0.5.
-ocf_data_sampler-0.5.
+ocf_data_sampler-0.5.6.dist-info/METADATA,sha256=dOphQCwkuQjbwplFe3NDTSSc6Dw2z07KYQ4XtLgeGqo,12816
+ocf_data_sampler-0.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ocf_data_sampler-0.5.6.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
+ocf_data_sampler-0.5.6.dist-info/RECORD,,
```
ocf_data_sampler/load/open_tensorstore_zarrs.py (removed)
```diff
@@ -1,92 +0,0 @@
-"""Open multiple zarrs with TensorStore.
-
-This extends the functionality of xarray_tensorstore to open multiple zarr stores
-"""
-
-import os
-
-import tensorstore as ts
-import xarray as xr
-from xarray_tensorstore import (
-    _raise_if_mask_and_scale_used_for_data_vars,
-    _TensorStoreAdapter,
-    _zarr_spec_from_path,
-)
-
-
-def tensorstore_open_multi_zarrs(
-    paths: list[str],
-    data_vars: list[str],
-    concat_axes: list[int],
-    context: ts.Context,
-    write: bool,
-) -> dict[str, ts.TensorStore]:
-    """Open multiple zarrs with TensorStore.
-
-    Args:
-        paths: List of paths to zarr stores.
-        data_vars: List of data variable names to open.
-        concat_axes: List of axes along which to concatenate the data variables.
-        context: TensorStore context.
-        write: Whether to open the stores for writing.
-    """
-    arrays_list = []
-    for path in paths:
-        specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in data_vars}
-        array_futures = {
-            k: ts.open(spec, read=True, write=write, context=context)
-            for k, spec in specs.items()
-        }
-        arrays_list.append({k: v.result() for k, v in array_futures.items()})
-
-    arrays = {}
-    for k, axis in zip(data_vars, concat_axes, strict=False):
-        datasets = [d[k] for d in arrays_list]
-        arrays[k] = ts.concat(datasets, axis=axis)
-
-    return arrays
-
-
-def open_zarrs(
-    paths: list[str],
-    concat_dim: str,
-    *,
-    context: ts.Context | None = None,
-    mask_and_scale: bool = True,
-    write: bool = False,
-) -> xr.Dataset:
-    """Open multiple zarrs with TensorStore.
-
-    Args:
-        paths: List of paths to zarr stores.
-        concat_dim: Dimension along which to concatenate the data variables.
-        context: TensorStore context.
-        mask_and_scale: Whether to mask and scale the data.
-        write: Whether to open the stores for writing.
-    """
-    if context is None:
-        context = ts.Context()
-
-    ds = xr.open_mfdataset(
-        paths,
-        concat_dim=concat_dim,
-        combine="nested",
-        mask_and_scale=mask_and_scale,
-        decode_timedelta=True,
-    )
-
-    if mask_and_scale:
-        # Data variables get replaced below with _TensorStoreAdapter arrays, which
-        # don't get masked or scaled. Raising an error avoids surprising users with
-        # incorrect data values.
-        _raise_if_mask_and_scale_used_for_data_vars(ds)
-
-    data_vars = list(ds.data_vars)
-
-    concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
-
-    arrays = tensorstore_open_multi_zarrs(paths, data_vars, concat_axes, context, write)
-
-    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
-
-    return ds.copy(data=new_data)
```
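Compared with its replacement, the deleted `open_zarrs` built its dataset skeleton with dask-backed `xr.open_mfdataset`, whereas the new module opens each store with `xr.open_zarr` and concatenates explicitly under stricter settings. A side-by-side sketch of the two approaches as they appear in this diff, with illustrative paths and dimension name:

```python
import xarray as xr

paths = ["a.zarr", "b.zarr"]  # illustrative store paths
concat_dim = "time"           # illustrative concatenation dimension

# Old approach (deleted module): one dask-backed open_mfdataset call
old_ds = xr.open_mfdataset(
    paths, concat_dim=concat_dim, combine="nested", decode_timedelta=True
)

# New approach (open_xarray_tensorstore.open_zarrs): open each store, then
# concatenate with compat="equals" so shared variables must match exactly
new_ds = xr.concat(
    [xr.open_zarr(p, decode_timedelta=True) for p in paths],
    dim=concat_dim,
    data_vars="minimal",
    compat="equals",
    combine_attrs="no_conflicts",
)
```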
{ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/WHEEL

File without changes

{ocf_data_sampler-0.5.3.dist-info → ocf_data_sampler-0.5.6.dist-info}/top_level.txt

File without changes