ocf-data-sampler 0.5.3__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/ocf_data_sampler/load/nwp/providers/utils.py
+++ b/ocf_data_sampler/load/nwp/providers/utils.py
@@ -3,9 +3,8 @@
 from glob import glob
 
 import xarray as xr
-from xarray_tensorstore import open_zarr
 
-from ocf_data_sampler.load.open_tensorstore_zarrs import open_zarrs
+from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs
 
 
 def open_zarr_paths(
--- /dev/null
+++ b/ocf_data_sampler/load/open_xarray_tensorstore.py
@@ -0,0 +1,167 @@
+"""Utilities for loading TensorStore data into Xarray.
+
+This module uses and adapts internal functions from the Google xarray-tensorstore project [1],
+licensed under the Apache License, Version 2.0. See [2] for details.
+
+Modifications copyright 2025 Open Climate Fix. Licensed under the MIT License.
+
+Modifications from the original include:
+- Adding support for opening multiple zarr files as a single xarray object
+- Support for zarr 3 -> https://github.com/google/xarray-tensorstore/pull/22
+
+References:
+    [1] https://github.com/google-research/tensorstore/blob/main/tensorstore/xarray.py
+    [2] https://www.apache.org/licenses/LICENSE-2.0
+"""
+
+import os.path
+import re
+
+import tensorstore as ts
+import xarray as xr
+import zarr
+from xarray_tensorstore import (
+    _DEFAULT_STORAGE_DRIVER,
+    _raise_if_mask_and_scale_used_for_data_vars,
+    _TensorStoreAdapter,
+)
+
+
+def _zarr_spec_from_path(path: str, zarr_format: int) -> ...:
+    if re.match(r"\w+\://", path):  # path is a URI
+        kv_store = path
+    else:
+        kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path}
+    return {"driver": f"zarr{zarr_format}", "kvstore": kv_store}
+
+
+def _get_data_variable_array_futures(
+    path: str,
+    context: ts.Context | None,
+    variables: list[str],
+) -> dict[str, ts.Future]:
+    """Open all data variables in a zarr group and return futures.
+
+    Args:
+        path: path or URI to zarr group to open.
+        context: TensorStore configuration options to use when opening arrays.
+        variables: The variables in the zarr group to open.
+    """
+    zarr_format = zarr.open(path).metadata.zarr_format
+    specs = {k: _zarr_spec_from_path(os.path.join(path, k), zarr_format) for k in variables}
+    return {k: ts.open(spec, read=True, write=False, context=context) for k, spec in specs.items()}
+
+
+def _tensorstore_open_zarrs(
+    paths: list[str],
+    data_vars: list[str],
+    concat_axes: list[int],
+    context: ts.Context,
+) -> dict[str, ts.TensorStore]:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        data_vars: List of data variable names to open.
+        concat_axes: List of axes along which to concatenate the data variables.
+        context: TensorStore context.
+    """
+    # Open all the variables from all the datasets - returned as futures
+    arrays_list: list[dict[str, ts.Future]] = []
+    for path in paths:
+        arrays_list.append(_get_data_variable_array_futures(path, context, data_vars))
+
+    # Wait for the async open operations
+    arrays_list = [{k: v.result() for k, v in arrays.items()} for arrays in arrays_list]
+
+    # Concatenate each of the variables along the required axis
+    arrays = {}
+    for k, axis in zip(data_vars, concat_axes, strict=True):
+        variable_arrays = [d[k] for d in arrays_list]
+        arrays[k] = ts.concat(variable_arrays, axis=axis)
+
+    return arrays
+
+
+def open_zarr(
+    path: str,
+    context: ts.Context | None = None,
+    mask_and_scale: bool = True,
+) -> xr.Dataset:
+    """Open an xarray.Dataset from zarr using TensorStore.
+
+    Args:
+        path: path or URI to zarr group to open.
+        context: TensorStore configuration options to use when opening arrays.
+        mask_and_scale: if True (default), attempt to apply masking and scaling like
+            xarray.open_zarr(). This is only supported for coordinate variables and
+            otherwise will raise an error.
+
+    Returns:
+        Dataset with all data variables opened via TensorStore.
+    """
+    if context is None:
+        context = ts.Context()
+
+    # Avoid using dask by setting `chunks=None`
+    ds = xr.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale)
+
+    if mask_and_scale:
+        _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+    # Open all data variables using tensorstore - returned as futures
+    data_vars = list(ds.data_vars)
+    arrays = _get_data_variable_array_futures(path, context, data_vars)
+
+    # Wait for the async open operations
+    arrays = {k: v.result() for k, v in arrays.items()}
+
+    # Adapt the tensorstore arrays and plug them into the xarray object
+    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+    return ds.copy(data=new_data)
+
+
+def open_zarrs(
+    paths: list[str],
+    concat_dim: str,
+    context: ts.Context | None = None,
+    mask_and_scale: bool = True,
+) -> xr.Dataset:
+    """Open multiple zarrs with TensorStore.
+
+    Args:
+        paths: List of paths to zarr stores.
+        concat_dim: Dimension along which to concatenate the data variables.
+        context: TensorStore context.
+        mask_and_scale: Whether to mask and scale the data.
+
+    Returns:
+        Concatenated Dataset with all data variables opened via TensorStore.
+    """
+    if context is None:
+        context = ts.Context()
+
+    ds_list = [xr.open_zarr(p, mask_and_scale=mask_and_scale, decode_timedelta=True) for p in paths]
+    ds = xr.concat(
+        ds_list,
+        dim=concat_dim,
+        data_vars="minimal",
+        compat="equals",
+        combine_attrs="no_conflicts",
+    )
+
+    if mask_and_scale:
+        _raise_if_mask_and_scale_used_for_data_vars(ds)
+
+    # Find the axis along which each data array must be concatenated
+    data_vars = list(ds.data_vars)
+    concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
+
+    # Open and concat all zarrs so each variable is a single TensorStore array
+    arrays = _tensorstore_open_zarrs(paths, data_vars, concat_axes, context)
+
+    # Plug the arrays into the xarray object
+    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
+
+    return ds.copy(data=new_data)
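
For orientation, here is a minimal usage sketch of the two public functions added above. The store paths and the "time" dimension name are hypothetical; `context` is optional in both calls.

```python
import tensorstore as ts

from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs

# Single store: data variables come back as TensorStore-backed arrays
# rather than dask arrays.
ds = open_zarr("data/sat_2023.zarr")

# Several stores, lazily concatenated along a shared dimension.
ds_all = open_zarrs(
    ["data/sat_2023.zarr", "data/sat_2024.zarr"],
    concat_dim="time",
    context=ts.Context(),  # optional; a default Context is created if omitted
)
```
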
--- a/ocf_data_sampler/load/satellite.py
+++ b/ocf_data_sampler/load/satellite.py
@@ -1,16 +1,14 @@
 """Satellite loader."""
 import numpy as np
 import xarray as xr
-from xarray_tensorstore import open_zarr
 
+from ocf_data_sampler.load.open_xarray_tensorstore import open_zarr, open_zarrs
 from ocf_data_sampler.load.utils import (
     check_time_unique_increasing,
     get_xr_data_array_from_xr_dataset,
     make_spatial_coords_increasing,
 )
 
-from .open_tensorstore_zarrs import open_zarrs
-
 
 def open_sat_data(zarr_path: str | list[str]) -> xr.DataArray:
     """Lazily opens the zarr store and validates data types.
--- a/ocf_data_sampler-0.5.3.dist-info/METADATA
+++ b/ocf_data_sampler-0.5.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.3
+Version: 0.5.6
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License
@@ -28,14 +28,14 @@ License: MIT License
 Project-URL: repository, https://github.com/openclimatefix/ocf-data-sampler
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: torch
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: xarray
 Requires-Dist: zarr
-Requires-Dist: numcodecs==0.13.1
+Requires-Dist: numcodecs
 Requires-Dist: dask
 Requires-Dist: matplotlib
 Requires-Dist: pvlib
@@ -45,6 +45,7 @@ Requires-Dist: pyaml_env
 Requires-Dist: pyresample
 Requires-Dist: h5netcdf
 Requires-Dist: xarray-tensorstore==0.1.5
+Requires-Dist: zarr>=3
 
 # ocf-data-sampler
 
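
The new `zarr>=3` requirement lines up with the new module's call to `zarr.open(path).metadata.zarr_format`: the `.metadata.zarr_format` attribute is a zarr-python v3 API, used here to choose the TensorStore driver. A quick check, with a hypothetical path:

```python
import zarr

# zarr-python >= 3 exposes the on-disk format via group/array metadata;
# open_xarray_tensorstore feeds this into its f"zarr{zarr_format}" driver spec.
store = zarr.open("data/sat_2023.zarr")  # hypothetical store
print(store.metadata.zarr_format)  # 2 or 3
```
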
@@ -62,6 +63,12 @@ We are currently migrating to this repo from [ocf_datapipes](https://github.com/
 > [!Note]
 > This repository is still in early development and large changes to the user facing functions may still occur.
 
+## Licence
+
+This project is primarily licensed under the MIT License (see LICENSE).
+
+It includes and adapts internal functions from the Google xarray-tensorstore project, licensed under the Apache License, Version 2.0.
+
 ## Documentation
 
 **ocf-data-sampler** doesn't have external documentation _yet_; you can read a bit about how our torch datasets work in the README [here](ocf_data_sampler/torch_datasets/README.md).
--- a/ocf_data_sampler-0.5.3.dist-info/RECORD
+++ b/ocf_data_sampler-0.5.6.dist-info/RECORD
@@ -9,8 +9,8 @@ ocf_data_sampler/data/uk_gsp_locations_20250109.csv,sha256=XZISFatnbpO9j8LwaxNKF
 ocf_data_sampler/load/__init__.py,sha256=-vQP9g0UOWdVbjEGyVX_ipa7R1btmiETIKAf6aw4d78,201
 ocf_data_sampler/load/gsp.py,sha256=d30jQWnwFaLj6rKNMHdz1qD8fzF8q--RNnEXT7bGiX0,2981
 ocf_data_sampler/load/load_dataset.py,sha256=K8rWykjII-3g127If7WRRFivzHNx3SshCvZj4uQlf28,2089
-ocf_data_sampler/load/open_tensorstore_zarrs.py,sha256=_RHWe0GmrBSA9s1TH5I9VCMPpeZEsuRuhDt5Vyyx5Fo,2725
-ocf_data_sampler/load/satellite.py,sha256=RylkJz8avxdM5pK_liaTlD1DTboyPMgykXJ4_Ek9WBA,1840
+ocf_data_sampler/load/open_xarray_tensorstore.py,sha256=i2IWd-uNctP4TGc5NXDlMd2Or7tmLQduPxwrBfDx-7g,5618
+ocf_data_sampler/load/satellite.py,sha256=B-m0_Py_D0GwzwX5o-ixyeXntV5Z4k4MbmMBHZLUWMM,1831
 ocf_data_sampler/load/site.py,sha256=WtOy20VMHJIY0IwEemCdcecSDUGcVaLUown-4ixJw90,2147
 ocf_data_sampler/load/utils.py,sha256=AGL0aOOQPrgqNBTjlBtR7Qg1PyQov3DFJo-y198u8pY,2044
 ocf_data_sampler/load/nwp/__init__.py,sha256=SmcrnbygO5xtCKmGR4wtHrj-HI7nOAvnAtfuvRufBGQ,25
@@ -21,7 +21,7 @@ ocf_data_sampler/load/nwp/providers/ecmwf.py,sha256=P7JqfssmQq8eHKKXaBexsxts325A
 ocf_data_sampler/load/nwp/providers/gfs.py,sha256=h6vm-Rfz1JGOE4P_fP1_XQJ3bugNbeNAIyt56N8B1Dc,1066
 ocf_data_sampler/load/nwp/providers/icon.py,sha256=iVZwLKRr_D74_kAu5MHir6pRKEfbTmIxFRZAxzmiYdI,1257
 ocf_data_sampler/load/nwp/providers/ukv.py,sha256=2i32VM9gnmWUpbL0qBSp_AKzuyKucXZPS8yklbcGlbc,1039
-ocf_data_sampler/load/nwp/providers/utils.py,sha256=cVwCiC8FqNpkZFSUGv1CRqIQlKdjx1sIsb2SIUlvWV8,2333
+ocf_data_sampler/load/nwp/providers/utils.py,sha256=IjJ3w7zDgXNFaVa4TMk8yVCvdzfrIRu5tn1OaaQ7Zso,2304
 ocf_data_sampler/numpy_sample/__init__.py,sha256=5bdpzM8hMAEe0XRSZ9AZFQdqEeBsEPhaF79Y8bDx3GQ,407
 ocf_data_sampler/numpy_sample/collate.py,sha256=hoxIc5SoHoIs3Nx37aRZzWChpswjy9lHUgaKgHIoo80,2039
 ocf_data_sampler/numpy_sample/common_types.py,sha256=9CjYHkUTx0ObduWh43fhsybZCTXvexql7qC2ptMDoek,377
@@ -56,7 +56,7 @@ ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul
 scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
 scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
 utils/compute_icon_mean_stddev.py,sha256=a1oWMRMnny39rV-dvu8rcx85sb4bXzPFrR1gkUr4Jpg,2296
-ocf_data_sampler-0.5.3.dist-info/METADATA,sha256=9gg1K9SNIX6pJ-PXQptutiLU9fo7FsnrKM6vdHbpQYg,12580
-ocf_data_sampler-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ocf_data_sampler-0.5.3.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
-ocf_data_sampler-0.5.3.dist-info/RECORD,,
+ocf_data_sampler-0.5.6.dist-info/METADATA,sha256=dOphQCwkuQjbwplFe3NDTSSc6Dw2z07KYQ4XtLgeGqo,12816
+ocf_data_sampler-0.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ocf_data_sampler-0.5.6.dist-info/top_level.txt,sha256=LEFU4Uk-PEo72QGLAfnVZIUEm37Q8mKuMeg_Xk-p33g,31
+ocf_data_sampler-0.5.6.dist-info/RECORD,,
--- a/ocf_data_sampler/load/open_tensorstore_zarrs.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""Open multiple zarrs with TensorStore.
-
-This extendds the functionality of xarray_tensorstore to open multiple zarr stores
-"""
-
-import os
-
-import tensorstore as ts
-import xarray as xr
-from xarray_tensorstore import (
-    _raise_if_mask_and_scale_used_for_data_vars,
-    _TensorStoreAdapter,
-    _zarr_spec_from_path,
-)
-
-
-def tensorstore_open_multi_zarrs(
-    paths: list[str],
-    data_vars: list[str],
-    concat_axes: list[int],
-    context: ts.Context,
-    write: bool,
-) -> dict[str, ts.TensorStore]:
-    """Open multiple zarrs with TensorStore.
-
-    Args:
-        paths: List of paths to zarr stores.
-        data_vars: List of data variable names to open.
-        concat_axes: List of axes along which to concatenate the data variables.
-        context: TensorStore context.
-        write: Whether to open the stores for writing.
-    """
-    arrays_list = []
-    for path in paths:
-        specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in data_vars}
-        array_futures = {
-            k: ts.open(spec, read=True, write=write, context=context)
-            for k, spec in specs.items()
-        }
-        arrays_list.append({k: v.result() for k, v in array_futures.items()})
-
-    arrays = {}
-    for k, axis in zip(data_vars, concat_axes, strict=False):
-        datasets = [d[k] for d in arrays_list]
-        arrays[k] = ts.concat(datasets, axis=axis)
-
-    return arrays
-
-
-def open_zarrs(
-    paths: list[str],
-    concat_dim: str,
-    *,
-    context: ts.Context | None = None,
-    mask_and_scale: bool = True,
-    write: bool = False,
-) -> xr.Dataset:
-    """Open multiple zarrs with TensorStore.
-
-    Args:
-        paths: List of paths to zarr stores.
-        concat_dim: Dimension along which to concatenate the data variables.
-        context: TensorStore context.
-        mask_and_scale: Whether to mask and scale the data.
-        write: Whether to open the stores for writing.
-    """
-    if context is None:
-        context = ts.Context()
-
-    ds = xr.open_mfdataset(
-        paths,
-        concat_dim=concat_dim,
-        combine="nested",
-        mask_and_scale=mask_and_scale,
-        decode_timedelta=True,
-    )
-
-    if mask_and_scale:
-        # Data variables get replaced below with _TensorStoreAdapter arrays, which
-        # don't get masked or scaled. Raising an error avoids surprising users with
-        # incorrect data values.
-        _raise_if_mask_and_scale_used_for_data_vars(ds)
-
-    data_vars = list(ds.data_vars)
-
-    concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars]
-
-    arrays = tensorstore_open_multi_zarrs(paths, data_vars, concat_axes, context, write)
-
-    new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()}
-
-    return ds.copy(data=new_data)
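
The key functional difference from the module deleted above: it delegated spec construction to xarray_tensorstore's private `_zarr_spec_from_path`, which has no notion of the zarr format, while the replacement builds the spec itself and picks the driver from the store's metadata. For a zarr v3 store the new code produces a spec along these lines (path and variable name are hypothetical, and this assumes `_DEFAULT_STORAGE_DRIVER` is the local "file" driver):

```python
# TensorStore spec built by the new _zarr_spec_from_path for one variable
# of a local zarr v3 store (illustrative values only):
spec = {
    "driver": "zarr3",  # f"zarr{zarr_format}" with zarr_format == 3
    "kvstore": {"driver": "file", "path": "data/sat_2023.zarr/variable"},
}
```

Other changes visible in the diff: multi-store opening now uses `xr.concat` with `data_vars="minimal"` and `compat="equals"` instead of `xr.open_mfdataset`, the `write` option is gone (stores are always opened read-only), and the `zip` over variables and axes is now `strict=True`.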