OceanDataStore 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OceanDataStore/__init__.py +21 -0
- OceanDataStore/catalog/__init__.py +12 -0
- OceanDataStore/catalog/oceandatacatalog.py +1242 -0
- OceanDataStore/catalog/stac/README.md +34 -0
- OceanDataStore/catalog/stac/__init__.py +30 -0
- OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
- OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
- OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
- OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
- OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
- OceanDataStore/catalog/stac/template_collection.py +85 -0
- OceanDataStore/catalog/stac/utils.py +476 -0
- OceanDataStore/cli/__init__.py +34 -0
- OceanDataStore/cli/arg_parser.py +182 -0
- OceanDataStore/cli/cli.py +203 -0
- OceanDataStore/cli/exceptions.py +83 -0
- OceanDataStore/cli/icechunk.py +888 -0
- OceanDataStore/cli/logging.py +52 -0
- OceanDataStore/cli/object_store.py +293 -0
- OceanDataStore/cli/utils.py +275 -0
- OceanDataStore/cli/zarr.py +870 -0
- OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
- OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
- OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
- OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
- OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
- OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
- OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
- OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
- OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
- OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
- OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
- OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
- OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
- OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
- OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
- OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
- OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
- OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
- OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
- OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
- OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
- OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
- OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
- OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
- OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
- OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
- OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
- OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
- OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
- OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
- OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
- OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
- OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
- OceanDataStore/data/utils.py +506 -0
- OceanDataStore/zarr.py +993 -0
- oceandatastore-0.3.0.dist-info/METADATA +184 -0
- oceandatastore-0.3.0.dist-info/RECORD +104 -0
- oceandatastore-0.3.0.dist-info/WHEEL +5 -0
- oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
- oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
- oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
- oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
- oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
OceanDataStore/zarr.py
ADDED
|
@@ -0,0 +1,993 @@
|
|
|
1
|
+
"""
|
|
2
|
+
zarr.py
|
|
3
|
+
|
|
4
|
+
Description:
|
|
5
|
+
This module defines the functions to send and update data
|
|
6
|
+
to an object store.
|
|
7
|
+
|
|
8
|
+
Authors:
|
|
9
|
+
- Ollie Tooth
|
|
10
|
+
- Tobias Ferreira
|
|
11
|
+
- Joao Morado
|
|
12
|
+
"""
|
|
13
|
+
# -- Import Python Modules -- #
|
|
14
|
+
import glob
|
|
15
|
+
import time
|
|
16
|
+
import logging
|
|
17
|
+
import warnings
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import xarray as xr
|
|
22
|
+
|
|
23
|
+
import dask
|
|
24
|
+
from dask.distributed import Client, LocalCluster
|
|
25
|
+
from dask.distributed.diagnostics.plugin import WorkerPlugin
|
|
26
|
+
|
|
27
|
+
# -- Import OceanDataStore Modules -- #
|
|
28
|
+
from .object_store import ObjectStoreS3
|
|
29
|
+
|
|
30
|
+
from .exceptions import (
|
|
31
|
+
ObjectNotFound,
|
|
32
|
+
DimensionNotFound,
|
|
33
|
+
DimensionSizeError,
|
|
34
|
+
AppendDimensionError,
|
|
35
|
+
AppendDimensionSizeError,
|
|
36
|
+
ChunkSizeError,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# -- Define WorkerPlugin -- #
|
|
40
|
+
class CaptureWarningsPlugin(WorkerPlugin):
    """
    Worker plugin whose setup / teardown hooks switch the redirection
    of Python warnings into the ``logging`` system on and off.
    """

    def setup(self, worker):
        """Start redirecting warnings to logging when the plugin is set up."""
        # Used to catch UserWarnings when rechunking:
        logging.captureWarnings(capture=True)

    def teardown(self, worker):
        """Stop redirecting warnings to logging when the plugin is torn down."""
        logging.captureWarnings(capture=False)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# -- Define timing context manager -- #
|
|
49
|
+
class timer():
    """
    Timer context manager class to log the time taken to write
    variables & datasets to an object store.

    On a clean exit of the ``with`` block a "Completed: ..." message
    including the elapsed wall-clock time is logged at INFO level. If
    the block raises, no completion message is logged and the exception
    propagates unchanged.

    Parameters
    ----------
    action : str
        Action to be performed. Options are 'send', 'replace' or 'append'.
    url : str
        URL path to Zarr store or Icechunk repository.
    var : Optional[str], default=None
        Name of variable to be sent, replaced or appended in the store.
        If None, the log message refers to the dataset as a whole.

    Raises
    ------
    ValueError
        If ``action`` is not one of 'send', 'replace' or 'append'.
    """
    def __init__(self, action: str, url: str, var: Optional[str] = None) -> None:
        # Build the past-tense phrase used in the completion log message:
        if action == 'send':
            if var is not None:
                self.action = f'Sent {var} to'
            else:
                self.action = 'Sent dataset to'
        elif action == 'replace':
            if var is not None:
                self.action = f'Updated {var} in'
            else:
                self.action = 'Updated'
        elif action == 'append':
            if var is not None:
                self.action = f'Appended {var} to'
            else:
                self.action = 'Appended to'
        else:
            raise ValueError("Invalid action: must be 'send', 'replace' or 'append'.")
        self.url = url

    def __enter__(self):
        self.t_start = time.time()
        # Return the instance so that `with timer(...) as t:` binds the timer:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.t_end = time.time()
        # Only report completion when the timed block did not raise:
        if exc_type is None:
            logging.info(
                f"Completed: {self.action} store {self.url} in {(self.t_end - self.t_start):.2f} seconds"
            )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# -- Define OceanDataStore Core Functions -- #
|
|
96
|
+
def _check_zarr_store(obj_store: ObjectStoreS3,
                      url: str
                      ) -> bool:
    """
    Report whether a Zarr store is present at the given URL path.

    Parameters
    ----------
    obj_store
        ObjectStoreS3 remote filesystem.
    url
        URL path to Zarr store. Any "s3://" occurrence is removed
        before the path is passed to the filesystem.

    Returns
    -------
    bool
        Result of ``obj_store.exists`` for the path without the
        "s3://" scheme.
    """
    # The filesystem is queried with the scheme-less bucket/key path:
    store_path = url.replace("s3://", "")
    return obj_store.exists(store_path)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _check_zarr_compatibility(data: xr.DataArray | xr.Dataset,
                              obj_store: ObjectStoreS3,
                              url: str,
                              append_dim: str = "time_counter",
                              rechunk: Optional[dict] = None,
                              version: int = 3,
                              ) -> None:
    """
    Check compatibility of DataArray or Dataset to update existing
    Zarr store in cloud object storage.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    ObjectNotFound
        If no store exists at ``url``.
    FileNotFoundError
        If the store cannot be opened with the requested Zarr version.
    DimensionNotFound
        If a non-append dimension of ``data`` is missing from the store.
    DimensionSizeError
        If a non-append dimension differs in size from the store.
    AppendDimensionError
        If the first append dimension value of ``data`` precedes the
        first append dimension value of the store.
    ChunkSizeError
        If a requested chunk size differs from the first chunk size of
        the store along the same dimension.
    """
    # 1. Check if the store exists:
    # NOTE: _check_zarr_store() takes `url`; passing `path=` raised TypeError.
    if not _check_zarr_store(obj_store=obj_store, url=url):
        raise ObjectNotFound(object_name=url)

    # 2. Check Zarr store compatibility:
    try:
        ds_store = xr.open_zarr(store=url,
                                storage_options=obj_store.get_remote_options(),
                                zarr_format=version
                                )
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback:
        raise FileNotFoundError(f"zarr version {version} is not compatible with the store: {e}") from e

    # 3. Check if core dimensions exist & size are compatible:
    dims_data = {dim: data.sizes[dim] for dim in data.dims if dim != append_dim}
    for dim, size in dims_data.items():
        if dim not in ds_store.dims:
            raise DimensionNotFound(dim=dim, object_name=url)
        if size != ds_store.sizes[dim]:
            raise DimensionSizeError(dim=dim, size=size, expected_size=ds_store.sizes[dim])

    # 4. Check if append dimension values are compatible:
    if (data[append_dim][0] < ds_store[append_dim][0]):
        raise AppendDimensionError(dim=append_dim)

    # 5. Check if specified chunks are compatible:
    if rechunk is not None:
        for dim in rechunk:
            # Dimensions absent from the store are not checked here:
            if dim in ds_store.dims:
                if rechunk[dim] != ds_store.chunks[dim][0]:
                    raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _write_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    version: int = 3,
) -> None:
    """
    Write DataArray or Dataset to Zarr store in cloud
    object storage.

    If a store already exists at ``url`` nothing is written and an
    informational message is logged instead.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to write to Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    """
    # === Verify Inputs === #
    if not isinstance(data, (xr.DataArray, xr.Dataset)):
        raise TypeError("data must be a DataArray or Dataset.")
    if not isinstance(obj_store, ObjectStoreS3):
        raise TypeError("obj_store must be an ObjectStoreS3 instance.")
    if not isinstance(url, str):
        raise TypeError("url must be a string.")
    if not isinstance(version, int):
        raise TypeError("version must be an integer.")

    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        data = data.to_dataset()
    else:
        var = None

    # Write Dataset to Zarr store in Object Store:
    # NOTE: _check_zarr_store() takes `url`; passing `path=` raised TypeError.
    if _check_zarr_store(obj_store=obj_store, url=url):
        logging.info(f"Skipping Variable: Store already exists at {url}")

    else:
        with timer(action='send', url=url, var=var):
            # Catch consolidated metadata warnings:
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=UserWarning)
                data.to_zarr(store=url,
                             storage_options=obj_store.get_remote_options(),
                             mode="w",
                             zarr_format=version
                             )
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _append_to_zarr(data: xr.DataArray | xr.Dataset,
                    obj_store: ObjectStoreS3,
                    url: str,
                    append_dim: str = "time_counter",
                    version: int = 3,
                    ) -> None:
    """
    Extend an existing Zarr store in cloud object storage with
    the given data along the append dimension.

    The write is timed with the ``timer`` context manager and
    UserWarnings raised during the write are suppressed.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        Data to append to the existing Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension along which data is appended to the store.
    version: int, default=3
        Zarr version to use.
    """
    with timer(action='append', url=url), warnings.catch_warnings():
        # Silence consolidated metadata warnings for the duration of the write:
        warnings.simplefilter("ignore", UserWarning)
        write_kwargs = {
            "store": url,
            "storage_options": obj_store.get_remote_options(),
            "append_dim": append_dim,
            "zarr_format": version,
        }
        data.to_zarr(**write_kwargs)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _replace_in_zarr(data: xr.DataArray | xr.Dataset,
                     obj_store: ObjectStoreS3,
                     url: str,
                     region: dict,
                     version: int = 3,
                     ) -> None:
    """
    Overwrite a region of an existing Zarr store in cloud object
    storage with the given data.

    Variables that do not span the first dimension named in
    ``region`` are dropped before writing. The write is timed with
    the ``timer`` context manager and UserWarnings raised during
    the write are suppressed.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        Data used to replace values in the existing Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    region: dict
        Region of existing Zarr store to replace data. Its first
        key is taken as the append dimension.
    version: int, default=3
        Zarr version to use.
    """
    # The first key of the region mapping names the append dimension:
    append_dim = list(region)[0]

    # Keep only variables defined along the append dimension:
    vars_without_dim = [
        name for name, variable in data.variables.items()
        if append_dim not in variable.dims
    ]
    data = data.drop_vars(vars_without_dim)

    with timer(action='replace', url=url), warnings.catch_warnings():
        # Silence consolidated metadata warnings for the duration of the write:
        warnings.simplefilter("ignore", UserWarning)
        write_kwargs = {
            "store": url,
            "storage_options": obj_store.get_remote_options(),
            "region": region,
            "zarr_format": version,
        }
        data.to_zarr(**write_kwargs)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _update_zarr_store(data: xr.DataArray | xr.Dataset,
                       obj_store: ObjectStoreS3,
                       url: str,
                       append_dim: str = "time_counter",
                       rechunk: Optional[dict] = None,
                       version: int = 3,
                       ) -> None:
    """
    Update an existing Zarr store in object storage by replacing
    existing values and/or appending new values.

    Source values whose append dimension coordinates already exist in
    the store replace the stored values; any remaining source values
    are appended. If no coordinates overlap, all source values are
    appended.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    AppendDimensionSizeError
        If some source values at or before the last stored append
        dimension value do not exist in the store.
    """
    # === Verify Inputs === #
    if not isinstance(data, (xr.DataArray, xr.Dataset)):
        raise TypeError("data must be a DataArray or Dataset.")
    if not isinstance(obj_store, ObjectStoreS3):
        raise TypeError("obj_store must be an ObjectStoreS3 instance.")
    if not isinstance(url, str):
        raise TypeError("url must be a string.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if rechunk is not None:
        if not isinstance(rechunk, dict):
            raise TypeError("rechunk must be a dictionary.")
    if not isinstance(version, int):
        raise TypeError("version must be an integer.")

    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        ds_source = data.to_dataset()
    else:
        var = None
        ds_source = data

    # Check source Dataset compatibility with existing store:
    _check_zarr_compatibility(data=ds_source,
                              obj_store=obj_store,
                              url=url,
                              append_dim=append_dim,
                              rechunk=rechunk,
                              version=version
                              )
    logging.info(f"Passed Compatibility Checks for store {url}")

    # === Update existing variable in Zarr Store === #
    ds_target = xr.open_zarr(store=url,
                             storage_options=obj_store.get_remote_options(),
                             zarr_format=version
                             )

    if (var in ds_target.data_vars) or (var is None):

        # === Updating existing Zarr store === #
        # Extract source & target append dimension values:
        target_append_dim = ds_target[append_dim].values
        source_append_dim = ds_source[append_dim].values

        # Determine intersection between source & target append dimensions:
        intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)

        if intersect_append_dim.size != 0:
            # == Intersection exists -> replace overlapping values in target store == #

            # Ensure all overlapping values exist along target append dimension:
            overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
            if intersect_append_dim.size != overlap_append_dim:
                raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)

            # Determine source and target append dimension indices of overlap.
            # The replaced window is clamped to the source length so that the
            # region and the data written to it always have the same size, also
            # when the source ends before the end of the target store.
            # NOTE(review): assumes both coordinates are sorted and the stored
            # values inside the window match the source one-to-one - confirm.
            source_ind_size = source_append_dim.size
            target_ind_min = np.flatnonzero(target_append_dim == source_append_dim[0])[0]
            source_ind_min = 0
            source_ind_max = min(source_ind_size, target_append_dim.size - target_ind_min)
            target_ind_max = target_ind_min + source_ind_max

            # 1. Replace overlapping values in target store:
            logging.info(f"Updating {url} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
            _replace_in_zarr(data=ds_source.isel({append_dim: slice(source_ind_min, source_ind_max)}),
                             obj_store=obj_store,
                             url=url,
                             region={append_dim: slice(target_ind_min, target_ind_max)},
                             version=version,
                             )

            # 2. Append new values to target store:
            if source_ind_size > source_ind_max:
                logging.info(f"Appending to {url} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
                _append_to_zarr(data=ds_source.isel({append_dim: slice(source_ind_max, source_ind_size)}),
                                obj_store=obj_store,
                                url=url,
                                append_dim=append_dim,
                                version=version,
                                )

        else:
            # == No intersection -> append all source values to target store == #
            _append_to_zarr(data=ds_source,
                            obj_store=obj_store,
                            url=url,
                            append_dim=append_dim,
                            version=version,
                            )
    else:
        # == Add new variable to Zarr Store == #
        # NOTE(review): _write_to_zarr() skips writing when a store already
        # exists at `url`, and the compatibility check above requires the store
        # to exist, so this branch appears to only log a skip message - confirm
        # whether new variables should be written to the existing store instead.
        logging.info(f"Sending Variable {var}")
        _write_to_zarr(data=ds_source,
                       obj_store=obj_store,
                       url=url,
                       version=version,
                       )
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _preprocess_dataset(file: list[str] | str | xr.Dataset,
                        rechunk: Optional[dict] = None,
                        append_dim: str = "time_counter",
                        update_coords: Optional[dict] = None,
                        grid_filepath: Optional[str] = None,
                        attrs: Optional[dict] = None,
                        parallel: bool = False,
                        ) -> xr.Dataset:
    """
    Preprocess the dataset to be sent to the object store.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions. If None, dask chunks
        will be set to on-disk chunks.
    append_dim: str, default='time_counter'
        Name of the dimension to append multi-file datasets.
    update_coords: Optional[dict], default=None
        Mapping of coordinate variables to update using model
        grid file. Keys are coordinate variable names in the
        dataset to be sent, and values are the corresponding
        variable names in the model grid file. If None, no
        coordinates will be updated.
    grid_filepath: Optional[str], default=None
        Filepath to the model grid file to update coordinate
        variables. Required if update_coords is not None.
    attrs: Optional[dict], default=None
        Dictionary of attributes to add to the dataset.
        If None, no attributes will be added.
    parallel: bool, default=False
        Whether to open and preprocess the dataset in parallel
        using `dask.delayed`.

    Returns
    -------
    xr.Dataset
        Preprocessed (multifile) dataset with optionally
        updated coordinates, chunksizes and attributes.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    ValueError
        If a filepath does not end with '.nc', if an empty list of
        filepaths is given, or if update_coords is given without
        grid_filepath.
    FileNotFoundError
        If a wildcard pattern matches no files.
    """
    # == Verify Inputs == #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("filepaths must be a list, a string or an xarray Dataset.")
    if isinstance(file, list):
        for fpath in file:
            if not isinstance(fpath, str):
                raise TypeError("filepaths must be a list of strings.")
            if not fpath.endswith('.nc'):
                raise ValueError("Invalid file extension: only .nc files are supported.")
    elif isinstance(file, str):
        if not file.endswith('.nc'):
            raise ValueError("Invalid file extension: only .nc files are supported.")
    if rechunk is not None:
        if not isinstance(rechunk, dict):
            raise TypeError("rechunk must be a dictionary.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if update_coords is not None:
        if not isinstance(update_coords, dict):
            raise TypeError("update_coords must be a dictionary.")
    if grid_filepath is not None:
        if not isinstance(grid_filepath, str):
            raise TypeError("grid_filepath must be a string.")
    if attrs is not None:
        if not isinstance(attrs, dict):
            raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")

    # === Load netCDF dataset === #
    if rechunk is None:
        # Default to dask chunks equal to on-disk chunks:
        rechunk = {}

    if isinstance(file, xr.Dataset):
        # Use input dataset:
        ds_filepath = file.chunk(rechunk)
    else:
        # File names from str / regular expression:
        if isinstance(file, str):
            if '*' in file:
                filepaths = sorted(glob.glob(file))
                if len(filepaths) == 0:
                    # Report the pattern that was searched, not the empty result:
                    raise FileNotFoundError(f"No files found at {file}")
            else:
                filepaths = [file]
        # File names from list:
        else:
            filepaths = file
            if len(filepaths) == 0:
                raise ValueError("Invalid filepaths: list of filepaths must not be empty.")

        # Open multi-file dataset:
        if len(filepaths) > 1:
            ds_filepath = xr.open_mfdataset(filepaths,
                                            engine='h5netcdf',
                                            chunks=rechunk,
                                            parallel=parallel,
                                            concat_dim=append_dim,
                                            combine='nested',
                                            data_vars='minimal',
                                            coords='minimal',
                                            compat='override'
                                            )
        else:
            # Open single file dataset:
            ds_filepath = xr.open_dataset(filepaths[0], chunks=rechunk)

    # === Update coordinates using model grid file === #
    if update_coords is not None:
        if grid_filepath is None:
            raise ValueError(
                "grid_filepath must be specified to update coordinate variables."
                )
        ds_grid = xr.open_dataset(grid_filepath)
        # Update coordinate vars using model grid file:
        for key, grid_var in update_coords.items():
            coord_data = ds_grid[grid_var].squeeze(drop=True)
            # Rechunk only the dimensions with user specified chunks; indexing
            # rechunk[dim] for every dimension raised KeyError whenever a
            # coordinate dimension was not listed (always so for rechunk=None):
            coord_chunks = {dim: rechunk[dim] for dim in coord_data.dims if dim in rechunk}
            if coord_chunks:
                coord_data = coord_data.chunk(coord_chunks)
            ds_filepath = ds_filepath.assign_coords({key: coord_data})
        logging.info('Completed: Updated coordinate variables.')

    # === Update Attributes === #
    if attrs is not None:
        ds_filepath = ds_filepath.assign_attrs(attrs)

    return ds_filepath
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage.

    The store is written to ``s3://{bucket}/{object_prefix}``.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False,
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # === Initialise Asynchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=True,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)

        # === Send Dataset to Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Sending Dataset to {url}")
        _write_to_zarr(data=ds_filepath[variables],
                       obj_store=obj_store,
                       url=url,
                       version=zarr_version
                       )
    finally:
        # Release resources to avoid memory leaks, also when the write fails:
        ds_filepath.close()
|
|
654
|
+
|
|
655
|
+
def send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client: Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage with
    option of using dask.

    One of three execution paths is taken, checked in this order:

    1. ``dask_cluster_kwargs`` is given: a new ``LocalCluster`` and
       ``Client`` are created for the duration of the write
       (any ``client`` argument is ignored).
    2. ``client`` is given: the existing client is used and
       ``client.shutdown()`` is called once the write has finished.
    3. Neither is given: the write is performed without dask
       (``parallel=False``).

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send. If None, all variables will be sent.
    append_dim: str, default="time_counter"
        Name of the append dimension, by default "time_counter".
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary, by default None.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Only used when ``dask_cluster_kwargs``
        is None.
    dask_config_kwargs: dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Only applied when ``dask_cluster_kwargs`` is also given.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
    zarr_version: int, default=3
        Zarr version to use.
    """
    # Keyword arguments shared by every execution path; only `parallel`
    # differs between the dask and non-dask branches.
    send_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        zarr_version=zarr_version,
    )

    if dask_cluster_kwargs is not None:
        # === Send to Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client:
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
            # 'n_workers' is optional for LocalCluster, so do not assume
            # the caller supplied it when building the log message.
            n_workers = dask_cluster_kwargs.get("n_workers", "default number of")
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {client.dashboard_link}")

            # Catch UserWarnings when rechunking data:
            client.register_worker_plugin(CaptureWarningsPlugin())

            _send_to_zarr(**send_kwargs, parallel=True)

            # --- Shutdown Store & Dask Cluster --- #
            cluster.close()
            client.shutdown()
            logging.info("Dask Cluster has been shutdown.")

    elif client is not None:
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _send_to_zarr(**send_kwargs, parallel=True)

        # --- Shutdown Store & Dask Cluster --- #
        # No local `cluster` object exists in this branch (the client was
        # supplied by the caller), so only the client is asked to shut down.
        client.shutdown()
        logging.info("Existing Dask Cluster has been shutdown.")

    else:
        # === Send to Zarr store without Dask === #
        _send_to_zarr(**send_kwargs, parallel=False)
|
|
780
|
+
|
|
781
|
+
def _update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Update existing Zarr store in cloud object storage
    by replacing and/or appending data.

    Only variables which have ``append_dim`` among their dimensions
    are passed on to the store update.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # === Initialise Asynchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(
        anon=False,
        asynchronous=True,
        store_credentials_json=store_credentials_json,
    )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(
        file=file,
        rechunk=rechunk,
        append_dim=append_dim,
        update_coords=update_coords,
        grid_filepath=grid_filepath,
        attrs=attrs,
        parallel=parallel,
    )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)
        # Consider variables with append dimension only:
        variables = [var for var in variables if append_dim in ds_filepath[var].dims]

        # === Update Existing Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Updating Dataset at {url}")
        _update_zarr_store(
            data=ds_filepath[variables],
            obj_store=obj_store,
            url=url,
            append_dim=append_dim,
            rechunk=rechunk,
            version=zarr_version,
        )
    finally:
        # Release resources to avoid memory leaks, even when variable
        # selection or the store update raises:
        ds_filepath.close()
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
def update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client: Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Update data in existing Zarr store in cloud object
    storage with option of using dask.

    One of three execution paths is taken, checked in this order:

    1. ``dask_cluster_kwargs`` is given: a new ``LocalCluster`` and
       ``Client`` are created for the duration of the update
       (any ``client`` argument is ignored).
    2. ``client`` is given: the existing client is used and
       ``client.shutdown()`` is called once the update has finished.
    3. Neither is given: the update is performed without dask
       (``parallel=False``).

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Only used when ``dask_cluster_kwargs``
        is None.
    dask_config_kwargs: dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Only applied when ``dask_cluster_kwargs`` is also given.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
    zarr_version: int, default=3
        Zarr version to use.
    """
    # Keyword arguments shared by every execution path; only `parallel`
    # differs between the dask and non-dask branches.
    update_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        zarr_version=zarr_version,
    )

    if dask_cluster_kwargs is not None:
        # === Update Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client:
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
            # 'n_workers' is optional for LocalCluster, so do not assume
            # the caller supplied it when building the log message.
            n_workers = dask_cluster_kwargs.get("n_workers", "default number of")
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {client.dashboard_link}")

            # Catch UserWarnings when rechunking data:
            client.register_worker_plugin(CaptureWarningsPlugin())

            _update_zarr(**update_kwargs, parallel=True)

            # --- Shutdown Store & Dask Cluster --- #
            cluster.close()
            client.shutdown()
            logging.info("Dask Cluster has been shutdown.")

    elif client is not None:
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _update_zarr(**update_kwargs, parallel=True)

        # --- Shutdown Store & Dask Cluster --- #
        # No local `cluster` object exists in this branch (the client was
        # supplied by the caller), so only the client is asked to shut down.
        client.shutdown()
        logging.info("Existing Dask Cluster has been shutdown.")

    else:
        # === Update Zarr store without Dask === #
        _update_zarr(**update_kwargs, parallel=False)
|