OceanDataStore 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OceanDataStore/__init__.py +21 -0
- OceanDataStore/catalog/__init__.py +12 -0
- OceanDataStore/catalog/oceandatacatalog.py +1242 -0
- OceanDataStore/catalog/stac/README.md +34 -0
- OceanDataStore/catalog/stac/__init__.py +30 -0
- OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
- OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
- OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
- OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
- OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
- OceanDataStore/catalog/stac/template_collection.py +85 -0
- OceanDataStore/catalog/stac/utils.py +476 -0
- OceanDataStore/cli/__init__.py +34 -0
- OceanDataStore/cli/arg_parser.py +182 -0
- OceanDataStore/cli/cli.py +203 -0
- OceanDataStore/cli/exceptions.py +83 -0
- OceanDataStore/cli/icechunk.py +888 -0
- OceanDataStore/cli/logging.py +52 -0
- OceanDataStore/cli/object_store.py +293 -0
- OceanDataStore/cli/utils.py +275 -0
- OceanDataStore/cli/zarr.py +870 -0
- OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
- OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
- OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
- OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
- OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
- OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
- OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
- OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
- OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
- OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
- OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
- OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
- OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
- OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
- OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
- OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
- OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
- OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
- OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
- OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
- OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
- OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
- OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
- OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
- OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
- OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
- OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
- OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
- OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
- OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
- OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
- OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
- OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
- OceanDataStore/data/utils.py +506 -0
- OceanDataStore/zarr.py +993 -0
- oceandatastore-0.3.0.dist-info/METADATA +184 -0
- oceandatastore-0.3.0.dist-info/RECORD +104 -0
- oceandatastore-0.3.0.dist-info/WHEEL +5 -0
- oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
- oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
- oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
- oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
- oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
# ===================================================================
|
|
2
|
+
# Copyright 2026 National Oceanography Centre
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0.
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
11
|
+
# implied. See the License for the specific language governing
|
|
12
|
+
# permissions and limitations under the License.
|
|
13
|
+
# ===================================================================
|
|
14
|
+
"""
|
|
15
|
+
zarr.py
|
|
16
|
+
|
|
17
|
+
Description:
|
|
18
|
+
This module defines functions to send and update data in Zarr stores in
|
|
19
|
+
cloud object storage.
|
|
20
|
+
|
|
21
|
+
Authors:
|
|
22
|
+
- Ollie Tooth
|
|
23
|
+
"""
|
|
24
|
+
# -- Import Python Modules -- #
|
|
25
|
+
import logging
|
|
26
|
+
import warnings
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
import dask
|
|
30
|
+
import numpy as np
|
|
31
|
+
import xarray as xr
|
|
32
|
+
from dask.distributed import Client, LocalCluster
|
|
33
|
+
|
|
34
|
+
from OceanDataStore.cli.exceptions import (
|
|
35
|
+
AppendDimensionError,
|
|
36
|
+
AppendDimensionSizeError,
|
|
37
|
+
ChunkSizeError,
|
|
38
|
+
DimensionNotFound,
|
|
39
|
+
DimensionSizeError,
|
|
40
|
+
ObjectNotFound,
|
|
41
|
+
)
|
|
42
|
+
from OceanDataStore.cli.object_store import ObjectStoreS3
|
|
43
|
+
from OceanDataStore.cli.utils import (
|
|
44
|
+
CaptureWarningsPlugin,
|
|
45
|
+
CloseClientSessionPlugin,
|
|
46
|
+
_preprocess_dataset,
|
|
47
|
+
timer,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ======== Define Zarr Validation Functions ======== #
|
|
52
|
+
def _check_zarr_store(
|
|
53
|
+
obj_store: ObjectStoreS3,
|
|
54
|
+
url: str
|
|
55
|
+
) -> bool:
|
|
56
|
+
"""
|
|
57
|
+
Check if a Zarr store exists at a specified URL path.
|
|
58
|
+
|
|
59
|
+
Parameters
|
|
60
|
+
----------
|
|
61
|
+
obj_store: ObjectStoreS3
|
|
62
|
+
ObjectStoreS3 remote filesystem.
|
|
63
|
+
url: str
|
|
64
|
+
URL path to Zarr store.
|
|
65
|
+
|
|
66
|
+
Returns
|
|
67
|
+
-------
|
|
68
|
+
bool
|
|
69
|
+
True if the store exists, False otherwise.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
return obj_store.exists(url.replace("s3://", ""))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _check_zarr_compatibility(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    rechunk: Optional[dict] = None,
    version: int = 3,
) -> None:
    """
    Check compatibility of DataArray or Dataset to update existing
    Zarr store in cloud object storage.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    ObjectNotFound
        If no object exists at ``url``.
    FileNotFoundError
        If the store cannot be opened with the requested Zarr version.
        The underlying error is attached as ``__cause__``.
    DimensionNotFound
        If a dimension of ``data`` (other than ``append_dim``) is absent
        from the store.
    DimensionSizeError
        If a dimension of ``data`` (other than ``append_dim``) differs in
        size from the same dimension in the store.
    AppendDimensionError
        If the first value of ``data`` along ``append_dim`` is smaller
        than the first value in the store.
    ChunkSizeError
        If a chunk size in ``rechunk`` differs from the first chunk length
        of that dimension in the store.
    """
    # 1. Check if the object exists:
    if not _check_zarr_store(obj_store=obj_store, url=url):
        raise ObjectNotFound(object_name=url)

    # 2. Check Zarr store compatibility:
    try:
        ds_store = xr.open_zarr(
            store=url,
            storage_options=obj_store.get_storage_options(set_async=True),
            zarr_format=version,
        )
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback:
        raise FileNotFoundError(f"zarr version {version} is not compatible with the store: {e}") from e

    # 3. Check if core dimensions exist & size are compatible:
    dims_data = {dim: data.sizes[dim] for dim in data.dims if dim != append_dim}
    for dim, size in dims_data.items():
        if dim not in ds_store.dims:
            raise DimensionNotFound(dim=dim, object_name=url)
        if size != ds_store.sizes[dim]:
            raise DimensionSizeError(dim=dim, size=size, expected_size=ds_store.sizes[dim])

    # 4. Check if append dimension values are compatible:
    # (the source must not start before the first value held in the store)
    if (data[append_dim][0] < ds_store[append_dim][0]):
        raise AppendDimensionError(dim=append_dim)

    # 5. Check if specified chunks are compatible:
    # (dimensions named in rechunk but absent from the store are ignored)
    if rechunk is not None:
        for dim in rechunk:
            if dim in ds_store.dims:
                if rechunk[dim] != ds_store.chunks[dim][0]:
                    raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ======== Define Zarr Writer Functions ======== #
|
|
137
|
+
def _write_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    version: int = 3,
) -> None:
    """
    Create a new Zarr store in cloud object storage from a DataArray
    or Dataset. Nothing is written if a store already exists at the URL.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to write to Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    version: int, default=3
        Zarr version to use.
    """
    # A DataArray is promoted to a Dataset; its name is passed to the timer:
    var = None
    if isinstance(data, xr.DataArray):
        var = data.name
        data = data.to_dataset()

    # Guard: leave an existing store untouched.
    if _check_zarr_store(obj_store=obj_store, url=url):
        logging.info(f"Skipping Variable: Store already exists at {url}")
        return

    with timer(action='send', dest=url, var=var), warnings.catch_warnings():
        # Silence consolidated metadata warnings for the duration of the write:
        warnings.simplefilter(action="ignore", category=UserWarning)
        data.to_zarr(
            store=url,
            storage_options=obj_store.get_storage_options(set_async=True),
            mode="w",
            zarr_format=version,
        )
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _append_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    version: int = 3,
) -> None:
    """
    Extend an existing Zarr store in cloud object storage by appending
    a DataArray or Dataset along one dimension.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to append to existing Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    version: int, default=3
        Zarr version to use.
    """
    with timer(action='append', dest=url), warnings.catch_warnings():
        # Silence consolidated metadata warnings for the duration of the write:
        warnings.simplefilter(action="ignore", category=UserWarning)
        write_kwargs = {
            "store": url,
            "storage_options": obj_store.get_storage_options(set_async=True),
            "append_dim": append_dim,
            "zarr_format": version,
        }
        data.to_zarr(**write_kwargs)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _replace_in_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    region: dict,
    version: int = 3,
) -> None:
    """
    Overwrite a region of an existing Zarr store in cloud object
    storage with the values of a DataArray or Dataset.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet holding the replacement values.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    region: dict
        Region of existing Zarr store to replace data. The first key
        is taken as the append dimension.
    version: int, default=3
        Zarr version to use.
    """
    # The first key of the region mapping names the append dimension:
    append_dim = list(region)[0]

    # Keep only the variables that span the append dimension:
    unrelated = [
        name
        for name, variable in data.variables.items()
        if append_dim not in variable.dims
    ]
    data = data.drop_vars(unrelated)

    with timer(action='replace', dest=url), warnings.catch_warnings():
        # Silence consolidated metadata warnings for the duration of the write:
        warnings.simplefilter(action="ignore", category=UserWarning)
        data.to_zarr(
            store=url,
            storage_options=obj_store.get_storage_options(set_async=True),
            region=region,
            zarr_format=version,
        )
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _update_zarr_store(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    rechunk: Optional[dict] = None,
    version: int = 3,
) -> None:
    """
    Update an existing Zarr store in object storage by replacing
    existing values and/or appending new values.

    A DataArray whose name is not yet a data variable of the store is
    added to the store as a new variable.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    AppendDimensionSizeError
        If some source values at or before the last target value along
        ``append_dim`` are missing from the target store.
    ValueError
        If the overlapping source values do not line up with one
        contiguous run of target values along ``append_dim``.
    """
    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        ds_source = data.to_dataset()
    else:
        var = None
        ds_source = data

    # Check source Dataset compatibility with existing store:
    _check_zarr_compatibility(
        data=ds_source,
        obj_store=obj_store,
        url=url,
        append_dim=append_dim,
        rechunk=rechunk,
        version=version,
    )
    logging.info(f"Passed Compatibility Checks for store {url}")

    ds_target = xr.open_zarr(
        store=url,
        storage_options=obj_store.get_storage_options(set_async=True),
        zarr_format=version,
    )

    if (var is not None) and (var not in ds_target.data_vars):
        # == Add new variable to Zarr Store == #
        # The store is known to exist at this point (the compatibility
        # check raises otherwise), so _write_to_zarr() would always skip.
        # Write in mode="a" instead, and only the variables the store does
        # not hold yet, so existing arrays and coordinates stay untouched.
        logging.info(f"Sending Variable {var}")
        already_stored = [name for name in ds_source.variables if name in ds_target.variables]
        with timer(action='send', dest=url, var=var), warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=UserWarning)
            ds_source.drop_vars(already_stored).to_zarr(
                store=url,
                storage_options=obj_store.get_storage_options(set_async=True),
                mode="a",
                zarr_format=version,
            )
        return

    # === Updating existing Zarr store === #
    # Extract source & target append dimension values:
    target_append_dim = ds_target[append_dim].values
    source_append_dim = ds_source[append_dim].values

    # Determine intersection between source & target append dimensions:
    intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)

    if intersect_append_dim.size == 0:
        # == No intersection -> append all source values to target store == #
        _append_to_zarr(
            data=ds_source,
            obj_store=obj_store,
            url=url,
            append_dim=append_dim,
            version=version,
        )
        return

    # == Intersection exists -> replace overlapping values in target store == #

    # Ensure all overlapping values exist along target append dimension:
    overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
    if intersect_append_dim.size != overlap_append_dim:
        raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)

    # Determine source and target append dimension indices of overlap.
    # The region spans exactly the overlapping values, so a source that
    # ends before the target does replaces only its own range rather
    # than a region running to the end of the store.
    n_overlap = int(overlap_append_dim)
    target_ind_min = np.flatnonzero(target_append_dim == source_append_dim[0])[0]
    target_ind_max = target_ind_min + n_overlap
    source_ind_min = 0
    source_ind_max = n_overlap
    source_ind_size = source_append_dim.size

    # A positional region write is only valid when the overlapping source
    # values match one contiguous run of target values:
    if not np.array_equal(
        target_append_dim[target_ind_min:target_ind_max],
        source_append_dim[source_ind_min:source_ind_max],
    ):
        raise ValueError(
            f"Values along {append_dim} do not match a contiguous range of the existing store {url}."
        )

    # 1. Replace overlapping values in target store:
    logging.info(f"Updating {url} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
    _replace_in_zarr(
        data=ds_source.isel({append_dim: slice(source_ind_min, source_ind_max)}),
        obj_store=obj_store,
        url=url,
        region={append_dim: slice(target_ind_min, target_ind_max)},
        version=version,
    )

    # 2. Append new values to target store:
    if source_ind_size > source_ind_max:
        logging.info(f"Appending to {url} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
        _append_to_zarr(
            data=ds_source.isel({append_dim: slice(source_ind_max, source_ind_size)}),
            obj_store=obj_store,
            url=url,
            append_dim=append_dim,
            version=version,
        )
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    """
    # === Verify Inputs === #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("file must be a list of strings, a string, or an xarray.Dataset.")
    if not isinstance(bucket, str):
        raise TypeError("bucket must be a string.")
    if not isinstance(object_prefix, str):
        raise TypeError("object_prefix must be a string.")
    if not isinstance(store_credentials_json, str):
        raise TypeError("store_credentials_json must be a string.")
    if variables is not None:
        if not isinstance(variables, list) or not all(isinstance(var, str) for var in variables):
            raise TypeError("variables must be a list of strings.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if grid_filepath is not None and not isinstance(grid_filepath, str):
        raise TypeError("grid_filepath must be a string.")
    if update_coords is not None and not isinstance(update_coords, dict):
        raise TypeError("update_coords must be a dictionary.")
    if rechunk is not None and not isinstance(rechunk, dict):
        raise TypeError("rechunk must be a dictionary.")
    if attrs is not None and not isinstance(attrs, dict):
        raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    if not isinstance(zarr_version, int):
        raise TypeError("zarr_version must be an integer.")

    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(
        anon=False,
        asynchronous=False,
        store_credentials_json=store_credentials_json,
    )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(
        file=file,
        rechunk=rechunk,
        append_dim=append_dim,
        update_coords=update_coords,
        grid_filepath=grid_filepath,
        attrs=attrs,
        parallel=parallel,
    )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)

        # === Send Dataset to Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Sending Dataset to {url}")
        _write_to_zarr(
            data=ds_filepath[variables],
            obj_store=obj_store,
            url=url,
            version=zarr_version,
        )
    finally:
        # Release resources to avoid memory leaks, also when the write fails:
        ds_filepath.close()
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client : Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage with
    option of using dask.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send. If None, all variables will be sent.
    append_dim: str, default="time_counter"
        Name of the append dimension, by default "time_counter".
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary, by default None.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. The client is shut down once the data
        has been sent.
    dask_config_kwargs: dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Ignored if dask client is provided.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
        Ignored if dask client is provided.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # Arguments shared by every execution mode; only `parallel` differs:
    send_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        zarr_version=zarr_version,
    )

    if client is not None:
        # === Send to Zarr store with existing Dask Client === #
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")

        # Register plugins: capture UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        # Register plugins: close aiohttp.ClientSessions:
        client.register_worker_plugin(CloseClientSessionPlugin())

        _send_to_zarr(parallel=True, **send_kwargs)

        # --- Shutdown Dask Client --- #
        client.shutdown()
        logging.info("Existing Dask Client has been shutdown.")

    elif dask_cluster_kwargs is not None:
        # === Send to Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client:
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as local_client:
            # 'n_workers' is optional for LocalCluster, so do not assume the
            # caller supplied it when composing the log message:
            n_workers = dask_cluster_kwargs.get('n_workers', 'default number of')
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {local_client.dashboard_link}")

            # Register plugins: capture UserWarnings when rechunking data:
            local_client.register_worker_plugin(CaptureWarningsPlugin())

            # Register plugins: close aiohttp.ClientSessions:
            local_client.register_worker_plugin(CloseClientSessionPlugin())

            _send_to_zarr(parallel=True, **send_kwargs)

            # --- Shutdown Store & Dask Cluster --- #
            cluster.close()
            local_client.shutdown()
            logging.info("Dask Cluster has been shutdown.")

    else:
        # === Send to Zarr store without Dask === #
        _send_to_zarr(parallel=False, **send_kwargs)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def _update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Update existing Zarr store in cloud object storage
    by replacing and/or appending data.

    The input is preprocessed into a single dataset, restricted to the
    requested variables that carry ``append_dim``, and handed to
    ``_update_zarr_store`` targeting ``s3://{bucket}/{object_prefix}``.
    The preprocessed dataset is always closed before returning, including
    when the update raises.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all data variables will be considered. In both cases,
        variables without the ``append_dim`` dimension are skipped.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument does not have the type documented above.
    """
    # === Verify Inputs === #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("file must be a list of strings, a string, or an xarray.Dataset.")
    if not isinstance(bucket, str):
        raise TypeError("bucket must be a string.")
    if not isinstance(object_prefix, str):
        raise TypeError("object_prefix must be a string.")
    if not isinstance(store_credentials_json, str):
        raise TypeError("store_credentials_json must be a string.")
    if variables is not None and not (
        isinstance(variables, list)
        and all(isinstance(var, str) for var in variables)
    ):
        raise TypeError("variables must be a list of strings.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if grid_filepath is not None and not isinstance(grid_filepath, str):
        raise TypeError("grid_filepath must be a string.")
    if update_coords is not None and not isinstance(update_coords, dict):
        raise TypeError("update_coords must be a dictionary.")
    if rechunk is not None and not isinstance(rechunk, dict):
        raise TypeError("rechunk must be a dictionary.")
    if attrs is not None and not isinstance(attrs, dict):
        raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    if not isinstance(zarr_version, int):
        raise TypeError("zarr_version must be an integer.")

    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    # Everything after the dataset has been opened runs under try/finally
    # so the dataset is released even if the update fails part-way.
    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)
        # Consider variables with append dimension only:
        variables = [var for var in variables if append_dim in ds_filepath[var].dims]

        # === Update Existing Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info("Updating Dataset at %s", url)
        _update_zarr_store(data=ds_filepath[variables],
                           obj_store=obj_store,
                           url=url,
                           append_dim=append_dim,
                           rechunk=rechunk,
                           version=zarr_version
                           )
    finally:
        # Release resources to avoid memory leaks:
        ds_filepath.close()
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client: Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Update data in existing Zarr store in cloud object
    storage with option of using dask.

    Three execution modes are selected from the arguments, in this order:

    1. ``client`` is given: the update runs in parallel on that client.
       The client is shut down once the update has completed.
    2. ``dask_cluster_kwargs`` is given: a temporary ``LocalCluster`` and
       ``Client`` are created for the update and torn down afterwards.
    3. Otherwise: the update runs without Dask.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Note that it is shut down by this
        function after the update has been written.
    dask_config_kwargs: Dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Ignored if dask client is provided.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
        Ignored if dask client is provided.
    zarr_version: int, default=3
        zarr version to use.
    """
    # Arguments shared by every execution mode; only `parallel` differs.
    update_kwargs = dict(file=file,
                         bucket=bucket,
                         object_prefix=object_prefix,
                         store_credentials_json=store_credentials_json,
                         variables=variables,
                         append_dim=append_dim,
                         grid_filepath=grid_filepath,
                         update_coords=update_coords,
                         rechunk=rechunk,
                         attrs=attrs,
                         zarr_version=zarr_version
                         )

    if client is not None:
        logging.info("Using existing Dask Cluster @ Client: %s", client.dashboard_link)

        # Register plugins: capture UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        # Register plugins: close aiohttp.ClientSessions:
        client.register_worker_plugin(CloseClientSessionPlugin())

        _update_zarr(parallel=True, **update_kwargs)

        # --- Shutdown Dask Client --- #
        client.shutdown()
        logging.info("Existing Dask Client has been shutdown.")

    elif dask_cluster_kwargs is not None:
        # === Update Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client. Both are context managers, so
        # they are closed on exit from the block even if the update raises.
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as local_client:
            # 'n_workers' is optional for LocalCluster: fall back to the
            # number of workers actually started rather than raising KeyError.
            n_workers = dask_cluster_kwargs.get("n_workers", len(cluster.workers))
            logging.info(
                "Created LocalCluster with %s workers @ Client: %s",
                n_workers,
                local_client.dashboard_link,
            )

            # Register plugins: capture UserWarnings when rechunking data:
            local_client.register_worker_plugin(CaptureWarningsPlugin())

            # Register plugins: close aiohttp.ClientSessions:
            local_client.register_worker_plugin(CloseClientSessionPlugin())

            _update_zarr(parallel=True, **update_kwargs)

        # --- Store & Dask Cluster closed by the context managers --- #
        logging.info("Dask Cluster has been shutdown.")

    else:
        # === Update Zarr store without Dask === #
        _update_zarr(parallel=False, **update_kwargs)
|