OceanDataStore 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. OceanDataStore/__init__.py +21 -0
  2. OceanDataStore/catalog/__init__.py +12 -0
  3. OceanDataStore/catalog/oceandatacatalog.py +1242 -0
  4. OceanDataStore/catalog/stac/README.md +34 -0
  5. OceanDataStore/catalog/stac/__init__.py +30 -0
  6. OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
  7. OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
  8. OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
  9. OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
  10. OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
  11. OceanDataStore/catalog/stac/template_collection.py +85 -0
  12. OceanDataStore/catalog/stac/utils.py +476 -0
  13. OceanDataStore/cli/__init__.py +34 -0
  14. OceanDataStore/cli/arg_parser.py +182 -0
  15. OceanDataStore/cli/cli.py +203 -0
  16. OceanDataStore/cli/exceptions.py +83 -0
  17. OceanDataStore/cli/icechunk.py +888 -0
  18. OceanDataStore/cli/logging.py +52 -0
  19. OceanDataStore/cli/object_store.py +293 -0
  20. OceanDataStore/cli/utils.py +275 -0
  21. OceanDataStore/cli/zarr.py +870 -0
  22. OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
  23. OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
  24. OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
  25. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
  26. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
  27. OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
  28. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
  29. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
  30. OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
  31. OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
  32. OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
  33. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
  34. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  35. OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  36. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
  37. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
  38. OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
  39. OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
  40. OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
  41. OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
  42. OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
  43. OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
  44. OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
  45. OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
  46. OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
  47. OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
  48. OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
  49. OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
  50. OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
  51. OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
  52. OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
  53. OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
  54. OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
  55. OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
  56. OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
  57. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
  58. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
  59. OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
  60. OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
  61. OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
  62. OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
  63. OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
  64. OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
  65. OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
  66. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
  67. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
  68. OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
  69. OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
  70. OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
  71. OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
  72. OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
  73. OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
  74. OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
  75. OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
  76. OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
  77. OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
  78. OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
  79. OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
  80. OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
  81. OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
  82. OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
  83. OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
  84. OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
  85. OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
  86. OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
  87. OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
  88. OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
  89. OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
  90. OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
  91. OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
  92. OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
  93. OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
  94. OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
  95. OceanDataStore/data/utils.py +506 -0
  96. OceanDataStore/zarr.py +993 -0
  97. oceandatastore-0.3.0.dist-info/METADATA +184 -0
  98. oceandatastore-0.3.0.dist-info/RECORD +104 -0
  99. oceandatastore-0.3.0.dist-info/WHEEL +5 -0
  100. oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
  101. oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
  102. oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
  103. oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
  104. oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,870 @@
1
+ # ===================================================================
2
+ # Copyright 2026 National Oceanography Centre
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0.
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11
+ # implied. See the License for the specific language governing
12
+ # permissions and limitations under the License.
13
+ # ===================================================================
14
+ """
15
+ zarr.py
16
+
17
+ Description:
18
+ This module defines functions to send and update data in Zarr stores in
19
+ cloud object storage.
20
+
21
+ Authors:
22
+ - Ollie Tooth
23
+ """
24
+ # -- Import Python Modules -- #
25
+ import logging
26
+ import warnings
27
+ from typing import Optional
28
+
29
+ import dask
30
+ import numpy as np
31
+ import xarray as xr
32
+ from dask.distributed import Client, LocalCluster
33
+
34
+ from OceanDataStore.cli.exceptions import (
35
+ AppendDimensionError,
36
+ AppendDimensionSizeError,
37
+ ChunkSizeError,
38
+ DimensionNotFound,
39
+ DimensionSizeError,
40
+ ObjectNotFound,
41
+ )
42
+ from OceanDataStore.cli.object_store import ObjectStoreS3
43
+ from OceanDataStore.cli.utils import (
44
+ CaptureWarningsPlugin,
45
+ CloseClientSessionPlugin,
46
+ _preprocess_dataset,
47
+ timer,
48
+ )
49
+
50
+
51
+ # ======== Define Zarr Validation Functions ======== #
52
def _check_zarr_store(
    obj_store: ObjectStoreS3,
    url: str
) -> bool:
    """
    Report whether an object already exists at the location of a Zarr store.

    Parameters
    ----------
    obj_store: ObjectStoreS3
        Object store filesystem, queried through its ``exists`` method.
    url: str
        URL path to the Zarr store, e.g. ``s3://bucket/prefix``.

    Returns
    -------
    bool
        Result of ``obj_store.exists`` for the URL with every ``s3://``
        substring removed.
    """
    # The filesystem is queried with a bare "bucket/key" path, so the
    # protocol marker is stripped from the URL first.
    object_path = url.replace("s3://", "")
    store_found = obj_store.exists(object_path)
    return store_found
73
+
74
+
75
def _check_zarr_compatibility(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    rechunk: Optional[dict] = None,
    version: int = 3,
) -> None:
    """
    Check compatibility of DataArray or Dataset to update existing
    Zarr store in cloud object storage.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    ObjectNotFound
        If no object exists at ``url``.
    FileNotFoundError
        If the store cannot be opened with the requested Zarr version.
        The underlying exception is attached as ``__cause__``.
    DimensionNotFound
        If a non-append dimension of ``data`` is missing from the store.
    DimensionSizeError
        If a non-append dimension of ``data`` differs in size from the store.
    AppendDimensionError
        If the first append-dimension value of ``data`` precedes the first
        append-dimension value of the store.
    ChunkSizeError
        If a requested chunk size differs from the first chunk size of the
        store along a dimension present in the store.
    """
    # 1. Check if the object exists:
    if not _check_zarr_store(obj_store=obj_store, url=url):
        raise ObjectNotFound(object_name=url)

    # 2. Check Zarr store compatibility:
    try:
        ds_store = xr.open_zarr(store=url,
                                storage_options=obj_store.get_storage_options(set_async=True),
                                zarr_format=version
                                )
    except Exception as e:
        # Chain the original error so the real failure is kept in the traceback.
        raise FileNotFoundError(f"zarr version {version} is not compatible with the store: {e}") from e

    try:
        # 3. Check if core dimensions exist & size are compatible:
        for dim in data.dims:
            if dim == append_dim:
                # The append dimension is allowed to differ in length.
                continue
            if dim not in ds_store.dims:
                raise DimensionNotFound(dim=dim, object_name=url)
            if data.sizes[dim] != ds_store.sizes[dim]:
                raise DimensionSizeError(dim=dim, size=data.sizes[dim], expected_size=ds_store.sizes[dim])

        # 4. Check if append dimension values are compatible:
        if (data[append_dim][0] < ds_store[append_dim][0]):
            raise AppendDimensionError(dim=append_dim)

        # 5. Check if specified chunks are compatible:
        if rechunk is not None:
            for dim in rechunk:
                # Dimensions absent from the store are not checked.
                if dim in ds_store.dims and rechunk[dim] != ds_store.chunks[dim][0]:
                    raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
    finally:
        # Release the store handle whether or not the checks passed.
        ds_store.close()
134
+
135
+
136
+ # ======== Define Zarr Writer Functions ======== #
137
def _write_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    version: int = 3,
) -> None:
    """
    Create a new Zarr store in cloud object storage from a DataArray
    or Dataset, leaving any store already present at the URL untouched.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or Dataset to write to the Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    version: int, default=3
        Zarr version to use.
    """
    # DataArrays are promoted to Datasets; their name is kept so it can be
    # passed to the timer.
    is_array = isinstance(data, xr.DataArray)
    var = data.name if is_array else None
    if is_array:
        data = data.to_dataset()

    # Guard clause: an existing store is never overwritten.
    if _check_zarr_store(obj_store=obj_store, url=url):
        logging.info(f"Skipping Variable: Store already exists at {url}")
        return

    # Time the transfer and silence UserWarnings (consolidated metadata)
    # for the duration of the write only.
    with timer(action='send', dest=url, var=var), warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=UserWarning)
        storage_options = obj_store.get_storage_options(set_async=True)
        data.to_zarr(
            store=url,
            storage_options=storage_options,
            mode="w",
            zarr_format=version,
        )
179
+
180
+
181
def _append_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    version: int = 3,
) -> None:
    """
    Extend an existing Zarr store in cloud object storage along
    one dimension with the contents of a DataArray or Dataset.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or Dataset whose values are appended to the store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension along which the data is appended.
    version: int, default=3
        Zarr version to use.
    """
    # Time the transfer and silence UserWarnings (consolidated metadata)
    # for the duration of the write only.
    with timer(action='append', dest=url), warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=UserWarning)
        storage_options = obj_store.get_storage_options(set_async=True)
        data.to_zarr(
            store=url,
            storage_options=storage_options,
            append_dim=append_dim,
            zarr_format=version,
        )
214
+
215
+
216
def _replace_in_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    region: dict,
    version: int = 3,
) -> None:
    """
    Overwrite a region of an existing Zarr store in cloud object
    storage with the contents of a DataArray or Dataset.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or Dataset holding the replacement values.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    region: dict
        Region of existing Zarr store to replace data. The first key is
        taken as the dimension along which the region is defined.
    version: int, default=3
        Zarr version to use.
    """
    # Only variables spanning the region's (first) dimension are written;
    # everything else is dropped before the region write.
    region_dim = list(region)[0]
    static_names = []
    for name in data.variables:
        if region_dim not in data[name].dims:
            static_names.append(name)
    data = data.drop_vars(static_names)

    # Time the transfer and silence UserWarnings (consolidated metadata)
    # for the duration of the write only.
    with timer(action='replace', dest=url), warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=UserWarning)
        storage_options = obj_store.get_storage_options(set_async=True)
        data.to_zarr(
            store=url,
            storage_options=storage_options,
            region=region,
            zarr_format=version,
        )
254
+
255
+
256
def _update_zarr_store(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    append_dim: str = "time_counter",
    rechunk: Optional[dict] = None,
    version: int = 3,
) -> None:
    """
    Update an existing Zarr store in object storage by replacing
    existing values and/or appending new values.

    Source values along ``append_dim`` that already exist in the store are
    written over the matching region of the store; source values beyond the
    last value of the store are appended. The overlapping source values must
    map onto one contiguous run of the store's append dimension.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    AppendDimensionSizeError
        If some source values not exceeding the last store value are
        missing from the store's append dimension.
    AppendDimensionError
        If the overlapping source values do not line up with a contiguous
        run of the store's append dimension.
    """
    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        ds_source = data.to_dataset()
    else:
        var = None
        ds_source = data

    # Check source Dataset compatibility with existing store:
    _check_zarr_compatibility(data=ds_source,
                              obj_store=obj_store,
                              url=url,
                              append_dim=append_dim,
                              rechunk=rechunk,
                              version=version
                              )
    logging.info(f"Passed Compatibility Checks for store {url}")

    # Read only what is needed from the target store, then release the
    # handle before any write is issued:
    with xr.open_zarr(store=url,
                      storage_options=obj_store.get_storage_options(set_async=True),
                      zarr_format=version
                      ) as ds_target:
        var_in_store = (var is None) or (var in ds_target.data_vars)
        target_append_dim = ds_target[append_dim].values if var_in_store else None

    if not var_in_store:
        # == Add new variable to Zarr Store == #
        # NOTE(review): _write_to_zarr skips the write when a store already
        # exists at `url`, and the compatibility check above has just
        # confirmed that one does, so this branch appears to log without
        # writing anything. Behaviour is left unchanged here -- confirm the
        # intended way of adding a variable to an existing store.
        logging.info(f"Sending Variable {var}")
        _write_to_zarr(data=ds_source,
                       obj_store=obj_store,
                       url=url,
                       version=version,
                       )
        return

    # === Updating existing Zarr store === #
    source_append_dim = ds_source[append_dim].values
    source_ind_size = source_append_dim.size

    # Determine intersection between source & target append dimensions:
    intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)

    if intersect_append_dim.size == 0:
        # == No intersection -> append all source values to target store == #
        _append_to_zarr(data=ds_source,
                        obj_store=obj_store,
                        url=url,
                        append_dim=append_dim,
                        version=version,
                        )
        return

    # == Intersection exists -> replace overlapping values in target store == #
    # Ensure all overlapping values exist along target append dimension:
    overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
    if intersect_append_dim.size != overlap_append_dim:
        raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)

    # Locate the first source value along the target append dimension:
    first_matches = np.flatnonzero(target_append_dim == source_append_dim[0])
    if first_matches.size == 0:
        logging.error(f"First value of {append_dim} in source data not found in {url}.")
        raise AppendDimensionError(dim=append_dim)

    # The replaced region spans exactly the overlapping source values. It
    # is not assumed to reach the end of the target, so a source lying
    # wholly inside the target no longer yields a region/data length mismatch.
    source_ind_min = 0
    source_ind_max = int(overlap_append_dim)
    target_ind_min = int(first_matches[0])
    target_ind_max = target_ind_min + source_ind_max

    # The region write is positional: refuse overlaps that do not line up
    # value-for-value with a contiguous run of the target.
    if not np.array_equal(source_append_dim[source_ind_min:source_ind_max],
                          target_append_dim[target_ind_min:target_ind_max]):
        logging.error(f"Overlapping values of {append_dim} are not contiguous in {url}.")
        raise AppendDimensionError(dim=append_dim)

    # 1. Replace overlapping values in target store:
    logging.info(f"Updating {url} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
    _replace_in_zarr(data=ds_source.isel({append_dim : slice(source_ind_min, source_ind_max)}),
                     obj_store=obj_store,
                     url=url,
                     region={append_dim : slice(target_ind_min, target_ind_max)},
                     version=version,
                     )

    # 2. Append new values to target store:
    if source_ind_size > source_ind_max:
        logging.info(f"Appending to {url} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
        _append_to_zarr(data=ds_source.isel({append_dim : slice(source_ind_max, source_ind_size)}),
                        obj_store=obj_store,
                        url=url,
                        append_dim=append_dim,
                        version=version,
                        )
368
+
369
+
370
def _send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    """
    # === Verify Inputs === #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("file must be a list of strings, a string, or an xarray.Dataset.")
    if not isinstance(bucket, str):
        raise TypeError("bucket must be a string.")
    if not isinstance(object_prefix, str):
        raise TypeError("object_prefix must be a string.")
    if not isinstance(store_credentials_json, str):
        raise TypeError("store_credentials_json must be a string.")
    if variables is not None:
        if not isinstance(variables, list) or not all(isinstance(var, str) for var in variables):
            raise TypeError("variables must be a list of strings.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if grid_filepath is not None and not isinstance(grid_filepath, str):
        raise TypeError("grid_filepath must be a string.")
    if update_coords is not None and not isinstance(update_coords, dict):
        raise TypeError("update_coords must be a dictionary.")
    if rechunk is not None and not isinstance(rechunk, dict):
        raise TypeError("rechunk must be a dictionary.")
    if attrs is not None and not isinstance(attrs, dict):
        raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    if not isinstance(zarr_version, int):
        raise TypeError("zarr_version must be an integer.")

    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)

        # === Send Dataset to Zarr store === #
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Sending Dataset to {url}")
        _write_to_zarr(data=ds_filepath[variables],
                       obj_store=obj_store,
                       url=url,
                       version=zarr_version
                       )
    finally:
        # Release resources even when the write fails, to avoid leaking
        # open file handles.
        ds_filepath.close()
482
+
483
+
484
def send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client : Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage with
    option of using dask.

    One of three execution paths is taken: an existing ``client`` is used
    if provided; otherwise a LocalCluster is created when
    ``dask_cluster_kwargs`` is given; otherwise the data is sent without
    a dask cluster.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send. If None, all variables will be sent.
    append_dim: str, default="time_counter"
        Name of the append dimension, by default "time_counter".
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary, by default None.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Note that the client is shut down once
        the data has been sent.
    dask_config_kwargs: dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Ignored if dask client is provided.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
        Ignored if dask client is provided.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # Arguments shared by every execution path; only `parallel` differs.
    send_kwargs = dict(file=file,
                       bucket=bucket,
                       object_prefix=object_prefix,
                       store_credentials_json=store_credentials_json,
                       variables=variables,
                       append_dim=append_dim,
                       grid_filepath=grid_filepath,
                       update_coords=update_coords,
                       rechunk=rechunk,
                       attrs=attrs,
                       zarr_version=zarr_version,
                       )

    if client is not None:
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")

        # Register plugins: capture UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        # Register plugins: close aiohttp.ClientSessions:
        client.register_worker_plugin(CloseClientSessionPlugin())

        _send_to_zarr(parallel=True, **send_kwargs)

        # --- Shutdown Dask Client --- #
        client.shutdown()
        logging.info("Existing Dask Client has been shutdown.")

    elif dask_cluster_kwargs is not None:
        # === Send to Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client (named `local_client` so the
        # `client` parameter is not shadowed):
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as local_client:
            # `n_workers` is optional for LocalCluster; fall back to the
            # number of workers actually started instead of raising KeyError.
            n_workers = dask_cluster_kwargs.get("n_workers", len(cluster.workers))
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {local_client.dashboard_link}")

            # Register plugins: capture UserWarnings when rechunking data:
            local_client.register_worker_plugin(CaptureWarningsPlugin())

            # Register plugins: close aiohttp.ClientSessions:
            local_client.register_worker_plugin(CloseClientSessionPlugin())

            _send_to_zarr(parallel=True, **send_kwargs)

            # --- Shutdown Store & Dask Cluster --- #
            cluster.close()
            local_client.shutdown()
            logging.info("Dask Cluster has been shutdown.")

    else:
        # === Send to Zarr store without Dask === #
        _send_to_zarr(parallel=False, **send_kwargs)
616
+
617
+
618
def _update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Update existing Zarr store in cloud object storage
    by replacing and/or appending data.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent. In either case only variables
        that have the append dimension are used for the update.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    """
    # === Verify Inputs === #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("file must be a list of strings, a string, or an xarray.Dataset.")
    if not isinstance(bucket, str):
        raise TypeError("bucket must be a string.")
    if not isinstance(object_prefix, str):
        raise TypeError("object_prefix must be a string.")
    if not isinstance(store_credentials_json, str):
        raise TypeError("store_credentials_json must be a string.")
    if variables is not None:
        if not isinstance(variables, list) or not all(isinstance(var, str) for var in variables):
            raise TypeError("variables must be a list of strings.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if grid_filepath is not None and not isinstance(grid_filepath, str):
        raise TypeError("grid_filepath must be a string.")
    if update_coords is not None and not isinstance(update_coords, dict):
        raise TypeError("update_coords must be a dictionary.")
    if rechunk is not None and not isinstance(rechunk, dict):
        raise TypeError("rechunk must be a dictionary.")
    if attrs is not None and not isinstance(attrs, dict):
        raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    if not isinstance(zarr_version, int):
        raise TypeError("zarr_version must be an integer.")

    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)
        # Consider variables with append dimension only:
        variables = [var for var in variables if append_dim in ds_filepath[var].dims]

        # === Update Existing Zarr store === #
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Updating Dataset at {url}")
        _update_zarr_store(data=ds_filepath[variables],
                           obj_store=obj_store,
                           url=url,
                           append_dim=append_dim,
                           rechunk=rechunk,
                           version=zarr_version
                           )
    finally:
        # Release resources even when the update fails, to avoid leaking
        # open file handles.
        ds_filepath.close()
736
+
737
+
738
def update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client: Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
) -> None:
    """
    Update data in existing Zarr store in cloud object
    storage with option of using dask.

    Exactly one of three execution paths is taken:

    1. ``client`` is given: the update runs on that client, which is
       shut down once the update has completed successfully.
    2. ``dask_cluster_kwargs`` is given (and ``client`` is not): a
       temporary ``LocalCluster`` and ``Client`` are created for the
       update and closed afterwards.
    3. Neither is given: the update runs without a Dask cluster.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Note that this client is shut down
        after the update has completed.
    dask_config_kwargs: Dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Ignored if dask client is provided.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
        ``n_workers`` is optional; when omitted the number of workers
        actually started by the cluster is reported in the log.
        Ignored if dask client is provided.
    zarr_version: int, default=3
        zarr version to use.
    """
    # Arguments shared by every execution path; only `parallel` differs.
    update_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        zarr_version=zarr_version,
    )

    def _register_plugins(dask_client) -> None:
        # Register plugins: capture UserWarnings when rechunking data:
        dask_client.register_worker_plugin(CaptureWarningsPlugin())
        # Register plugins: close aiohttp.ClientSessions:
        dask_client.register_worker_plugin(CloseClientSessionPlugin())

    if client is not None:
        # === Update Zarr store with existing Dask Client === #
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")
        _register_plugins(client)

        _update_zarr(parallel=True, **update_kwargs)

        # --- Shutdown Dask Client --- #
        client.shutdown()
        logging.info("Existing Dask Client has been shutdown.")

    elif dask_cluster_kwargs is not None:
        # === Update Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client. The context managers close the
        # client and then the cluster on exit, including when the update
        # raises, so no explicit close()/shutdown() calls are needed.
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as local_client:
            # `n_workers` is an optional LocalCluster argument: fall back to
            # the number of workers the cluster actually started rather than
            # raising KeyError when it was not specified.
            n_workers = dask_cluster_kwargs.get("n_workers", len(cluster.workers))
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {local_client.dashboard_link}")
            _register_plugins(local_client)

            _update_zarr(parallel=True, **update_kwargs)

        # Logged only once the cluster and client have really been closed.
        logging.info("Dask Cluster has been shutdown.")

    else:
        # === Update Zarr store without Dask === #
        _update_zarr(parallel=False, **update_kwargs)