OceanDataStore 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. OceanDataStore/__init__.py +21 -0
  2. OceanDataStore/catalog/__init__.py +12 -0
  3. OceanDataStore/catalog/oceandatacatalog.py +1242 -0
  4. OceanDataStore/catalog/stac/README.md +34 -0
  5. OceanDataStore/catalog/stac/__init__.py +30 -0
  6. OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
  7. OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
  8. OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
  9. OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
  10. OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
  11. OceanDataStore/catalog/stac/template_collection.py +85 -0
  12. OceanDataStore/catalog/stac/utils.py +476 -0
  13. OceanDataStore/cli/__init__.py +34 -0
  14. OceanDataStore/cli/arg_parser.py +182 -0
  15. OceanDataStore/cli/cli.py +203 -0
  16. OceanDataStore/cli/exceptions.py +83 -0
  17. OceanDataStore/cli/icechunk.py +888 -0
  18. OceanDataStore/cli/logging.py +52 -0
  19. OceanDataStore/cli/object_store.py +293 -0
  20. OceanDataStore/cli/utils.py +275 -0
  21. OceanDataStore/cli/zarr.py +870 -0
  22. OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
  23. OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
  24. OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
  25. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
  26. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
  27. OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
  28. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
  29. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
  30. OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
  31. OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
  32. OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
  33. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
  34. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  35. OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  36. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
  37. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
  38. OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
  39. OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
  40. OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
  41. OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
  42. OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
  43. OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
  44. OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
  45. OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
  46. OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
  47. OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
  48. OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
  49. OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
  50. OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
  51. OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
  52. OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
  53. OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
  54. OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
  55. OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
  56. OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
  57. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
  58. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
  59. OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
  60. OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
  61. OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
  62. OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
  63. OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
  64. OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
  65. OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
  66. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
  67. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
  68. OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
  69. OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
  70. OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
  71. OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
  72. OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
  73. OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
  74. OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
  75. OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
  76. OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
  77. OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
  78. OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
  79. OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
  80. OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
  81. OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
  82. OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
  83. OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
  84. OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
  85. OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
  86. OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
  87. OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
  88. OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
  89. OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
  90. OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
  91. OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
  92. OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
  93. OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
  94. OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
  95. OceanDataStore/data/utils.py +506 -0
  96. OceanDataStore/zarr.py +993 -0
  97. oceandatastore-0.3.0.dist-info/METADATA +184 -0
  98. oceandatastore-0.3.0.dist-info/RECORD +104 -0
  99. oceandatastore-0.3.0.dist-info/WHEEL +5 -0
  100. oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
  101. oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
  102. oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
  103. oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
  104. oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
OceanDataStore/zarr.py ADDED
@@ -0,0 +1,993 @@
1
+ """
2
+ zarr.py
3
+
4
+ Description:
5
+ This module defines the functions to send and update data
6
+ to an object store.
7
+
8
+ Authors:
9
+ - Ollie Tooth
10
+ - Tobias Ferreira
11
+ - Joao Morado
12
+ """
13
+ # -- Import Python Modules -- #
14
+ import glob
15
+ import time
16
+ import logging
17
+ import warnings
18
+ from typing import Optional
19
+
20
+ import numpy as np
21
+ import xarray as xr
22
+
23
+ import dask
24
+ from dask.distributed import Client, LocalCluster
25
+ from dask.distributed.diagnostics.plugin import WorkerPlugin
26
+
27
+ # -- Import OceanDataStore Modules -- #
28
+ from .object_store import ObjectStoreS3
29
+
30
+ from .exceptions import (
31
+ ObjectNotFound,
32
+ DimensionNotFound,
33
+ DimensionSizeError,
34
+ AppendDimensionError,
35
+ AppendDimensionSizeError,
36
+ ChunkSizeError,
37
+ )
38
+
39
+ # -- Define WorkerPlugin -- #
40
class CaptureWarningsPlugin(WorkerPlugin):
    """
    Dask worker plugin which redirects Python warnings to the
    logging system while the plugin is active on a worker.
    """

    def setup(self, worker):
        # Route warnings (e.g. UserWarnings raised when rechunking)
        # through the logging system:
        logging.captureWarnings(capture=True)

    def teardown(self, worker):
        # Stop redirecting warnings to the logging system:
        logging.captureWarnings(capture=False)
46
+
47
+
48
+ # -- Define timing context manager -- #
49
class timer():
    """
    Timer context manager class to log the time taken
    to write variables & datasets to an object store.

    The completion message is only logged when the timed
    block exits without raising an exception.

    Parameters
    ----------
    action : str
        Action to be performed. Options are 'send', 'replace'
        or 'append'.
    url : str
        URL path to Zarr store or Icechunk repository.
    var : Optional[str], default=None
        Name of variable to be sent or updated to store.

    Raises
    ------
    ValueError
        If action is not one of 'send', 'replace' or 'append'.
    """
    def __init__(self, action: str, url: str, var: Optional[str] = None) -> None:
        # Define class attributes:
        if action == 'send':
            if var is not None:
                self.action = f'Sent {var} to'
            else:
                self.action = 'Sent dataset to'
        elif action == 'replace':
            if var is not None:
                self.action = f'Updated {var} in'
            else:
                self.action = 'Updated'
        elif action == 'append':
            if var is not None:
                self.action = f'Appended {var} to'
            else:
                self.action = 'Appended to'
        else:
            raise ValueError("Invalid action: must be 'send', 'replace' or 'append'.")
        self.url = url

    def __enter__(self):
        self.t_start = time.time()
        # Return the timer so `with timer(...) as t:` binds a usable object:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.t_end = time.time()
        # Do not report completion if the timed block raised an exception;
        # the exception is propagated to the caller unchanged.
        if exc_type is None:
            logging.info(
                f"Completed: {self.action} store {self.url} in {(self.t_end - self.t_start):.2f} seconds"
            )
93
+
94
+
95
+ # -- Define OceanDataStore Core Functions -- #
96
def _check_zarr_store(obj_store: ObjectStoreS3,
                      url: str
                      ) -> bool:
    """
    Determine whether a Zarr store is present at a given URL path.

    Parameters
    ----------
    obj_store
        ObjectStoreS3 remote filesystem.
    url
        URL path to Zarr store.

    Returns
    -------
    bool
        Result of querying the object store for the URL path
        with the "s3://" scheme removed.
    """
    # The object store is queried without the S3 URL scheme:
    store_path = url.replace("s3://", "")
    store_exists = obj_store.exists(store_path)

    return store_exists
116
+
117
+
118
def _check_zarr_compatibility(data: xr.DataArray | xr.Dataset,
                              obj_store: ObjectStoreS3,
                              url: str,
                              append_dim: str = "time_counter",
                              rechunk: Optional[dict] = None,
                              version: int = 3,
                              ) -> None:
    """
    Check compatibility of DataArray or Dataset to update existing
    Zarr store in cloud object storage.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    ObjectNotFound
        If no store exists at the URL path.
    FileNotFoundError
        If the store cannot be opened with the specified Zarr version.
    DimensionNotFound
        If a dimension of the data (other than the append dimension)
        is not found in the store.
    DimensionSizeError
        If a dimension of the data (other than the append dimension)
        differs in size from the store.
    AppendDimensionError
        If the first append dimension value of the data precedes
        the first append dimension value of the store.
    ChunkSizeError
        If a specified chunk size differs from the first chunk size
        of the store along the same dimension.
    """
    # 1. Check if the store exists:
    # (`_check_zarr_store` takes the store location via its `url` parameter)
    if not _check_zarr_store(obj_store=obj_store, url=url):
        raise ObjectNotFound(object_name=url)

    # 2. Check Zarr store compatibility:
    try:
        ds_store = xr.open_zarr(store=url,
                                storage_options=obj_store.get_remote_options(),
                                zarr_format=version
                                )
    except Exception as e:
        # Preserve the original exception as the cause for debugging:
        raise FileNotFoundError(f"zarr version {version} is not compatible with the store: {e}") from e

    # 3. Check if core dimensions exist & size are compatible:
    dims_data = {dim : data.sizes[dim] for dim in data.dims if dim != append_dim}
    for dim in dims_data:
        if dim in ds_store.dims:
            if dims_data[dim] != ds_store.sizes[dim]:
                raise DimensionSizeError(dim=dim, size=dims_data[dim], expected_size=ds_store.sizes[dim])
        else:
            raise DimensionNotFound(dim=dim, object_name=url)

    # 4. Check if append dimension values are compatible:
    if (data[append_dim][0] < ds_store[append_dim][0]):
        raise AppendDimensionError(dim=append_dim)

    # 5. Check if specified chunks are compatible:
    if rechunk is not None:
        for dim in rechunk:
            if dim in ds_store.dims:
                # Only the first chunk along each dimension is compared:
                if rechunk[dim] != ds_store.chunks[dim][0]:
                    raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
176
+
177
+
178
def _write_to_zarr(
    data: xr.DataArray | xr.Dataset,
    obj_store: ObjectStoreS3,
    url: str,
    version: int = 3,
) -> None:
    """
    Write DataArray or Dataset to Zarr store in cloud
    object storage.

    If a store already exists at the URL path, nothing is
    written and a message is logged instead.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to write to Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any input is not of the expected type.
    """
    # === Verify Inputs === #
    if not isinstance(data, (xr.DataArray, xr.Dataset)):
        raise TypeError("data must be a DataArray or Dataset.")
    if not isinstance(obj_store, ObjectStoreS3):
        raise TypeError("obj_store must be an ObjectStoreS3 instance.")
    if not isinstance(url, str):
        raise TypeError("url must be a string.")
    if not isinstance(version, int):
        raise TypeError("version must be an integer.")

    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        data = data.to_dataset()
    else:
        var = None

    # Write Dataset to Zarr store in Object Store:
    # (`_check_zarr_store` takes the store location via its `url` parameter)
    if _check_zarr_store(obj_store=obj_store, url=url):
        logging.info(f"Skipping Variable: Store already exists at {url}")

    else:
        with timer(action='send', url=url, var=var):
            # Catch consolidated metadata warnings:
            with warnings.catch_warnings():
                warnings.simplefilter(action="ignore", category=UserWarning)
                data.to_zarr(store=url,
                             storage_options=obj_store.get_remote_options(),
                             mode="w",
                             zarr_format=version
                             )
230
+
231
+
232
def _append_to_zarr(data: xr.DataArray | xr.Dataset,
                    obj_store: ObjectStoreS3,
                    url: str,
                    append_dim: str = "time_counter",
                    version: int = 3,
                    ) -> None:
    """
    Extend an existing Zarr store in cloud object storage with
    a DataArray or Dataset along the append dimension.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to append to existing Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    version: int, default=3
        Zarr version to use.
    """
    # Time the append & silence consolidated metadata warnings:
    with timer(action='append', url=url), warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=UserWarning)
        remote_options = obj_store.get_remote_options()
        data.to_zarr(
            store=url,
            storage_options=remote_options,
            append_dim=append_dim,
            zarr_format=version,
        )
264
+
265
+
266
def _replace_in_zarr(data: xr.DataArray | xr.Dataset,
                     obj_store: ObjectStoreS3,
                     url: str,
                     region: dict,
                     version: int = 3,
                     ) -> None:
    """
    Overwrite a region of an existing Zarr store in cloud
    object storage with a DataArray or Dataset.

    Variables which do not include the region dimension are
    dropped before writing.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to write to existing Zarr store.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    region: dict
        Region of existing Zarr store to replace data. The first
        key is used as the append dimension.
    version: int, default=3
        Zarr version to use.
    """
    # First key of the region mapping is the append dimension:
    append_dim = list(region)[0]

    # Exclude variables w/o append dimension:
    vars_without_dim = []
    for name in data.variables:
        if append_dim not in data[name].dims:
            vars_without_dim.append(name)
    data = data.drop_vars(vars_without_dim)

    # Time the replacement & silence consolidated metadata warnings:
    with timer(action='replace', url=url), warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=UserWarning)
        remote_options = obj_store.get_remote_options()
        data.to_zarr(
            store=url,
            storage_options=remote_options,
            region=region,
            zarr_format=version,
        )
303
+
304
+
305
def _update_zarr_store(data: xr.DataArray | xr.Dataset,
                       obj_store: ObjectStoreS3,
                       url: str,
                       append_dim: str = "time_counter",
                       rechunk: Optional[dict] = None,
                       version: int = 3,
                       ) -> None:
    """
    Update an existing Zarr store in object storage by replacing
    existing values and/or appending new values.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing Zarr store with.
    obj_store: ObjectStoreS3
        ObjectStoreS3 remote filesystem.
    url: str
        URL path to Zarr store.
    append_dim: str, default="time_counter"
        Dimension to append data to existing Zarr store.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.
    version: int, default=3
        Zarr version to use.

    Raises
    ------
    TypeError
        If any input is not of the expected type.
    AppendDimensionSizeError
        If the source values which do not exceed the final target
        append dimension value are not all present in the target
        append dimension.
    """
    # === Verify Inputs === #
    if not isinstance(data, (xr.DataArray, xr.Dataset)):
        raise TypeError("data must be a DataArray or Dataset.")
    if not isinstance(obj_store, ObjectStoreS3):
        raise TypeError("obj_store must be an ObjectStoreS3 instance.")
    if not isinstance(url, str):
        raise TypeError("url must be a string.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if rechunk is not None:
        if not isinstance(rechunk, dict):
            raise TypeError("rechunk must be a dictionary.")
    if not isinstance(version, int):
        raise TypeError("version must be an integer.")

    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        ds_source = data.to_dataset()
    else:
        var = None
        ds_source = data

    # Check source Dataset compatibility with existing store:
    _check_zarr_compatibility(data=ds_source,
                              obj_store=obj_store,
                              url=url,
                              append_dim=append_dim,
                              rechunk=rechunk,
                              version=version
                              )
    logging.info(f"Passed Compatibility Checks for store {url}")

    # === Update existing variable in Zarr Store === #
    # Open the target store to compare append dimension values:
    ds_target = xr.open_zarr(store=url,
                             storage_options=obj_store.get_remote_options(),
                             zarr_format=version
                             )

    if (var in ds_target.data_vars) or (var is None):

        # === Updating existing Zarr store === #
        # Extract source & target append dimension values:
        target_append_dim = ds_target[append_dim].values
        source_append_dim = ds_source[append_dim].values

        # Determine intersection between source & target append dimensions:
        intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)

        if intersect_append_dim.size != 0:
            # == Intersection exists -> replace overlapping values in target store == #

            # Ensure all overlapping values exist along target append dimension:
            overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
            if intersect_append_dim.size != overlap_append_dim:
                raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)

            # Determine source and target append dimension indices of overlap.
            # The region is sized by the number of overlapping values so that
            # a source which ends before the end of the target append dimension
            # is written to a region of matching size.
            n_overlap = intersect_append_dim.size
            target_ind_min = np.flatnonzero(target_append_dim == source_append_dim[0])[0]
            target_ind_max = target_ind_min + n_overlap
            source_ind_min = 0
            source_ind_max = n_overlap
            source_ind_size = source_append_dim.size

            # 1. Replace overlapping values in target store:
            logging.info(f"Updating {url} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
            _replace_in_zarr(data=ds_source.isel({append_dim : slice(source_ind_min, source_ind_max)}),
                             obj_store=obj_store,
                             url=url,
                             region={append_dim : slice(target_ind_min, target_ind_max)},
                             version=version,
                             )

            # 2. Append new values to target store:
            if source_ind_size > source_ind_max:
                logging.info(f"Appending to {url} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
                _append_to_zarr(data=ds_source.isel({append_dim : slice(source_ind_max, source_ind_size)}),
                                obj_store=obj_store,
                                url=url,
                                append_dim=append_dim,
                                version=version,
                                )

        else:
            # == No intersection -> append all source values to target store == #
            _append_to_zarr(data=ds_source,
                            obj_store=obj_store,
                            url=url,
                            append_dim=append_dim,
                            version=version,
                            )
    else:
        # == Add new variable to Zarr Store == #
        # NOTE(review): _write_to_zarr skips writing when a store already
        # exists at `url`, and the compatibility check above requires the
        # store to exist -- confirm this branch can add a new variable.
        logging.info(f"Sending Variable {var}")
        _write_to_zarr(data=ds_source,
                       obj_store=obj_store,
                       url=url,
                       version=version,
                       )
431
+
432
+
433
def _preprocess_dataset(file: list[str] | str | xr.Dataset,
                        rechunk: Optional[dict] = None,
                        append_dim: str = "time_counter",
                        update_coords: Optional[dict] = None,
                        grid_filepath: Optional[str] = None,
                        attrs: Optional[dict] = None,
                        parallel: bool = False,
                        ) -> xr.Dataset:
    """
    Preprocess the dataset to be sent to the object store.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions. If None, dask chunks
        will be set to on-disk chunks.
    append_dim: str, default='time_counter'
        Name of the dimension to append multi-file datasets.
    update_coords: Optional[dict], default=None
        Mapping of coordinate variables to update using model
        grid file. Keys are coordinate variable names in the
        dataset to be sent, and values are the corresponding
        variable names in the model grid file. If None, no
        coordinates will be updated.
    grid_filepath: Optional[str], default=None
        Filepath to the model grid file to update coordinate
        variables. Required if update_coords is not None.
    attrs: Optional[dict], default=None
        Dictionary of attributes to add to the dataset.
        If None, no attributes will be added.
    parallel: bool, default=False
        Whether to open and preprocess the dataset in parallel
        using `dask.delayed`.

    Returns
    -------
    xr.Dataset
        Preprocessed (multifile) dataset with optionally
        updated coordinates, chunksizes and attributes.

    Raises
    ------
    TypeError
        If any input is not of the expected type.
    ValueError
        If a filepath does not end with '.nc', if an empty list of
        filepaths is given, or if update_coords is specified without
        grid_filepath.
    FileNotFoundError
        If no files match the specified pattern.
    """
    # == Verify Inputs == #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("filepaths must be a list, a string or an xarray Dataset.")
    if isinstance(file, list):
        for fpath in file:
            if not isinstance(fpath, str):
                raise TypeError("filepaths must be a list of strings.")
            if not fpath.endswith('.nc'):
                raise ValueError("Invalid file extension: only .nc files are supported.")
    elif isinstance(file, str):
        if not file.endswith('.nc'):
            raise ValueError("Invalid file extension: only .nc files are supported.")
    if rechunk is not None:
        if not isinstance(rechunk, dict):
            raise TypeError("rechunk must be a dictionary.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if update_coords is not None:
        if not isinstance(update_coords, dict):
            raise TypeError("update_coords must be a dictionary.")
    if grid_filepath is not None:
        if not isinstance(grid_filepath, str):
            raise TypeError("grid_filepath must be a string.")
    if attrs is not None:
        if not isinstance(attrs, dict):
            raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")

    # === Load netCDF dataset === #
    if rechunk is None:
        # Default to dask chunks equal to on-disk chunks:
        rechunk = {}

    if isinstance(file, xr.Dataset):
        # Use input dataset:
        ds_filepath = file.chunk(rechunk)
    else:
        if isinstance(file, str):
            # File names from str / regular expression:
            if '*' in file:
                filepaths = sorted(glob.glob(file))
                if len(filepaths) == 0:
                    # Report the pattern which was searched, not the empty result:
                    raise FileNotFoundError(f"No files found at {file}")
            else:
                filepaths = [file]
        else:
            # File names from list:
            filepaths = file
            if len(filepaths) == 0:
                raise ValueError("filepaths must contain at least one file.")

        if len(filepaths) > 1:
            # Open multi-file dataset:
            ds_filepath = xr.open_mfdataset(filepaths,
                                            engine='h5netcdf',
                                            chunks=rechunk,
                                            parallel=parallel,
                                            concat_dim=append_dim,
                                            combine='nested',
                                            data_vars='minimal',
                                            coords='minimal',
                                            compat='override'
                                            )
        else:
            # Open single file dataset:
            ds_filepath = xr.open_dataset(filepaths[0], chunks=rechunk)

    # === Update coordinates using model grid file === #
    if update_coords is not None:
        if grid_filepath is None:
            raise ValueError(
                "grid_filepath must be specified to update coordinate variables."
            )
        ds_grid = xr.open_dataset(grid_filepath)
        # Update coordinate vars using model grid file:
        for key, grid_var in update_coords.items():
            coord_data = ds_grid[grid_var].squeeze(drop=True)
            # Rechunk dimensions to user specified chunks. Only dimensions
            # present in the rechunk mapping are rechunked, so an empty or
            # partial mapping does not raise a KeyError.
            if rechunk:
                coord_chunks = {dim: rechunk[dim] for dim in coord_data.dims if dim in rechunk}
                coord_data = coord_data.chunk(coord_chunks)
            ds_filepath = ds_filepath.assign_coords({key: coord_data})
        logging.info('Completed: Updated coordinate variables.')

    # === Update Attributes === #
    if attrs is not None:
        ds_filepath = ds_filepath.assign_attrs(attrs)

    return ds_filepath
573
+
574
+
575
def _send_to_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
) -> None:
    """
    Write data to new Zarr store in cloud object storage.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list[str], optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False,
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # === Initialise Asynchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=True,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )
    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)

        # === Send Dataset to Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Sending Dataset to {url}")
        _write_to_zarr(data=ds_filepath[variables],
                       obj_store=obj_store,
                       url=url,
                       version=zarr_version
                       )
    finally:
        # Release resources to avoid memory leaks, even if the
        # variable selection or write raises an exception:
        ds_filepath.close()
654
+
655
+ def send_to_zarr(
656
+ file: list[str] | str | xr.Dataset,
657
+ bucket: str,
658
+ object_prefix: str,
659
+ store_credentials_json: str,
660
+ variables: Optional[list[str]] = None,
661
+ append_dim: str = "time_counter",
662
+ grid_filepath: Optional[str] = None,
663
+ update_coords: Optional[dict] = None,
664
+ rechunk: Optional[dict] = None,
665
+ attrs: Optional[dict] = None,
666
+ client : Optional[Client] = None,
667
+ dask_config_kwargs: Optional[dict] = None,
668
+ dask_cluster_kwargs: Optional[dict] = None,
669
+ zarr_version: int = 3
670
+ ) -> None:
671
+ """
672
+ Write data to new Zarr store in cloud object storage with
673
+ option of using dask.
674
+
675
+ Parameters
676
+ ----------
677
+ file: list | str | xarray.Dataset
678
+ Regular expression or list of filepaths to netCDF file(s).
679
+ Users can also pass a single xarray.Dataset directly.
680
+ bucket: str
681
+ Name of the bucket in the object store. Bucket names can contain only
682
+ lowercase letters, numbers, dots (.), and hyphens (-).
683
+ object_prefix: str
684
+ Prefix to be added to the object names in the object store.
685
+ store_credentials_json: str
686
+ Path to the JSON file containing the object store credentials.
687
+ variables: list[str], optional
688
+ List of variables to send. If None, all variables will be sent.
689
+ append_dim: str, default="time_counter"
690
+ Name of the append dimension, by default "time_counter".
691
+ grid_filepath: str, optional
692
+ Path to file containing model grid parameter.
693
+ update_coords: dict, optional
694
+ Dictionary of coordinate variables to update.
695
+ rechunk: dict, optional
696
+ Rechunk strategy dictionary, by default None.
697
+ attrs: dict, optional
698
+ Attributes to add to the dataset.
699
+ client: dask.distributed.Client, optional
700
+ Dask Distributed Client.
701
+ dask_config_kwargs: dict[str,str], optional
702
+ Dask configuration settings passed to dask.config.set().
703
+ dask_cluster_kwargs: dict, optional
704
+ Dask cluster configuration settings passed to LocalCluster().
705
+ zarr_version: int, default=3
706
+ Zarr version to use.
707
+ """
708
+ if dask_cluster_kwargs is not None:
709
+ # === Send to Zarr store with Dask === #
710
+ if dask_config_kwargs is not None:
711
+ dask.config.set(dask_config_kwargs)
712
+ logging.info("Updated dask configuration settings.")
713
+
714
+ # Create local dask cluster & client:
715
+ with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
716
+ logging.info(f"Created LocalCluster with {dask_cluster_kwargs['n_workers']} workers @ Client: {client.dashboard_link}")
717
+
718
+ # Catch UserWarnings when rechunking data:
719
+ client.register_worker_plugin(CaptureWarningsPlugin())
720
+
721
+ _send_to_zarr(file=file,
722
+ bucket=bucket,
723
+ object_prefix=object_prefix,
724
+ store_credentials_json=store_credentials_json,
725
+ variables=variables,
726
+ append_dim=append_dim,
727
+ grid_filepath=grid_filepath,
728
+ update_coords=update_coords,
729
+ rechunk=rechunk,
730
+ attrs=attrs,
731
+ parallel=True,
732
+ zarr_version=zarr_version
733
+ )
734
+
735
+ # --- Shutdown Store & Dask Cluster --- #
736
+ cluster.close()
737
+ client.shutdown()
738
+ logging.info("Dask Cluster has been shutdown.")
739
+
740
+ elif client is not None:
741
+ logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")
742
+
743
+ # Catch UserWarnings when rechunking data:
744
+ client.register_worker_plugin(CaptureWarningsPlugin())
745
+
746
+ _send_to_zarr(file=file,
747
+ bucket=bucket,
748
+ object_prefix=object_prefix,
749
+ store_credentials_json=store_credentials_json,
750
+ variables=variables,
751
+ append_dim=append_dim,
752
+ grid_filepath=grid_filepath,
753
+ update_coords=update_coords,
754
+ rechunk=rechunk,
755
+ attrs=attrs,
756
+ parallel=True,
757
+ zarr_version=zarr_version
758
+ )
759
+
760
+ # --- Shutdown Store & Dask Cluster --- #
761
+ cluster.close()
762
+ client.shutdown()
763
+ logging.info("Existing Dask Cluster has been shutdown.")
764
+
765
+ else:
766
+ # === Send to Zarr store without Dask === #
767
+ _send_to_zarr(file=file,
768
+ bucket=bucket,
769
+ object_prefix=object_prefix,
770
+ store_credentials_json=store_credentials_json,
771
+ variables=variables,
772
+ append_dim=append_dim,
773
+ grid_filepath=grid_filepath,
774
+ update_coords=update_coords,
775
+ rechunk=rechunk,
776
+ attrs=attrs,
777
+ parallel=False,
778
+ zarr_version=zarr_version
779
+ )
780
+
781
def _update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    zarr_version: int = 3
    ) -> None:
    """
    Update existing Zarr store in cloud object storage
    by replacing and/or appending data.

    The input is preprocessed into a single dataset, restricted to the
    variables that carry the append dimension, and handed to
    ``_update_zarr_store`` for writing to ``s3://{bucket}/{object_prefix}``.
    The preprocessed dataset is always closed before returning, including
    when the update fails.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all data variables will be considered. Variables without
        ``append_dim`` among their dimensions are not written.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    zarr_version: int, default=3
        Zarr version to use.
    """
    # === Initialise Asynchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=True,
                              store_credentials_json=store_credentials_json
                              )

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)
        # Consider variables with append dimension only:
        # NOTE(review): applied to explicitly requested variables as well as
        # the default selection -- confirm this matches the intended nesting.
        variables = [var for var in variables if append_dim in ds_filepath[var].dims]

        # === Update Existing Zarr store === #
        # Write to Zarr store:
        url = f"s3://{bucket}/{object_prefix}"
        logging.info(f"Updating Dataset at {url}")
        _update_zarr_store(data=ds_filepath[variables],
                           obj_store=obj_store,
                           url=url,
                           append_dim=append_dim,
                           rechunk=rechunk,
                           version=zarr_version
                           )
    finally:
        # Release resources to avoid memory leaks, even if the update raised:
        ds_filepath.close()
866
+
867
+
868
def update_zarr(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    variables: Optional[list[str]] = None,
    append_dim: str = "time_counter",
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    client : Optional[Client] = None,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    zarr_version: int = 3
    ) -> None:
    """
    Update data in existing Zarr store in cloud object
    storage with option of using dask.

    Three execution modes are supported, checked in this order:

    1. ``dask_cluster_kwargs`` is given: a temporary ``LocalCluster`` and
       ``Client`` are created, used for the update and closed afterwards.
    2. ``client`` is given: the existing client is used for the update and
       is shut down afterwards.
    3. Neither is given: the update runs without Dask.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    variables: list, optional
        List of variables to send to Zarr stores.
        If None, all variables will be sent.
    append_dim: str, default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: str, optional
        Path to file containing model grid parameter.
    update_coords: dict, optional
        Dictionary of coordinate variables to update.
    rechunk: dict, optional
        Rechunk strategy dictionary.
    attrs: dict, optional
        Attributes to add to the dataset.
    client: dask.distributed.Client, optional
        Dask Distributed Client. Ignored when ``dask_cluster_kwargs`` is given.
    dask_config_kwargs: dict[str,str], optional
        Dask configuration settings passed to dask.config.set().
        Only applied when ``dask_cluster_kwargs`` is given.
    dask_cluster_kwargs: dict, optional
        Dask cluster configuration settings passed to LocalCluster().
    zarr_version: int, default=3
        Zarr version to use.
    """
    # Arguments shared by every execution mode; only `parallel` differs.
    update_kwargs = dict(file=file,
                         bucket=bucket,
                         object_prefix=object_prefix,
                         store_credentials_json=store_credentials_json,
                         variables=variables,
                         append_dim=append_dim,
                         grid_filepath=grid_filepath,
                         update_coords=update_coords,
                         rechunk=rechunk,
                         attrs=attrs,
                         zarr_version=zarr_version
                         )

    if dask_cluster_kwargs is not None:
        # === Update Zarr store with Dask === #
        if dask_config_kwargs is not None:
            dask.config.set(dask_config_kwargs)
            logging.info("Updated dask configuration settings.")

        # Create local dask cluster & client:
        with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
            # 'n_workers' is optional for LocalCluster, so fall back to the
            # number of workers the cluster actually started.
            n_workers = dask_cluster_kwargs.get("n_workers", len(cluster.workers))
            logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {client.dashboard_link}")

            # Catch UserWarnings when rechunking data:
            client.register_worker_plugin(CaptureWarningsPlugin())

            _update_zarr(**update_kwargs, parallel=True)

        # --- Shutdown Store & Dask Cluster --- #
        # Leaving the `with` block closes the client and the cluster, also
        # when the update raised, so no explicit close calls are needed.
        logging.info("Dask Cluster has been shutdown.")

    elif client is not None:
        logging.info(f"Using existing Dask Cluster @ Client: {client.dashboard_link}")

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _update_zarr(**update_kwargs, parallel=True)

        # --- Shutdown Store & Dask Cluster --- #
        # No `cluster` object exists in this branch; shutting down through
        # the client stops the connected scheduler and workers.
        client.shutdown()
        logging.info("Existing Dask Cluster has been shutdown.")

    else:
        # === Update Zarr store without Dask === #
        _update_zarr(**update_kwargs, parallel=False)