OceanDataStore 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. OceanDataStore/__init__.py +21 -0
  2. OceanDataStore/catalog/__init__.py +12 -0
  3. OceanDataStore/catalog/oceandatacatalog.py +1242 -0
  4. OceanDataStore/catalog/stac/README.md +34 -0
  5. OceanDataStore/catalog/stac/__init__.py +30 -0
  6. OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
  7. OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
  8. OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
  9. OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
  10. OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
  11. OceanDataStore/catalog/stac/template_collection.py +85 -0
  12. OceanDataStore/catalog/stac/utils.py +476 -0
  13. OceanDataStore/cli/__init__.py +34 -0
  14. OceanDataStore/cli/arg_parser.py +182 -0
  15. OceanDataStore/cli/cli.py +203 -0
  16. OceanDataStore/cli/exceptions.py +83 -0
  17. OceanDataStore/cli/icechunk.py +888 -0
  18. OceanDataStore/cli/logging.py +52 -0
  19. OceanDataStore/cli/object_store.py +293 -0
  20. OceanDataStore/cli/utils.py +275 -0
  21. OceanDataStore/cli/zarr.py +870 -0
  22. OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
  23. OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
  24. OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
  25. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
  26. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
  27. OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
  28. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
  29. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
  30. OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
  31. OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
  32. OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
  33. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
  34. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  35. OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  36. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
  37. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
  38. OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
  39. OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
  40. OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
  41. OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
  42. OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
  43. OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
  44. OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
  45. OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
  46. OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
  47. OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
  48. OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
  49. OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
  50. OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
  51. OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
  52. OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
  53. OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
  54. OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
  55. OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
  56. OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
  57. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
  58. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
  59. OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
  60. OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
  61. OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
  62. OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
  63. OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
  64. OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
  65. OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
  66. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
  67. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
  68. OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
  69. OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
  70. OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
  71. OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
  72. OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
  73. OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
  74. OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
  75. OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
  76. OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
  77. OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
  78. OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
  79. OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
  80. OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
  81. OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
  82. OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
  83. OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
  84. OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
  85. OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
  86. OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
  87. OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
  88. OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
  89. OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
  90. OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
  91. OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
  92. OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
  93. OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
  94. OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
  95. OceanDataStore/data/utils.py +506 -0
  96. OceanDataStore/zarr.py +993 -0
  97. oceandatastore-0.3.0.dist-info/METADATA +184 -0
  98. oceandatastore-0.3.0.dist-info/RECORD +104 -0
  99. oceandatastore-0.3.0.dist-info/WHEEL +5 -0
  100. oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
  101. oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
  102. oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
  103. oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
  104. oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,888 @@
1
+ # ===================================================================
2
+ # Copyright 2026 National Oceanography Centre
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0.
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11
+ # implied. See the License for the specific language governing
12
+ # permissions and limitations under the License.
13
+ # ===================================================================
14
+ """
15
+ icechunk.py
16
+
17
+ Description:
18
+ This module defines the functions to send and update Icechunk Repositories
19
+ in cloud object storage.
20
+
21
+ Authors:
22
+ - Ollie Tooth
23
+ """
24
+ # -- Import Python Modules -- #
25
+ import logging
26
+ from typing import Optional
27
+
28
+ import dask
29
+ import icechunk
30
+ import icechunk.xarray as icechunk_xr
31
+ import numpy as np
32
+ import xarray as xr
33
+ from dask.distributed import Client, LocalCluster
34
+
35
+ from OceanDataStore.cli.exceptions import (
36
+ AppendDimensionError,
37
+ AppendDimensionSizeError,
38
+ ChunkSizeError,
39
+ DimensionNotFound,
40
+ DimensionSizeError,
41
+ )
42
+ from OceanDataStore.cli.object_store import ObjectStoreS3
43
+ from OceanDataStore.cli.utils import (
44
+ CaptureWarningsPlugin,
45
+ _preprocess_dataset,
46
+ timer,
47
+ )
48
+
49
+
50
+ # ======== Define Icechunk Validation Functions ======== #
51
+ def _check_icechunk_compatibility(
52
+ data: xr.DataArray | xr.Dataset,
53
+ dest: str,
54
+ repo: icechunk.Repository,
55
+ branch: str,
56
+ append_dim: str,
57
+ rechunk: dict,
58
+ group: Optional[str] = None,
59
+ ) -> None:
60
+ """
61
+ Check compatibility of DataArray or Dataset to update existing
62
+ IcechunkStore in cloud object storage.
63
+
64
+ Parameters
65
+ ----------
66
+ data: xr.DataArray | xr.Dataset
67
+ DataArray or DataSet to update existing IcechunkStore.
68
+ dest: str
69
+ Path to Icechunk repository in the object store.
70
+ repo: icechunk.Repository
71
+ Icechunk repository in which to write data to IcechunkStore.
72
+ branch: str
73
+ Branch on which to write data to IcechunkStore.
74
+ append_dim: str
75
+ Dimension to append data to existing IcechunkStore.
76
+ rechunk: dict
77
+ Mapping to rechunk dimensions.
78
+ group: Optional[str], default=None
79
+ Group in IcechunkStore to update.
80
+ """
81
+ # === Initialise IcechunkStore from session === #
82
+ store = repo.readonly_session(branch=branch).store
83
+
84
+ # 1. Check if IcechunkStore exists:
85
+ try:
86
+ ds_store = xr.open_zarr(store, group=group, consolidated=False)
87
+ except Exception as e:
88
+ raise FileNotFoundError(f"IcechunkStore not found in repository: {e}")
89
+
90
+ # 2. Check if core dimensions exist in IcechunkStore & sizes are consistent:
91
+ dims_data = {dim : data.sizes[dim] for dim in data.dims if dim != append_dim}
92
+ for dim in dims_data:
93
+ if dim in ds_store.dims:
94
+ if dims_data[dim] != ds_store.sizes[dim]:
95
+ raise DimensionSizeError(dim=dim, size=data.sizes[dim], expected_size=ds_store.sizes[dim])
96
+ else:
97
+ raise DimensionNotFound(dim=dim, object_name=dest)
98
+
99
+ # 3. Check if append dimension values are consistent:
100
+ if (data[append_dim][0] < ds_store[append_dim][0]):
101
+ raise AppendDimensionError(dim=append_dim)
102
+
103
+ # 4. Check if specified chunks are consistent:
104
+ if rechunk is not None:
105
+ for dim in rechunk:
106
+ if dim in ds_store.dims:
107
+ if rechunk[dim] != ds_store.chunks[dim][0]:
108
+ raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
109
+
110
+
111
+ # ======== Define Icechunk Writer Functions ======== #
112
+ def _write_to_icechunk(
113
+ data: xr.DataArray | xr.Dataset,
114
+ dest: str,
115
+ repo: icechunk.Repository,
116
+ commit_message: str,
117
+ branch: Optional[str] = "main",
118
+ group: Optional[str] = None,
119
+ ) -> None:
120
+ """
121
+ Write DataArray or Dataset to IcechunkStore in cloud
122
+ object storage.
123
+
124
+ Parameters
125
+ ----------
126
+ data: xr.DataArray | xr.Dataset
127
+ DataArray or DataSet to write to IcechunkStore.
128
+ dest: str
129
+ Path to Icechunk repository in the object store.
130
+ repo: icechunk.Repository
131
+ Icechunk repository in which to write data to
132
+ IcechunkStore.
133
+ commit_message: str
134
+ Commit message when updating the Icechunk repository.
135
+ branch: Optional[str], default="main"
136
+ Branch on which to write data to IcechunkStore.
137
+ group: Optional[str], default=None
138
+ Group in IcechunkStore to write data to.
139
+ """
140
+ # === Convert DataArrays to Datasets === #
141
+ if isinstance(data, xr.DataArray):
142
+ var = data.name
143
+ data = data.to_dataset()
144
+ else:
145
+ var = None
146
+
147
+ # === Write Data to IcechunkStore & Commit === #
148
+ with timer(action='send', dest=dest, var=var):
149
+ session = repo.writable_session(branch=branch)
150
+ icechunk_xr.to_icechunk(data, session=session, group=group, mode='a')
151
+ session.commit(message=commit_message)
152
+
153
+
154
+ def _append_to_icechunk(
155
+ data: xr.DataArray | xr.Dataset,
156
+ dest: str,
157
+ repo: icechunk.Repository,
158
+ commit_message: str,
159
+ branch: Optional[str] = "main",
160
+ group: Optional[str] = None,
161
+ append_dim: Optional[str] = "time_counter",
162
+ ) -> None:
163
+ """
164
+ Append DataArray or Dataset to existing IcechunkStore in
165
+ cloud object storage.
166
+
167
+ Parameters
168
+ ----------
169
+ data: xr.DataArray | xr.Dataset
170
+ DataArray or DataSet to append to existing IcechunkStore.
171
+ dest: str
172
+ Path to Icechunk repository in the object store.
173
+ repo: icechunk.Repository
174
+ Icechunk repository in which to write data to
175
+ IcechunkStore.
176
+ commit_message: str
177
+ Commit message when updating the Icechunk repository.
178
+ branch: Optional[str], default="main"
179
+ Branch on which to write data to IcechunkStore.
180
+ group: Optional[str], default=None
181
+ Group in IcechunkStore to append data to.
182
+ append_dim: Optional[str], default="time_counter"
183
+ Dimension to append data to existing IcechunkStore.
184
+ """
185
+ # === Convert DataArrays to Datasets === #
186
+ if isinstance(data, xr.DataArray):
187
+ data = data.to_dataset()
188
+
189
+ # === Append Data to IcechunkStore & Commit === #
190
+ with timer(action='append', dest=dest):
191
+ session = repo.writable_session(branch=branch)
192
+ icechunk_xr.to_icechunk(obj=data, session=session, group=group, append_dim=append_dim)
193
+ session.commit(message=commit_message)
194
+
195
+
196
+ def _replace_in_icechunk(
197
+ data: xr.DataArray | xr.Dataset,
198
+ dest: str,
199
+ region: dict,
200
+ repo: icechunk.Repository,
201
+ commit_message: str,
202
+ branch: Optional[str] = "main",
203
+ group: Optional[str] = None,
204
+ ) -> None:
205
+ """
206
+ Replace data in existing IcechunkStore in cloud object storage.
207
+
208
+ Parameters
209
+ ----------
210
+ data: xr.DataArray | xr.Dataset
211
+ DataArray or Dataset used to replace data in existing IcechunkStore.
212
+ dest: str
213
+ Path to Icechunk repository in the object store.
214
+ region: dict
215
+ Region of existing IcechunkStore to replace data.
216
+ repo: icechunk.Repository
217
+ Icechunk repository in which to replace data in IcechunkStore.
218
+ commit_message: str
219
+ Commit message when updating the Icechunk repository.
220
+ branch: Optional[str], default="main"
221
+ Branch on which to write data to IcechunkStore.
222
+ group: Optional[str], default=None
223
+ Group in IcechunkStore to replace data in.
224
+ """
225
+ # === Convert DataArrays to Datasets === #
226
+ if isinstance(data, xr.DataArray):
227
+ data = data.to_dataset()
228
+
229
+ # Drop variables w/o append dimension:
230
+ append_dim = list(region.keys())[0]
231
+ drop_list = [var for var in data.variables if append_dim not in data[var].dims]
232
+ data = data.drop_vars(drop_list)
233
+
234
+ # === Write Data to IcechunkStore & Commit === #
235
+ with timer(action='replace', dest=dest):
236
+ session = repo.writable_session(branch=branch)
237
+ icechunk_xr.to_icechunk(obj=data, session=session, region=region, group=group)
238
+ session.commit(message=commit_message)
239
+
240
+
241
+ def _update_icechunk_store(
242
+ data: xr.DataArray | xr.Dataset,
243
+ dest: str,
244
+ repo: icechunk.Repository,
245
+ commit_message: str,
246
+ branch: Optional[str] = "main",
247
+ group: Optional[str] = None,
248
+ append_dim: Optional[str] = "time_counter",
249
+ rechunk: Optional[dict] = None,
250
+ ) -> None:
251
+ """
252
+ Update an existing IcechunkStore in object storage by replacing
253
+ existing values and/or appending new values.
254
+
255
+ Parameters
256
+ ----------
257
+ data: xr.DataArray | xr.Dataset
258
+ DataArray or DataSet to append to existing IcechunkStore.
259
+ dest: str
260
+ Path to Icechunk repository in the object store.
261
+ repo: icechunk.Repository
262
+ Icechunk repository in which to write data to
263
+ IcechunkStore.
264
+ commit_message: str
265
+ Commit message when updating the Icechunk repository.
266
+ branch: str, default="main"
267
+ Branch on which to write data to IcechunkStore.
268
+ group: Optional[str], default=None
269
+ Group in IcechunkStore to update.
270
+ append_dim: Optional[str], default="time_counter"
271
+ Dimension to append data to existing IcechunkStore.
272
+ rechunk: Optional[dict], default=None
273
+ Mapping to rechunk dimensions.
274
+ """
275
+ # Convert DataArrays to Datasets:
276
+ if isinstance(data, xr.DataArray):
277
+ var = data.name
278
+ ds_source = data.to_dataset()
279
+ else:
280
+ var = None
281
+ ds_source = data
282
+
283
+ # Extract source & target append dimension values:
284
+ store = repo.readonly_session(branch=branch).store
285
+ ds_target = xr.open_zarr(store, group=group, consolidated=False)
286
+ target_append_dim = ds_target[append_dim].values
287
+ source_append_dim = ds_source[append_dim].values
288
+
289
+ # === Update existing variable in IcechunkStore === #
290
+ if (var in ds_target.data_vars) or (var is None):
291
+ # Check source Dataset compatibility with existing store:
292
+ _check_icechunk_compatibility(data=ds_source,
293
+ dest=dest,
294
+ repo=repo,
295
+ branch=branch,
296
+ append_dim=append_dim,
297
+ rechunk=rechunk,
298
+ group=group
299
+ )
300
+ logging.info(f"Passed Compatibility Checks for IcechunkStore {dest}")
301
+
302
+ # Determine intersection between source & target append dimensions:
303
+ intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)
304
+
305
+ if intersect_append_dim.size != 0:
306
+ # == Intersection exists -> replace overlapping values in target store == #
307
+
308
+ # Ensure all overlapping values exist along target append dimension:
309
+ overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
310
+ if intersect_append_dim.size != overlap_append_dim:
311
+ raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)
312
+
313
+ # Determine source and target append dimension indices of overlap:
314
+ target_ind_min = np.flatnonzero(target_append_dim == source_append_dim[0])[0]
315
+ target_ind_max = target_append_dim.size
316
+ source_ind_min = 0
317
+ source_ind_max = target_ind_max - target_ind_min
318
+ source_ind_size = source_append_dim.size
319
+
320
+ # 1. Replace overlapping values in target IcechunkStore:
321
+ logging.info(f"Updating {dest} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
322
+ if var is not None:
323
+ rep_commit_message = f"{commit_message} -> Updated {var} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}."
324
+ else:
325
+ rep_commit_message = f"{commit_message} -> Updated {dest} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}."
326
+
327
+ _replace_in_icechunk(data=ds_source.isel({append_dim : slice(source_ind_min, source_ind_max)}),
328
+ repo=repo,
329
+ dest=dest,
330
+ region={append_dim : slice(target_ind_min, target_ind_max)},
331
+ commit_message=rep_commit_message,
332
+ branch=branch,
333
+ group=group
334
+ )
335
+
336
+ # 2. Append new values to target IcechunkStore:
337
+ if source_ind_size > source_ind_max:
338
+ logging.info(f"Appending to {dest} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
339
+ if var is not None:
340
+ app_commit_message = f"{commit_message} -> Appended to {var} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}."
341
+ else:
342
+ app_commit_message = f"{commit_message} -> Appended to {dest} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}."
343
+
344
+ _append_to_icechunk(data=ds_source.isel({append_dim : slice(source_ind_max, source_ind_size)}),
345
+ repo=repo,
346
+ dest=dest,
347
+ commit_message=app_commit_message,
348
+ branch=branch,
349
+ group=group,
350
+ append_dim=append_dim
351
+ )
352
+ else:
353
+ # == No intersection -> append all source values to target IcechunkStore == #
354
+ logging.info(f"Appending to {dest} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}.")
355
+ if var is not None:
356
+ app_commit_message = f"{commit_message} -> Appended {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
357
+ else:
358
+ app_commit_message = f"{commit_message} -> Appended to {dest} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
359
+
360
+ _append_to_icechunk(data=ds_source,
361
+ repo=repo,
362
+ dest=dest,
363
+ commit_message=app_commit_message,
364
+ branch=branch,
365
+ group=group,
366
+ append_dim=append_dim
367
+ )
368
+ else:
369
+ # == Add new variable to IcechunkStore == #
370
+ logging.info(f"Sending Variable {var}")
371
+ snd_commit_message = f"{commit_message} -> Sent {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
372
+ _write_to_icechunk(data=ds_source,
373
+ dest=dest,
374
+ repo=repo,
375
+ commit_message=snd_commit_message,
376
+ branch=branch,
377
+ group=group
378
+ )
379
+
380
+
381
+ def _send_to_icechunk(
382
+ file: list[str] | str | xr.Dataset,
383
+ bucket: str,
384
+ object_prefix: str,
385
+ store_credentials_json: str,
386
+ exists: Optional[bool] = False,
387
+ group: Optional[str] = None,
388
+ variables: Optional[list[str]] = None,
389
+ append_dim: Optional[str] = 'time_counter',
390
+ grid_filepath: Optional[str] = None,
391
+ update_coords: Optional[dict] = None,
392
+ rechunk: Optional[dict] = None,
393
+ attrs: Optional[dict] = None,
394
+ parallel: Optional[bool] = False,
395
+ branch: Optional[str] = "main",
396
+ commit_message: Optional[str] = "Add new data to my Icechunk repository",
397
+ variable_commits: Optional[bool] = False,
398
+ icechunk_config: Optional[dict] = None,
399
+ ) -> None:
400
+ """
401
+ Write data to new Icechunk repository in cloud object storage.
402
+
403
+ Parameters
404
+ ----------
405
+ file: list | str | xarray.Dataset
406
+ Regular expression or list of filepaths to netCDF file(s).
407
+ Users can also pass a single xarray.Dataset directly.
408
+ bucket: str
409
+ Name of the bucket in the object store. Bucket names can contain only
410
+ lowercase letters, numbers, dots (.), and hyphens (-).
411
+ object_prefix: str
412
+ Prefix to be added to the object names in the object store.
413
+ store_credentials_json: str
414
+ Path to the JSON file containing the object store credentials.
415
+ exists: Optional[bool], default=False
416
+ Whether to write to an existing Icechunk repository or create a new repository.
417
+ group: Optional[str], default=None
418
+ Group in Icechunk repository to write data to.
419
+ variables: Optional[list[str]], default=None
420
+ List of variables to send. If None, all variables will be sent.
421
+ append_dim: Optional[str], default='time_counter'
422
+ Name of the dimension to append multifile datasets.
423
+ grid_filepath: Optional[str], default=None
424
+ Path to file containing model grid parameter.
425
+ update_coords: Optional[dict], default=None
426
+ Dictionary of coordinate variables to update.
427
+ rechunk: Optional[dict], default=None
428
+ Rechunk strategy dictionary, by default None.
429
+ attrs: Optional[dict], default=None
430
+ Attributes to add to the dataset.
431
+ parallel: Optional[bool], default=False
432
+ Whether to perform open and preprocess steps in parallel using
433
+ `dask.delayed`.
434
+ branch: Optional[str], default="main"
435
+ Branch on which to write data to IcechunkStore.
436
+ commit_message: Optional[str], default="Initial commit"
437
+ Commit message when updating the Icechunk repository.
438
+ variable_commits: Optional[bool], default=False
439
+ Whether to write each variable to Icechunk repository using
440
+ separate commits.
441
+ icechunk_config: Optional[dict], default=None
442
+ Icechunk repository configuration.
443
+ """
444
+ # === Initialise Synchronous Object Store === #
445
+ logging.info("Reading object store credentials from %s", store_credentials_json)
446
+ obj_store = ObjectStoreS3(anon=False,
447
+ asynchronous=False,
448
+ store_credentials_json=store_credentials_json
449
+ )
450
+
451
+ if icechunk_config is None:
452
+ icechunk_config = {"storage_config_kwargs": {'region': 'us-east-1', 'force_path_style': True},
453
+ "repository_config_kwargs": {},
454
+ "storage_settings_kwargs": {'unsafe_use_conditional_update': False, 'unsafe_use_conditional_create': False},
455
+ }
456
+
457
+ # === Preprocess Data === #
458
+ ds_filepath = _preprocess_dataset(file=file,
459
+ rechunk=rechunk,
460
+ append_dim=append_dim,
461
+ update_coords=update_coords,
462
+ grid_filepath=grid_filepath,
463
+ attrs=attrs,
464
+ parallel=parallel,
465
+ )
466
+
467
+ # Consider variables with append dimension only:
468
+ if variables is None:
469
+ variables = list(ds_filepath.data_vars)
470
+
471
+ # Extract append dimension values:
472
+ if append_dim in ds_filepath.dims:
473
+ source_append_dim = ds_filepath[append_dim].values
474
+
475
+ # === Send Variables to Icechunk Repo === #
476
+ if exists:
477
+ # Open existing Icechunk repo:
478
+ try:
479
+ repo = obj_store.open_icechunk_repo(bucket=bucket,
480
+ prefix=object_prefix,
481
+ storage_config_kwargs=icechunk_config["storage_config_kwargs"],
482
+ repository_config_kwargs=icechunk_config["repository_config_kwargs"],
483
+ storage_settings_kwargs=icechunk_config["storage_settings_kwargs"],
484
+ )
485
+ except icechunk.IcechunkError:
486
+ logging.info(f"Failed to open existing Icechunk repository at {bucket}/{object_prefix}")
487
+
488
+ else:
489
+ try:
490
+ # Create new Icechunk repo:
491
+ repo = obj_store.create_icechunk_repo(bucket=bucket,
492
+ prefix=object_prefix,
493
+ storage_config_kwargs=icechunk_config["storage_config_kwargs"],
494
+ repository_config_kwargs=icechunk_config["repository_config_kwargs"],
495
+ storage_settings_kwargs=icechunk_config["storage_settings_kwargs"],
496
+ )
497
+ except icechunk.IcechunkError:
498
+ logging.info(f"Failed to create new Icechunk repository at {bucket}/{object_prefix}")
499
+
500
+ # Write data to Icechunk repository:
501
+ if variable_commits:
502
+ for var in variables:
503
+ logging.info(f"Sending Variable: {var}")
504
+ if append_dim in ds_filepath[var].dims:
505
+ snd_commit_message = f"{commit_message} -> Sent {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
506
+ else:
507
+ snd_commit_message = f"{commit_message} -> Sent {var}."
508
+
509
+ # Write each variable using separate commits to the repo:
510
+ _write_to_icechunk(data=ds_filepath[var],
511
+ dest=f"{bucket}/{object_prefix}",
512
+ repo=repo,
513
+ commit_message=snd_commit_message,
514
+ branch=branch,
515
+ group=group
516
+ )
517
+ else:
518
+ # Write all variables using single commit to the repo:
519
+ logging.info(f"Sending Dataset: {object_prefix}")
520
+ if append_dim in ds_filepath.dims:
521
+ snd_commit_message = f"{commit_message} -> Sent dataset along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
522
+ else:
523
+ snd_commit_message = f"{commit_message} -> Sent dataset."
524
+
525
+ _write_to_icechunk(data=ds_filepath[variables],
526
+ dest=f"{bucket}/{object_prefix}",
527
+ repo=repo,
528
+ commit_message=snd_commit_message,
529
+ branch=branch,
530
+ group=group
531
+ )
532
+
533
+ # Release resources to avoid memory leaks:
534
+ ds_filepath.close()
535
+
536
+
537
+ def _update_icechunk(
538
+ file: list[str] | str | xr.Dataset,
539
+ bucket: str,
540
+ object_prefix: str,
541
+ store_credentials_json: str,
542
+ group: Optional[str] = None,
543
+ variables: Optional[list[str]] = None,
544
+ append_dim: Optional[str] = 'time_counter',
545
+ grid_filepath: Optional[str] = None,
546
+ update_coords: Optional[dict] = None,
547
+ rechunk: Optional[dict] = None,
548
+ attrs: Optional[dict] = None,
549
+ parallel: bool = False,
550
+ branch: str = "main",
551
+ commit_message: str = "Update data in my Icechunk repository",
552
+ icechunk_config: Optional[dict] = None,
553
+ ) -> None:
554
+ """
555
+ Update data in existing Icechunk repository in cloud object storage
556
+ by replacing and/or appending data.
557
+
558
+ Parameters
559
+ ----------
560
+ file: list | str | xarray.Dataset
561
+ Regular expression or list of filepaths to netCDF file(s).
562
+ Users can also pass a single xarray.Dataset directly.
563
+ bucket: str
564
+ Name of the bucket in the object store. Bucket names can contain only
565
+ lowercase letters, numbers, dots (.), and hyphens (-).
566
+ object_prefix: str
567
+ Prefix to be added to the object names in the object store.
568
+ store_credentials_json: str
569
+ Path to the JSON file containing the object store credentials.
570
+ group: Optional[str], default=None
571
+ Group in Icechunk repository to write data to.
572
+ variables: Optional[list[str]], default=None
573
+ List of variables to send. If None, all variables will be sent.
574
+ append_dim: Optional[str], default='time_counter'
575
+ Name of the dimension to append multifile datasets.
576
+ grid_filepath: Optional[str], default=None
577
+ Path to file containing model grid parameter.
578
+ update_coords: Optional[dict], default=None
579
+ Dictionary of coordinate variables to update.
580
+ rechunk: Optional[dict], default=None
581
+ Rechunk strategy dictionary, by default None.
582
+ attrs: Optional[dict], default=None
583
+ Attributes to add to the dataset.
584
+ parallel: Optional[bool], default=False
585
+ Whether to perform open and preprocess steps in parallel using
586
+ `dask.delayed`.
587
+ branch: Optional[str], default="main"
588
+ Branch on which to write data to IcechunkStore.
589
+ commit_message: Optional[str], default="Update commit"
590
+ Commit message when updating the Icechunk repository.
591
+ icechunk_config: Optional[dict], default=None
592
+ Icechunk repository configuration.
593
+ """
594
+ # === Initialise Synchronous Object Store === #
595
+ logging.info("Reading object store credentials from %s", store_credentials_json)
596
+ obj_store = ObjectStoreS3(anon=False,
597
+ asynchronous=False,
598
+ store_credentials_json=store_credentials_json
599
+ )
600
+
601
+ if icechunk_config is None:
602
+ icechunk_config = {"storage_config_kwargs": {'region': 'us-east-1', 'force_path_style': True},
603
+ "repository_config_kwargs": {},
604
+ "storage_settings_kwargs": {'unsafe_use_conditional_update': False, 'unsafe_use_conditional_create': False},
605
+ }
606
+
607
+ # === Preprocess Data === #
608
+ ds_filepath = _preprocess_dataset(file=file,
609
+ rechunk=rechunk,
610
+ append_dim=append_dim,
611
+ update_coords=update_coords,
612
+ grid_filepath=grid_filepath,
613
+ attrs=attrs,
614
+ parallel=parallel,
615
+ )
616
+
617
+ if variables is None:
618
+ variables = list(ds_filepath.data_vars)
619
+ # Consider variables with append dimension only:
620
+ variables = [var for var in variables if append_dim in ds_filepath[var].dims]
621
+
622
+ # === Update Variables in Icechunk Repo === #
623
+ try:
624
+ # Open existing Icechunk repo:
625
+ repo = obj_store.open_icechunk_repo(bucket=bucket,
626
+ prefix=object_prefix,
627
+ storage_config_kwargs=icechunk_config["storage_config_kwargs"],
628
+ repository_config_kwargs=icechunk_config["repository_config_kwargs"],
629
+ storage_settings_kwargs=icechunk_config["storage_settings_kwargs"],
630
+ )
631
+
632
+ # Update dataset using single commit to the repo:
633
+ logging.info(f"Updating Dataset {object_prefix}")
634
+ _update_icechunk_store(data=ds_filepath[variables],
635
+ dest=f"{bucket}/{object_prefix}",
636
+ repo=repo,
637
+ commit_message=commit_message,
638
+ branch=branch,
639
+ group=group,
640
+ append_dim=append_dim,
641
+ rechunk=rechunk,
642
+ )
643
+
644
+ except icechunk.IcechunkError:
645
+ logging.info(f"Skipping Dataset: Icechunk repository does not exist at {bucket}/{object_prefix}")
646
+
647
+ # Release resources to avoid memory leaks:
648
+ ds_filepath.close()
649
+
650
+
651
+ # ============ Define Public Functions ============ #
652
def send_to_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    exists: Optional[bool] = False,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    branch: Optional[str] = "main",
    commit_message: Optional[str] = "Add new data to my Icechunk repository",
    variable_commits: Optional[bool] = False,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Write data to an Icechunk repository in cloud object storage with
    option of using dask.

    When ``dask_cluster_kwargs`` is given, a ``LocalCluster`` and ``Client``
    are created for the duration of the write and the data are sent with
    ``parallel=True``. Otherwise the data are sent with ``parallel=False``
    and ``dask_config_kwargs`` is ignored.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    exists: Optional[bool], default=False
        Whether to write to an existing Icechunk repository or create a new repository.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all variables will be sent.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    branch: Optional[str], default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: Optional[str], default="Add new data to my Icechunk repository"
        Commit message when writing to the Icechunk repository.
    variable_commits: Optional[bool], default=False
        Whether to write each variable to Icechunk repository using
        separate commits.
    dask_config_kwargs: Optional[dict], default=None
        Dask configuration settings passed to dask.config.set().
        Only applied when ``dask_cluster_kwargs`` is not None.
    dask_cluster_kwargs: Optional[dict], default=None
        Dask cluster configuration settings passed to LocalCluster().
        If None, no Dask cluster is created.
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration.
    """
    # Arguments shared by the Dask and non-Dask code paths; only `parallel`
    # differs between the two calls below.
    send_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        exists=exists,
        group=group,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        branch=branch,
        commit_message=commit_message,
        variable_commits=variable_commits,
        icechunk_config=icechunk_config,
    )

    if dask_cluster_kwargs is None:
        # === Send to Icechunk repo(s) without Dask === #
        _send_to_icechunk(parallel=False, **send_kwargs)
        return

    # === Send to Icechunk repo(s) with Dask === #
    if dask_config_kwargs is not None:
        dask.config.set(dask_config_kwargs)
        logging.info("Updated dask configuration settings.")

    # Create local dask cluster & client. Leaving the `with` block closes the
    # client and then the cluster (also when the write raises), so no explicit
    # close()/shutdown() calls are needed.
    with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
        # `n_workers` is optional for LocalCluster, so do not assume the key
        # is present just to build the log message.
        n_workers = dask_cluster_kwargs.get('n_workers', 'a default number of')
        logging.info(
            "Created LocalCluster with %s workers @ Client: %s",
            n_workers,
            client.dashboard_link,
        )

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _send_to_icechunk(parallel=True, **send_kwargs)

    logging.info("Dask Cluster has been shutdown.")
775
+
776
+
777
def update_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    branch: Optional[str] = "main",
    commit_message: Optional[str] = "Update data in my Icechunk repository",
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Update data in existing Icechunk repository in cloud object
    storage with option of using dask.

    When ``dask_cluster_kwargs`` is given, a ``LocalCluster`` and ``Client``
    are created for the duration of the update and the data are processed
    with ``parallel=True``. Otherwise the update runs with ``parallel=False``
    and ``dask_config_kwargs`` is ignored.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all variables will be sent.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    branch: Optional[str], default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: Optional[str], default="Update data in my Icechunk repository"
        Commit message when updating the Icechunk repository.
    dask_config_kwargs: Optional[dict], default=None
        Dask configuration settings passed to dask.config.set().
        Only applied when ``dask_cluster_kwargs`` is not None.
    dask_cluster_kwargs: Optional[dict], default=None
        Dask cluster configuration settings passed to LocalCluster().
        If None, no Dask cluster is created.
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration.
    """
    # Arguments shared by the Dask and non-Dask code paths; only `parallel`
    # differs between the two calls below.
    update_kwargs = dict(
        file=file,
        bucket=bucket,
        object_prefix=object_prefix,
        store_credentials_json=store_credentials_json,
        group=group,
        variables=variables,
        append_dim=append_dim,
        grid_filepath=grid_filepath,
        update_coords=update_coords,
        rechunk=rechunk,
        attrs=attrs,
        branch=branch,
        commit_message=commit_message,
        icechunk_config=icechunk_config,
    )

    if dask_cluster_kwargs is None:
        # === Update Icechunk repo(s) without Dask === #
        _update_icechunk(parallel=False, **update_kwargs)
        return

    # === Update Icechunk repo(s) with Dask === #
    if dask_config_kwargs is not None:
        dask.config.set(dask_config_kwargs)
        logging.info("Updated dask configuration settings.")

    # Create local dask cluster & client. Leaving the `with` block closes the
    # client and then the cluster (also when the update raises), so no
    # explicit close()/shutdown() calls are needed.
    with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
        # `n_workers` is optional for LocalCluster, so do not assume the key
        # is present just to build the log message.
        n_workers = dask_cluster_kwargs.get('n_workers', 'a default number of')
        logging.info(
            "Created LocalCluster with %s workers @ Client: %s",
            n_workers,
            client.dashboard_link,
        )

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _update_icechunk(parallel=True, **update_kwargs)

    logging.info("Dask Cluster has been shutdown.")