OceanDataStore 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OceanDataStore/__init__.py +21 -0
- OceanDataStore/catalog/__init__.py +12 -0
- OceanDataStore/catalog/oceandatacatalog.py +1242 -0
- OceanDataStore/catalog/stac/README.md +34 -0
- OceanDataStore/catalog/stac/__init__.py +30 -0
- OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
- OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
- OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
- OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
- OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
- OceanDataStore/catalog/stac/template_collection.py +85 -0
- OceanDataStore/catalog/stac/utils.py +476 -0
- OceanDataStore/cli/__init__.py +34 -0
- OceanDataStore/cli/arg_parser.py +182 -0
- OceanDataStore/cli/cli.py +203 -0
- OceanDataStore/cli/exceptions.py +83 -0
- OceanDataStore/cli/icechunk.py +888 -0
- OceanDataStore/cli/logging.py +52 -0
- OceanDataStore/cli/object_store.py +293 -0
- OceanDataStore/cli/utils.py +275 -0
- OceanDataStore/cli/zarr.py +870 -0
- OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
- OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
- OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
- OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
- OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
- OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
- OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
- OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
- OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
- OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
- OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
- OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
- OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
- OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
- OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
- OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
- OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
- OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
- OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
- OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
- OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
- OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
- OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
- OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
- OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
- OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
- OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
- OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
- OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
- OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
- OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
- OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
- OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
- OceanDataStore/data/utils.py +506 -0
- OceanDataStore/zarr.py +993 -0
- oceandatastore-0.3.0.dist-info/METADATA +184 -0
- oceandatastore-0.3.0.dist-info/RECORD +104 -0
- oceandatastore-0.3.0.dist-info/WHEEL +5 -0
- oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
- oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
- oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
- oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
- oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,888 @@
|
|
|
1
|
+
# ===================================================================
|
|
2
|
+
# Copyright 2026 National Oceanography Centre
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0.
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
11
|
+
# implied. See the License for the specific language governing
|
|
12
|
+
# permissions and limitations under the License.
|
|
13
|
+
# ===================================================================
|
|
14
|
+
"""
|
|
15
|
+
icechunk.py
|
|
16
|
+
|
|
17
|
+
Description:
|
|
18
|
+
This module defines the functions to send and update Icechunk Repositories
|
|
19
|
+
in cloud object storage.
|
|
20
|
+
|
|
21
|
+
Authors:
|
|
22
|
+
- Ollie Tooth
|
|
23
|
+
"""
|
|
24
|
+
# -- Import Python Modules -- #
|
|
25
|
+
import logging
|
|
26
|
+
from typing import Optional
|
|
27
|
+
|
|
28
|
+
import dask
|
|
29
|
+
import icechunk
|
|
30
|
+
import icechunk.xarray as icechunk_xr
|
|
31
|
+
import numpy as np
|
|
32
|
+
import xarray as xr
|
|
33
|
+
from dask.distributed import Client, LocalCluster
|
|
34
|
+
|
|
35
|
+
from OceanDataStore.cli.exceptions import (
|
|
36
|
+
AppendDimensionError,
|
|
37
|
+
AppendDimensionSizeError,
|
|
38
|
+
ChunkSizeError,
|
|
39
|
+
DimensionNotFound,
|
|
40
|
+
DimensionSizeError,
|
|
41
|
+
)
|
|
42
|
+
from OceanDataStore.cli.object_store import ObjectStoreS3
|
|
43
|
+
from OceanDataStore.cli.utils import (
|
|
44
|
+
CaptureWarningsPlugin,
|
|
45
|
+
_preprocess_dataset,
|
|
46
|
+
timer,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ======== Define Icechunk Validation Functions ======== #
|
|
51
|
+
def _check_icechunk_compatibility(
    data: xr.DataArray | xr.Dataset,
    dest: str,
    repo: icechunk.Repository,
    branch: str,
    append_dim: str,
    rechunk: dict,
    group: Optional[str] = None,
) -> None:
    """
    Check compatibility of DataArray or Dataset to update existing
    IcechunkStore in cloud object storage.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to update existing IcechunkStore.
    dest: str
        Path to Icechunk repository in the object store.
    repo: icechunk.Repository
        Icechunk repository in which to write data to IcechunkStore.
    branch: str
        Branch on which to write data to IcechunkStore.
    append_dim: str
        Dimension to append data to existing IcechunkStore.
    rechunk: dict
        Mapping to rechunk dimensions. The chunk checks are skipped
        when this is None.
    group: Optional[str], default=None
        Group in IcechunkStore to update.

    Raises
    ------
    FileNotFoundError
        If the existing IcechunkStore cannot be opened from the branch.
    DimensionNotFound
        If a dimension of ``data`` (other than ``append_dim``) does not
        exist in the IcechunkStore.
    DimensionSizeError
        If a shared dimension has a different size in ``data`` and in
        the IcechunkStore.
    AppendDimensionError
        If the first ``append_dim`` value of ``data`` is smaller than the
        first ``append_dim`` value of the IcechunkStore.
    ChunkSizeError
        If a requested chunk size differs from the first chunk size
        stored for that dimension.
    """
    # === Initialise IcechunkStore from session === #
    store = repo.readonly_session(branch=branch).store

    # 1. Check if IcechunkStore exists:
    try:
        ds_store = xr.open_zarr(store, group=group, consolidated=False)
    except Exception as e:
        # Chain explicitly so the underlying failure is reported as the
        # direct cause of the FileNotFoundError.
        raise FileNotFoundError(f"IcechunkStore not found in repository: {e}") from e

    # 2. Check if core dimensions exist in IcechunkStore & sizes are consistent:
    for dim in data.dims:
        # The append dimension is expected to differ in size.
        if dim == append_dim:
            continue
        if dim not in ds_store.dims:
            raise DimensionNotFound(dim=dim, object_name=dest)
        if data.sizes[dim] != ds_store.sizes[dim]:
            raise DimensionSizeError(dim=dim, size=data.sizes[dim], expected_size=ds_store.sizes[dim])

    # 3. Check if append dimension values are consistent:
    if (data[append_dim][0] < ds_store[append_dim][0]):
        raise AppendDimensionError(dim=append_dim)

    # 4. Check if specified chunks are consistent:
    if rechunk is not None:
        for dim in rechunk:
            # Only dimensions present in the store can be compared.
            if dim in ds_store.dims:
                if rechunk[dim] != ds_store.chunks[dim][0]:
                    raise ChunkSizeError(chunks=rechunk, store_chunks=ds_store.chunks)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ======== Define Icechunk Writer Functions ======== #
|
|
112
|
+
def _write_to_icechunk(
    data: xr.DataArray | xr.Dataset,
    dest: str,
    repo: icechunk.Repository,
    commit_message: str,
    branch: Optional[str] = "main",
    group: Optional[str] = None,
) -> None:
    """
    Send a DataArray or Dataset to an IcechunkStore in cloud object
    storage and commit the change.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        Data to write. A DataArray is converted to a Dataset first.
    dest: str
        Path to Icechunk repository in the object store (passed to the
        timer).
    repo: icechunk.Repository
        Icechunk repository providing the writable session.
    commit_message: str
        Message recorded with the commit.
    branch: Optional[str], default="main"
        Branch on which the writable session is opened.
    group: Optional[str], default=None
        Group in IcechunkStore to write data to.
    """
    # === Normalise input to a Dataset, remembering the variable name === #
    is_data_array = isinstance(data, xr.DataArray)
    var = data.name if is_data_array else None
    if is_data_array:
        data = data.to_dataset()

    # === Open session, write & commit inside the timer === #
    with timer(action='send', dest=dest, var=var):
        writable = repo.writable_session(branch=branch)
        icechunk_xr.to_icechunk(data, session=writable, group=group, mode='a')
        writable.commit(message=commit_message)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _append_to_icechunk(
    data: xr.DataArray | xr.Dataset,
    dest: str,
    repo: icechunk.Repository,
    commit_message: str,
    branch: Optional[str] = "main",
    group: Optional[str] = None,
    append_dim: Optional[str] = "time_counter",
) -> None:
    """
    Extend an existing IcechunkStore in cloud object storage along
    one dimension and commit the change.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        Data to append. A DataArray is converted to a Dataset first.
    dest: str
        Path to Icechunk repository in the object store (passed to the
        timer).
    repo: icechunk.Repository
        Icechunk repository providing the writable session.
    commit_message: str
        Message recorded with the commit.
    branch: Optional[str], default="main"
        Branch on which the writable session is opened.
    group: Optional[str], default=None
        Group in IcechunkStore to append data to.
    append_dim: Optional[str], default="time_counter"
        Dimension along which the data is appended.
    """
    # === Normalise input to a Dataset === #
    dataset = data.to_dataset() if isinstance(data, xr.DataArray) else data

    # === Open session, append & commit inside the timer === #
    with timer(action='append', dest=dest):
        writable = repo.writable_session(branch=branch)
        icechunk_xr.to_icechunk(
            obj=dataset,
            session=writable,
            group=group,
            append_dim=append_dim,
        )
        writable.commit(message=commit_message)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _replace_in_icechunk(
    data: xr.DataArray | xr.Dataset,
    dest: str,
    region: dict,
    repo: icechunk.Repository,
    commit_message: str,
    branch: Optional[str] = "main",
    group: Optional[str] = None,
) -> None:
    """
    Overwrite a region of an existing IcechunkStore in cloud object
    storage and commit the change.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        Data used to overwrite the region. A DataArray is converted to
        a Dataset first.
    dest: str
        Path to Icechunk repository in the object store (passed to the
        timer).
    region: dict
        Region of existing IcechunkStore to replace data. Its first key
        is taken as the append dimension.
    repo: icechunk.Repository
        Icechunk repository providing the writable session.
    commit_message: str
        Message recorded with the commit.
    branch: Optional[str], default="main"
        Branch on which the writable session is opened.
    group: Optional[str], default=None
        Group in IcechunkStore to replace data in.
    """
    # === Normalise input to a Dataset === #
    dataset = data.to_dataset() if isinstance(data, xr.DataArray) else data

    # Keep only variables defined along the append dimension (first region key):
    append_dim = list(region)[0]
    without_append_dim = [
        name
        for name, variable in dataset.variables.items()
        if append_dim not in variable.dims
    ]
    dataset = dataset.drop_vars(without_append_dim)

    # === Open session, write region & commit inside the timer === #
    with timer(action='replace', dest=dest):
        writable = repo.writable_session(branch=branch)
        icechunk_xr.to_icechunk(
            obj=dataset,
            session=writable,
            region=region,
            group=group,
        )
        writable.commit(message=commit_message)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _update_icechunk_store(
    data: xr.DataArray | xr.Dataset,
    dest: str,
    repo: icechunk.Repository,
    commit_message: str,
    branch: Optional[str] = "main",
    group: Optional[str] = None,
    append_dim: Optional[str] = "time_counter",
    rechunk: Optional[dict] = None,
) -> None:
    """
    Update an existing IcechunkStore in object storage by replacing
    existing values and/or appending new values.

    A DataArray whose name is not yet a data variable of the store is
    written as a new variable. Otherwise, source values that already
    exist along the append dimension replace the matching region of the
    store, and any remaining source values are appended.

    Parameters
    ----------
    data: xr.DataArray | xr.Dataset
        DataArray or DataSet to append to existing IcechunkStore.
    dest: str
        Path to Icechunk repository in the object store.
    repo: icechunk.Repository
        Icechunk repository in which to write data to
        IcechunkStore.
    commit_message: str
        Commit message when updating the Icechunk repository.
    branch: str, default="main"
        Branch on which to write data to IcechunkStore.
    group: Optional[str], default=None
        Group in IcechunkStore to update.
    append_dim: Optional[str], default="time_counter"
        Dimension to append data to existing IcechunkStore.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions.

    Raises
    ------
    AppendDimensionSizeError
        If some source values not exceeding the last target value are
        missing from the target append dimension.
    ValueError
        If the overlapping source values do not line up with one
        contiguous run of the target append dimension.
    """
    # Convert DataArrays to Datasets:
    if isinstance(data, xr.DataArray):
        var = data.name
        ds_source = data.to_dataset()
    else:
        var = None
        ds_source = data

    # Extract source & target append dimension values:
    store = repo.readonly_session(branch=branch).store
    ds_target = xr.open_zarr(store, group=group, consolidated=False)
    target_append_dim = ds_target[append_dim].values
    source_append_dim = ds_source[append_dim].values

    # === Add new variable to IcechunkStore === #
    if (var is not None) and (var not in ds_target.data_vars):
        logging.info(f"Sending Variable {var}")
        snd_commit_message = f"{commit_message} -> Sent {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
        _write_to_icechunk(data=ds_source,
                           dest=dest,
                           repo=repo,
                           commit_message=snd_commit_message,
                           branch=branch,
                           group=group
                           )
        return

    # === Update existing variable in IcechunkStore === #
    # Check source Dataset compatibility with existing store:
    _check_icechunk_compatibility(data=ds_source,
                                  dest=dest,
                                  repo=repo,
                                  branch=branch,
                                  append_dim=append_dim,
                                  rechunk=rechunk,
                                  group=group
                                  )
    logging.info(f"Passed Compatibility Checks for IcechunkStore {dest}")

    # Commit messages name the variable when one was given, else the store:
    label = var if var is not None else dest

    # Determine intersection between source & target append dimensions:
    intersect_append_dim = np.intersect1d(source_append_dim, target_append_dim)

    if intersect_append_dim.size == 0:
        # == No intersection -> append all source values to target IcechunkStore == #
        logging.info(f"Appending to {dest} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}.")
        if var is not None:
            app_commit_message = f"{commit_message} -> Appended {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
        else:
            app_commit_message = f"{commit_message} -> Appended to {dest} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."

        _append_to_icechunk(data=ds_source,
                            repo=repo,
                            dest=dest,
                            commit_message=app_commit_message,
                            branch=branch,
                            group=group,
                            append_dim=append_dim
                            )
        return

    # == Intersection exists -> replace overlapping values in target store == #

    # Ensure all overlapping values exist along target append dimension:
    overlap_append_dim = (source_append_dim <= target_append_dim[-1]).sum()
    if intersect_append_dim.size != overlap_append_dim:
        raise AppendDimensionSizeError(dim=append_dim, size=overlap_append_dim, expected_size=intersect_append_dim.size)

    # Determine source and target append dimension indices of overlap.
    # The region is sized by the number of overlapping source values so
    # that a source ending before the end of the target replaces only
    # the interior region it covers.
    n_overlap = int(overlap_append_dim)
    first_match = np.flatnonzero(target_append_dim == source_append_dim[0])
    if first_match.size == 0:
        raise ValueError(
            f"First source value {source_append_dim[0]} along {append_dim} "
            f"was not found in {dest}."
        )
    target_ind_min = int(first_match[0])
    target_ind_max = target_ind_min + n_overlap
    source_ind_min = 0
    source_ind_max = n_overlap
    source_ind_size = source_append_dim.size

    # A region write is positional, so refuse to write unless the
    # overlapping source values match the target region value-for-value.
    if not np.array_equal(
        target_append_dim[target_ind_min:target_ind_max],
        source_append_dim[source_ind_min:source_ind_max],
    ):
        raise ValueError(
            f"Overlapping values along {append_dim} do not align with a "
            f"contiguous region of {dest}."
        )

    # 1. Replace overlapping values in target IcechunkStore:
    logging.info(f"Updating {dest} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}.")
    rep_commit_message = f"{commit_message} -> Updated {label} along {append_dim} from {target_append_dim[target_ind_min]} to {target_append_dim[target_ind_max - 1]}."

    _replace_in_icechunk(data=ds_source.isel({append_dim : slice(source_ind_min, source_ind_max)}),
                         repo=repo,
                         dest=dest,
                         region={append_dim : slice(target_ind_min, target_ind_max)},
                         commit_message=rep_commit_message,
                         branch=branch,
                         group=group
                         )

    # 2. Append new values to target IcechunkStore:
    if source_ind_size > source_ind_max:
        logging.info(f"Appending to {dest} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}.")
        app_commit_message = f"{commit_message} -> Appended to {label} along {append_dim} from {source_append_dim[source_ind_max]} to {source_append_dim[source_ind_size - 1]}."

        _append_to_icechunk(data=ds_source.isel({append_dim : slice(source_ind_max, source_ind_size)}),
                            repo=repo,
                            dest=dest,
                            commit_message=app_commit_message,
                            branch=branch,
                            group=group,
                            append_dim=append_dim
                            )
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _send_to_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    exists: Optional[bool] = False,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: Optional[bool] = False,
    branch: Optional[str] = "main",
    commit_message: Optional[str] = "Add new data to my Icechunk repository",
    variable_commits: Optional[bool] = False,
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Write data to a new or existing Icechunk repository in cloud
    object storage.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    exists: Optional[bool], default=False
        Whether to write to an existing Icechunk repository or create a new repository.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all variables will be sent.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    parallel: Optional[bool], default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    branch: Optional[str], default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: Optional[str], default="Add new data to my Icechunk repository"
        Commit message when updating the Icechunk repository.
    variable_commits: Optional[bool], default=False
        Whether to write each variable to Icechunk repository using
        separate commits.
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration. When None, a default
        configuration is built inside this function.

    Raises
    ------
    icechunk.IcechunkError
        If the repository cannot be opened (``exists=True``) or
        created (``exists=False``).
    """
    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    if icechunk_config is None:
        icechunk_config = {"storage_config_kwargs": {'region': 'us-east-1', 'force_path_style': True},
                           "repository_config_kwargs": {},
                           "storage_settings_kwargs": {'unsafe_use_conditional_update': False, 'unsafe_use_conditional_create': False},
                           }

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    # Everything below runs under try/finally so the dataset is closed
    # even when opening the repository or writing fails.
    try:
        # Default to sending every data variable:
        if variables is None:
            variables = list(ds_filepath.data_vars)

        # Extract append dimension values (only used in commit messages):
        source_append_dim = None
        if append_dim in ds_filepath.dims:
            source_append_dim = ds_filepath[append_dim].values

        # === Open or Create Icechunk Repo === #
        repo_kwargs = {
            "bucket": bucket,
            "prefix": object_prefix,
            "storage_config_kwargs": icechunk_config["storage_config_kwargs"],
            "repository_config_kwargs": icechunk_config["repository_config_kwargs"],
            "storage_settings_kwargs": icechunk_config["storage_settings_kwargs"],
        }
        if exists:
            # Open existing Icechunk repo:
            try:
                repo = obj_store.open_icechunk_repo(**repo_kwargs)
            except icechunk.IcechunkError:
                # Re-raise: without a repository nothing can be written, and
                # continuing would only fail later on the unbound `repo`.
                logging.error(f"Failed to open existing Icechunk repository at {bucket}/{object_prefix}")
                raise
        else:
            # Create new Icechunk repo:
            try:
                repo = obj_store.create_icechunk_repo(**repo_kwargs)
            except icechunk.IcechunkError:
                logging.error(f"Failed to create new Icechunk repository at {bucket}/{object_prefix}")
                raise

        # === Send Variables to Icechunk Repo === #
        dest = f"{bucket}/{object_prefix}"
        if variable_commits:
            # Write each variable using separate commits to the repo:
            for var in variables:
                logging.info(f"Sending Variable: {var}")
                if append_dim in ds_filepath[var].dims:
                    snd_commit_message = f"{commit_message} -> Sent {var} along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
                else:
                    snd_commit_message = f"{commit_message} -> Sent {var}."

                _write_to_icechunk(data=ds_filepath[var],
                                   dest=dest,
                                   repo=repo,
                                   commit_message=snd_commit_message,
                                   branch=branch,
                                   group=group
                                   )
        else:
            # Write all variables using single commit to the repo:
            logging.info(f"Sending Dataset: {object_prefix}")
            if append_dim in ds_filepath.dims:
                snd_commit_message = f"{commit_message} -> Sent dataset along {append_dim} from {source_append_dim[0]} to {source_append_dim[-1]}."
            else:
                snd_commit_message = f"{commit_message} -> Sent dataset."

            _write_to_icechunk(data=ds_filepath[variables],
                               dest=dest,
                               repo=repo,
                               commit_message=snd_commit_message,
                               branch=branch,
                               group=group
                               )
    finally:
        # Release resources to avoid memory leaks:
        ds_filepath.close()
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _update_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    parallel: bool = False,
    branch: str = "main",
    commit_message: str = "Update data in my Icechunk repository",
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Update data in existing Icechunk repository in cloud object storage
    by replacing and/or appending data.

    The source data is preprocessed, restricted to the variables that
    carry ``append_dim``, and handed to ``_update_icechunk_store`` as a
    single commit. An ``icechunk.IcechunkError`` raised while opening the
    repository or while updating it is caught and logged at INFO level;
    the function then returns normally. The preprocessed dataset is
    always closed before returning, including when an error propagates.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all data variables are
        considered. Variables without ``append_dim`` are dropped.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    parallel: bool, default=False
        Whether to perform open and preprocess steps in parallel using
        `dask.delayed`.
    branch: str, default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: str, default="Update data in my Icechunk repository"
        Commit message when updating the Icechunk repository.
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration. Must provide the keys
        "storage_config_kwargs", "repository_config_kwargs" and
        "storage_settings_kwargs". If None, built-in defaults are used.
    """
    # === Initialise Synchronous Object Store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    if icechunk_config is None:
        icechunk_config = {"storage_config_kwargs": {'region': 'us-east-1', 'force_path_style': True},
                           "repository_config_kwargs": {},
                           "storage_settings_kwargs": {'unsafe_use_conditional_update': False, 'unsafe_use_conditional_create': False},
                           }

    # === Preprocess Data === #
    ds_filepath = _preprocess_dataset(file=file,
                                      rechunk=rechunk,
                                      append_dim=append_dim,
                                      update_coords=update_coords,
                                      grid_filepath=grid_filepath,
                                      attrs=attrs,
                                      parallel=parallel,
                                      )

    # Everything after the dataset is opened runs under try/finally so the
    # dataset is released even if an unexpected exception propagates.
    try:
        if variables is None:
            variables = list(ds_filepath.data_vars)
        # Consider variables with append dimension only:
        variables = [var for var in variables if append_dim in ds_filepath[var].dims]

        # === Update Variables in Icechunk Repo === #
        try:
            # Open existing Icechunk repo:
            repo = obj_store.open_icechunk_repo(bucket=bucket,
                                                prefix=object_prefix,
                                                storage_config_kwargs=icechunk_config["storage_config_kwargs"],
                                                repository_config_kwargs=icechunk_config["repository_config_kwargs"],
                                                storage_settings_kwargs=icechunk_config["storage_settings_kwargs"],
                                                )

            # Update dataset using single commit to the repo:
            logging.info(f"Updating Dataset {object_prefix}")
            _update_icechunk_store(data=ds_filepath[variables],
                                   dest=f"{bucket}/{object_prefix}",
                                   repo=repo,
                                   commit_message=commit_message,
                                   branch=branch,
                                   group=group,
                                   append_dim=append_dim,
                                   rechunk=rechunk,
                                   )

        except icechunk.IcechunkError:
            # NOTE(review): this handler also covers IcechunkError raised by
            # _update_icechunk_store, so the message may not describe the
            # real cause -- consider separate handling for open vs. update.
            logging.info(f"Skipping Dataset: Icechunk repository does not exist at {bucket}/{object_prefix}")

    finally:
        # Release resources to avoid memory leaks:
        ds_filepath.close()
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# ============ Define Public Functions ============ #
|
|
652
|
+
def send_to_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    exists: Optional[bool] = False,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    branch: Optional[str] = "main",
    commit_message: Optional[str] = "Add new data to my Icechunk repository",
    variable_commits: Optional[bool] = False,
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Write data to an Icechunk repository in cloud object storage with
    option of using dask.

    When ``dask_cluster_kwargs`` is given, a ``LocalCluster`` and ``Client``
    are created for the duration of the write and the data is sent with
    ``parallel=True``. Otherwise the data is sent with ``parallel=False``
    and no cluster is created.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    exists: Optional[bool], default=False
        Whether to write to an existing Icechunk repository or create a new repository.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all variables will be sent.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    branch: Optional[str], default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: Optional[str], default="Add new data to my Icechunk repository"
        Commit message when writing to the Icechunk repository.
    variable_commits: Optional[bool], default=False
        Whether to write each variable to Icechunk repository using
        separate commits.
    dask_config_kwargs: Optional[dict], default=None
        Dask configuration settings passed to dask.config.set(). Only
        applied when ``dask_cluster_kwargs`` is also given.
    dask_cluster_kwargs: Optional[dict], default=None
        Dask cluster configuration settings passed to LocalCluster().
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration.
    """
    # Arguments shared by the dask and non-dask code paths; only `parallel`
    # differs between them.
    send_kwargs = dict(file=file,
                       bucket=bucket,
                       object_prefix=object_prefix,
                       store_credentials_json=store_credentials_json,
                       exists=exists,
                       group=group,
                       variables=variables,
                       append_dim=append_dim,
                       grid_filepath=grid_filepath,
                       update_coords=update_coords,
                       rechunk=rechunk,
                       attrs=attrs,
                       branch=branch,
                       commit_message=commit_message,
                       variable_commits=variable_commits,
                       icechunk_config=icechunk_config,
                       )

    if dask_cluster_kwargs is None:
        # === Send to Icechunk repo(s) without Dask === #
        _send_to_icechunk(parallel=False, **send_kwargs)
        return

    # === Send to Icechunk repo(s) with Dask === #
    if dask_config_kwargs is not None:
        dask.config.set(dask_config_kwargs)
        logging.info("Updated dask configuration settings.")

    # Create local dask cluster & client. Both are context managers, so they
    # are closed on exit from the block even if the write raises.
    with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
        # 'n_workers' is optional for LocalCluster; fall back to the number
        # of workers actually started rather than raising KeyError.
        n_workers = dask_cluster_kwargs.get('n_workers', len(cluster.workers))
        logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {client.dashboard_link}")

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _send_to_icechunk(parallel=True, **send_kwargs)

    # --- Store & Dask Cluster shut down by the context managers above --- #
    logging.info("Dask Cluster has been shutdown.")
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def update_icechunk(
    file: list[str] | str | xr.Dataset,
    bucket: str,
    object_prefix: str,
    store_credentials_json: str,
    group: Optional[str] = None,
    variables: Optional[list[str]] = None,
    append_dim: Optional[str] = 'time_counter',
    grid_filepath: Optional[str] = None,
    update_coords: Optional[dict] = None,
    rechunk: Optional[dict] = None,
    attrs: Optional[dict] = None,
    branch: Optional[str] = "main",
    commit_message: Optional[str] = "Update data in my Icechunk repository",
    dask_config_kwargs: Optional[dict] = None,
    dask_cluster_kwargs: Optional[dict] = None,
    icechunk_config: Optional[dict] = None,
) -> None:
    """
    Update data in existing Icechunk repository in cloud object
    storage with option of using dask.

    When ``dask_cluster_kwargs`` is given, a ``LocalCluster`` and ``Client``
    are created for the duration of the update and the update is run with
    ``parallel=True``. Otherwise the update is run with ``parallel=False``
    and no cluster is created.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Regular expression or list of filepaths to netCDF file(s).
        Users can also pass a single xarray.Dataset directly.
    bucket: str
        Name of the bucket in the object store. Bucket names can contain only
        lowercase letters, numbers, dots (.), and hyphens (-).
    object_prefix: str
        Prefix to be added to the object names in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.
    group: Optional[str], default=None
        Group in Icechunk repository to write data to.
    variables: Optional[list[str]], default=None
        List of variables to send. If None, all variables will be sent.
    append_dim: Optional[str], default='time_counter'
        Name of the dimension to append multifile datasets.
    grid_filepath: Optional[str], default=None
        Path to file containing model grid parameter.
    update_coords: Optional[dict], default=None
        Dictionary of coordinate variables to update.
    rechunk: Optional[dict], default=None
        Rechunk strategy dictionary, by default None.
    attrs: Optional[dict], default=None
        Attributes to add to the dataset.
    branch: Optional[str], default="main"
        Branch on which to write data to IcechunkStore.
    commit_message: Optional[str], default="Update data in my Icechunk repository"
        Commit message when updating the Icechunk repository.
    dask_config_kwargs: Optional[dict], default=None
        Dask configuration settings passed to dask.config.set(). Only
        applied when ``dask_cluster_kwargs`` is also given.
    dask_cluster_kwargs: Optional[dict], default=None
        Dask cluster configuration settings passed to LocalCluster().
    icechunk_config: Optional[dict], default=None
        Icechunk repository configuration.
    """
    # Arguments shared by the dask and non-dask code paths; only `parallel`
    # differs between them.
    update_kwargs = dict(file=file,
                         bucket=bucket,
                         object_prefix=object_prefix,
                         store_credentials_json=store_credentials_json,
                         group=group,
                         variables=variables,
                         append_dim=append_dim,
                         grid_filepath=grid_filepath,
                         update_coords=update_coords,
                         rechunk=rechunk,
                         attrs=attrs,
                         branch=branch,
                         commit_message=commit_message,
                         icechunk_config=icechunk_config,
                         )

    if dask_cluster_kwargs is None:
        # === Update Icechunk repo(s) without Dask === #
        _update_icechunk(parallel=False, **update_kwargs)
        return

    # === Update Icechunk repo(s) with Dask === #
    if dask_config_kwargs is not None:
        dask.config.set(dask_config_kwargs)
        logging.info("Updated dask configuration settings.")

    # Create local dask cluster & client. Both are context managers, so they
    # are closed on exit from the block even if the update raises.
    with LocalCluster(**dask_cluster_kwargs) as cluster, Client(cluster) as client:
        # 'n_workers' is optional for LocalCluster; fall back to the number
        # of workers actually started rather than raising KeyError.
        n_workers = dask_cluster_kwargs.get('n_workers', len(cluster.workers))
        logging.info(f"Created LocalCluster with {n_workers} workers @ Client: {client.dashboard_link}")

        # Catch UserWarnings when rechunking data:
        client.register_worker_plugin(CaptureWarningsPlugin())

        _update_icechunk(parallel=True, **update_kwargs)

    # --- Store & Dask Cluster shut down by the context managers above --- #
    logging.info("Dask Cluster has been shutdown.")
|