OceanDataStore 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OceanDataStore/__init__.py +21 -0
- OceanDataStore/catalog/__init__.py +12 -0
- OceanDataStore/catalog/oceandatacatalog.py +1242 -0
- OceanDataStore/catalog/stac/README.md +34 -0
- OceanDataStore/catalog/stac/__init__.py +30 -0
- OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
- OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
- OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
- OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
- OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
- OceanDataStore/catalog/stac/template_collection.py +85 -0
- OceanDataStore/catalog/stac/utils.py +476 -0
- OceanDataStore/cli/__init__.py +34 -0
- OceanDataStore/cli/arg_parser.py +182 -0
- OceanDataStore/cli/cli.py +203 -0
- OceanDataStore/cli/exceptions.py +83 -0
- OceanDataStore/cli/icechunk.py +888 -0
- OceanDataStore/cli/logging.py +52 -0
- OceanDataStore/cli/object_store.py +293 -0
- OceanDataStore/cli/utils.py +275 -0
- OceanDataStore/cli/zarr.py +870 -0
- OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
- OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
- OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
- OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
- OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
- OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
- OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
- OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
- OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
- OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
- OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
- OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
- OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
- OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
- OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
- OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
- OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
- OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
- OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
- OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
- OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
- OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
- OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
- OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
- OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
- OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
- OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
- OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
- OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
- OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
- OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
- OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
- OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
- OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
- OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
- OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
- OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
- OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
- OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
- OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
- OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
- OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
- OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
- OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
- OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
- OceanDataStore/data/utils.py +506 -0
- OceanDataStore/zarr.py +993 -0
- oceandatastore-0.3.0.dist-info/METADATA +184 -0
- oceandatastore-0.3.0.dist-info/RECORD +104 -0
- oceandatastore-0.3.0.dist-info/WHEEL +5 -0
- oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
- oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
- oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
- oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
- oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# ===================================================================
|
|
2
|
+
# Copyright 2026 National Oceanography Centre
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0.
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
11
|
+
# implied. See the License for the specific language governing
|
|
12
|
+
# permissions and limitations under the License.
|
|
13
|
+
# ===================================================================
|
|
14
|
+
"""
|
|
15
|
+
logging.py
|
|
16
|
+
|
|
17
|
+
Description:
|
|
18
|
+
This module defines the logging utility function for the OceanDataStore
|
|
19
|
+
package.
|
|
20
|
+
|
|
21
|
+
Authors:
|
|
22
|
+
- Ollie Tooth
|
|
23
|
+
"""
|
|
24
|
+
import sys
|
|
25
|
+
import logging
|
|
26
|
+
|
|
27
|
+
from OceanDataStore.cli.arg_parser import __version__
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def initialise_logging():
    """
    Initialise OceanDataStore logging.

    Configures the root logger through ``logging.basicConfig`` so that
    records of level INFO and above are written to standard output
    using the OceanDataStore message format, then logs a start-up
    banner which includes the package version.
    """
    # Send records to stdout with a fixed-width level name and a
    # second-resolution timestamp:
    logging.basicConfig(
        stream=sys.stdout,
        format="🌐 OceanDataStore 🌐 | %(levelname)10s | %(asctime)s | %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Log the start-up banner. `extra` attaches a `simple` attribute to the
    # log record (NOTE(review): nothing visible here reads it -- confirm
    # whether a formatter / filter elsewhere depends on it).
    logging.info(
        f"""
.~~~.
.( ).~~~~~~.
~( ).~~~.
.( OceanDataStore ).
(___________________________).
version: {__version__}

""",
        extra={"simple": True},
    )
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# ===================================================================
|
|
2
|
+
# Copyright 2026 National Oceanography Centre
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0.
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
11
|
+
# implied. See the License for the specific language governing
|
|
12
|
+
# permissions and limitations under the License.
|
|
13
|
+
# ===================================================================
|
|
14
|
+
"""
|
|
15
|
+
object_store.py
|
|
16
|
+
|
|
17
|
+
Description:
|
|
18
|
+
This module defines the ObjectStoreS3 class, which is a subclass
|
|
19
|
+
of the S3FileSystem class from the s3fs library.
|
|
20
|
+
|
|
21
|
+
Authors:
|
|
22
|
+
- Ollie Tooth
|
|
23
|
+
- Joao Morado
|
|
24
|
+
- Tobias Ferreira
|
|
25
|
+
"""
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
from typing import Union
|
|
29
|
+
|
|
30
|
+
import icechunk
|
|
31
|
+
import s3fs
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ObjectStoreS3(s3fs.S3FileSystem):
    """
    S3 object store file system with stored credentials.

    Subclass of ``s3fs.S3FileSystem`` which collects the object store
    credentials (from a JSON file or from the arguments passed),
    converts them into s3fs storage options and provides helpers to
    create buckets and to create / open Icechunk repositories.

    Parameters
    ----------
    anon, bool (False)
        Whether to use anonymous connection (public buckets only).
    asynchronous, bool (False)
        Whether to use asynchronous operations (instance to be used inside coroutines).
    store_credentials_json, str (None)
        File path to object store credentials .json file.
    secret, str (None)
        If not anonymous, use this secret key to access object store.
    key, str (None)
        If not anonymous, use this key to access object store.
    endpoint_url, str (None)
        Endpoint URL of object store. Needed for non-AWS S3 object stores.
    *fs_args, **fs_kwargs
        Additional arguments forwarded to ``s3fs.S3FileSystem``.
    """
    def __init__(
        self,
        anon: bool = False,
        asynchronous: bool = False,
        store_credentials_json: Union[str, None] = None,
        secret: Union[str, None] = None,
        key: Union[str, None] = None,
        endpoint_url: Union[str, None] = None,
        *fs_args,
        **fs_kwargs,
    ) -> None:

        # Get object store credentials:
        self._anon = anon
        self._asynchronous = asynchronous

        if store_credentials_json is None:
            # Trailing space keeps the two sentences separated once the
            # adjacent string literals are concatenated.
            logging.info(
                "No JSON file was provided. "
                "Object store credentials will be obtained from the arguments passed."
            )
            # The access key is stored under "token" to match the key
            # names expected in the credentials JSON file.
            self._store_credentials = {
                "secret": secret,
                "token": key,
                "endpoint_url": endpoint_url,
            }
        else:
            self._store_credentials = self.load_store_credentials(store_credentials_json)

        # Configure storage options:
        self._storage_options = self.get_storage_options()

        super().__init__(*fs_args, **self._storage_options, **fs_kwargs)

    @staticmethod
    def load_store_credentials(path: str) -> dict:
        """
        Load the credentials of the object store from a JSON file.

        Parameters
        ----------
        path
            Absolute or relative filepath to the JSON file containing
            the object store credentials.

        Returns
        -------
        store_credentials
            Dictionary containing the values of the `token`,
            `secret` and `endpoint_url` keys used to access the
            object store. Keys missing from the JSON file are
            added with a value of None.

        Raises
        ------
        OSError
            If the file cannot be opened.
        json.JSONDecodeError
            If the file does not contain valid JSON.
        TypeError
            If the JSON document is not an object (mapping).
        """
        # Let the original exception propagate so that its type and
        # traceback are preserved for the caller.
        with open(path, encoding="utf-8") as f:
            store_credentials = json.load(f)

        if not isinstance(store_credentials, dict):
            raise TypeError(
                "Object store credentials JSON file must contain a JSON object."
            )

        for key in ("token", "secret", "endpoint_url"):
            if key not in store_credentials:
                logging.warning(
                    '"%s" is not a key in the JSON file provided. Its value will be set to None.',
                    key
                )
                # Honour the warning above: later lookups index these
                # keys directly and would otherwise raise KeyError.
                store_credentials[key] = None

        return store_credentials

    def get_storage_options(
        self,
        set_async: bool=False,
    ) -> dict:
        """
        Get the storage options to access the object store.

        The options are also stored on the instance, replacing any
        previously stored storage options.

        Parameters
        ----------
        set_async
            If True, force the `asynchronous` option to True.
            Otherwise the value given at initialisation is used.

        Returns
        -------
        storage_options
            Dictionary containing the storage options to access the object store.

        """
        # Create storage options dict from credentials:
        self._storage_options = {
            "anon": self._anon,
            "secret": self._store_credentials["secret"],
            "key": self._store_credentials["token"],
            "client_kwargs": {
                "endpoint_url": self._store_credentials["endpoint_url"],
            },
            "config_kwargs": {
                "request_checksum_calculation": "when_required",
                "response_checksum_validation": "when_required",
            },
        }

        if set_async:
            # Override asynchronous option of ObjectStoreS3:
            self._storage_options["asynchronous"] = True
        else:
            self._storage_options["asynchronous"] = self._asynchronous

        return self._storage_options

    def create_bucket(
        self,
        bucket: str,
        **kwargs
    ) -> None:
        """
        Create a bucket in the object store.

        If the bucket already exists, this is logged and no error
        is raised.

        Parameters
        ----------
        bucket
            Name of bucket to create.
            Bucket names can consist only of lowercase letters,
            numbers, dots (.), and hyphens (-).
        **kwargs
            Additional keyword arguments forwarded to ``mkdir``.
        """
        try:
            return self.mkdir(bucket, **kwargs)
        except FileExistsError:
            logging.info("Bucket %s already exists.", bucket)

    def _icechunk_storage_and_config(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None],
        repository_config_kwargs: Union[dict, None],
        storage_settings_kwargs: Union[dict, None],
    ) -> tuple:
        """
        Build the Icechunk S3 storage and repository config shared by
        `create_icechunk_repo` and `open_icechunk_repo`.

        Any kwargs mapping given as None is replaced by its default.
        """
        # Resolve defaults here rather than in the signatures, so that no
        # mutable default argument is shared between calls.
        if storage_config_kwargs is None:
            storage_config_kwargs = {'region': 'us-east-1', 'force_path_style': True}
        if repository_config_kwargs is None:
            repository_config_kwargs = {}
        if storage_settings_kwargs is None:
            storage_settings_kwargs = {
                'unsafe_use_conditional_update': False,
                'unsafe_use_conditional_create': False,
            }

        # -- Define S3 storage -- #
        storage = icechunk.s3_storage(
            bucket=bucket,
            prefix=prefix,
            access_key_id=self._store_credentials["token"],
            secret_access_key=self._store_credentials['secret'],
            endpoint_url=self._store_credentials['endpoint_url'],
            **storage_config_kwargs
        )

        # -- Define Icechunk repo config -- #
        repo_config = icechunk.RepositoryConfig(
            storage=icechunk.StorageSettings(
                **storage_settings_kwargs,
            ),
            **repository_config_kwargs,
        )

        return storage, repo_config

    def create_icechunk_repo(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None] = None,
        repository_config_kwargs: Union[dict, None] = None,
        storage_settings_kwargs: Union[dict, None] = None,
    ) -> icechunk.Repository:
        """
        Create a new Icechunk repository in cloud object storage.

        Parameters
        ----------
        bucket: str
            Name of bucket in s3 object store.
        prefix: str
            Name of prefix within bucket to store object.
        storage_config_kwargs
            Kwargs for icechunk.s3_storage(). If None, defaults to
            {'region': 'us-east-1', 'force_path_style': True}.
            See: https://icechunk.io/en/latest/icechunk-python/storage/.
        repository_config_kwargs
            Kwargs for icechunk.RepositoryConfig(). If None, defaults to {}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/.
        storage_settings_kwargs
            Kwargs for icechunk.StorageSettings(). If None, defaults to
            {'unsafe_use_conditional_update': False,
            'unsafe_use_conditional_create': False}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/#storage.

        Returns
        -------
        repo, icechunk.Repository
            Icechunk repository.
        """
        storage, repo_config = self._icechunk_storage_and_config(
            bucket,
            prefix,
            storage_config_kwargs,
            repository_config_kwargs,
            storage_settings_kwargs,
        )

        # -- Create Icechunk repo -- #
        repo = icechunk.Repository.create(
            storage=storage,
            config=repo_config
        )

        return repo

    def open_icechunk_repo(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None] = None,
        repository_config_kwargs: Union[dict, None] = None,
        storage_settings_kwargs: Union[dict, None] = None,
    ) -> icechunk.Repository:
        """
        Open an existing Icechunk repository in cloud object storage.

        Parameters
        ----------
        bucket: str
            Name of bucket in s3 object store.
        prefix: str
            Name of prefix within bucket to store object.
        storage_config_kwargs
            Kwargs for icechunk.s3_storage(). If None, defaults to
            {'region': 'us-east-1', 'force_path_style': True}.
            See: https://icechunk.io/en/latest/icechunk-python/storage/.
        repository_config_kwargs
            Kwargs for icechunk.RepositoryConfig(). If None, defaults to {}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/.
        storage_settings_kwargs
            Kwargs for icechunk.StorageSettings(). If None, defaults to
            {'unsafe_use_conditional_update': False,
            'unsafe_use_conditional_create': False}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/#storage.

        Returns
        -------
        repo, icechunk.Repository
            Icechunk repository.
        """
        storage, repo_config = self._icechunk_storage_and_config(
            bucket,
            prefix,
            storage_config_kwargs,
            repository_config_kwargs,
            storage_settings_kwargs,
        )

        # -- Open existing Icechunk repo -- #
        repo = icechunk.Repository.open(
            storage=storage,
            config=repo_config
        )

        return repo
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# ===================================================================
|
|
2
|
+
# Copyright 2026 National Oceanography Centre
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0.
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
11
|
+
# implied. See the License for the specific language governing
|
|
12
|
+
# permissions and limitations under the License.
|
|
13
|
+
# ===================================================================
|
|
14
|
+
"""
|
|
15
|
+
utils.py
|
|
16
|
+
|
|
17
|
+
Description:
|
|
18
|
+
This module defines utility functions and classes for the OceanDataStore CLI.
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
Authors:
|
|
22
|
+
- Ollie Tooth
|
|
23
|
+
"""
|
|
24
|
+
# -- Import Python Modules -- #
|
|
25
|
+
import time
|
|
26
|
+
import logging
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
import glob
|
|
30
|
+
import xarray as xr
|
|
31
|
+
|
|
32
|
+
from dask.distributed.diagnostics.plugin import WorkerPlugin
|
|
33
|
+
|
|
34
|
+
# -- Import OceanDataStore Modules -- #
|
|
35
|
+
from OceanDataStore.cli.object_store import ObjectStoreS3
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# -- Define Dask WorkerPlugins -- #
|
|
39
|
+
class CaptureWarningsPlugin(WorkerPlugin):
    """
    Dask worker plugin which routes Python warnings through logging.

    Warning capture is switched on when the plugin is set up on a
    worker and switched off again when the plugin is torn down.
    """

    def setup(self, worker):
        """Enable capture of warnings (e.g. UserWarnings when rechunking)."""
        logging.captureWarnings(capture=True)

    def teardown(self, worker):
        """Disable capture of warnings."""
        logging.captureWarnings(capture=False)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CloseClientSessionPlugin(WorkerPlugin):
    """
    Dask worker plugin which closes cached s3fs client sessions
    when the plugin is torn down.
    """

    async def teardown(self, worker):
        """
        Close the S3 client of every cached ``s3fs.S3FileSystem``
        instance, then clear the s3fs instance cache.

        Closing is best-effort: a failure to close one client is
        logged at debug level and does not stop the remaining
        clients from being closed.
        """
        import s3fs

        # Iterate over a snapshot of the cached file system instances:
        for fs in list(s3fs.S3FileSystem._cache.values()):
            try:
                client = getattr(fs, '_s3', None)
                if client is not None:
                    await client.close()
            except Exception:
                # Teardown must not fail because one session could not
                # be closed; record the failure instead of hiding it.
                logging.debug(
                    "Failed to close S3 client session for %r.", fs, exc_info=True
                )
        s3fs.S3FileSystem.clear_instance_cache()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# -- Utility Classes & Functions -- #
|
|
60
|
+
class timer():
    """
    Timer context manager class to log the time
    taken to write variables & datasets to an
    object store.

    The completion message is logged only when the managed block
    finishes without raising an exception.

    Parameters
    ----------
    action : str
        Action to be performed. Options are 'send', 'replace' or 'append'.
    dest : str
        Destination path in the object store.
    var : Optional[str], default=None
        Name of variable to be sent, replaced or appended to store.

    Raises
    ------
    ValueError
        If action is not one of 'send', 'replace' or 'append'.
    """
    def __init__(self, action: str, dest: str, var: Optional[str] = None) -> None:
        # Message prefix for each action: (with variable, without variable).
        prefixes = {
            'send': (f'Sent {var} to', 'Sent dataset to'),
            'replace': (f'Updated {var} in', 'Updated'),
            'append': (f'Appended {var} to', 'Appended to'),
        }
        if not isinstance(action, str) or action not in prefixes:
            raise ValueError("Invalid action: must be 'send', 'replace' or 'append'.")

        # Define class attributes:
        with_var, without_var = prefixes[action]
        self.action = with_var if var is not None else without_var
        self.dest = dest

    def __enter__(self):
        self.t_start = time.time()
        # Return the timer so that `with timer(...) as t:` gives access
        # to the recorded times.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.t_end = time.time()
        # Do not report completion when the managed block raised; the
        # exception propagates because nothing truthy is returned.
        if exc_type is None:
            logging.info(
                f"Completed: {self.action} store s3://{self.dest.replace('s3://', '')} in {(self.t_end - self.t_start):.2f} seconds"
            )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _preprocess_dataset(file: list[str] | str | xr.Dataset,
                        rechunk: Optional[dict] = None,
                        append_dim: Optional[str] = "time_counter",
                        update_coords: Optional[dict] = None,
                        grid_filepath: Optional[str] = None,
                        attrs: Optional[dict] = None,
                        parallel: bool = False,
                        ) -> xr.Dataset:
    """
    Preprocess the dataset to be sent to the object store.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Filepath, glob pattern (containing '*') or list of filepaths
        to netCDF file(s). Users can also pass a single
        xarray.Dataset directly.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions. If None, dask chunks
        will be set to on-disk chunks.
    append_dim: str, default='time_counter'
        Name of the dimension to append multi-file datasets.
    update_coords: Optional[dict], default=None
        Mapping of coordinate variables to update using model
        grid file. Keys are coordinate variable names in the
        dataset to be sent, and values are the corresponding
        variable names in the model grid file. If None, no
        coordinates will be updated.
    grid_filepath: Optional[str], default=None
        Filepath to the model grid file to update coordinate
        variables. Required if update_coords is not None.
    attrs: Optional[dict], default=None
        Dictionary of attributes to add to the dataset.
        If None, no attributes will be added.
    parallel: bool, default=False
        Whether to open and preprocess the dataset in parallel
        using `dask.delayed`.

    Returns
    -------
    xr.Dataset
        Preprocessed (multifile) dataset with optionally
        updated coordinates, chunksizes and attributes.

    Raises
    ------
    TypeError
        If an argument has an unsupported type.
    ValueError
        If a filepath does not end with '.nc', if an empty list of
        filepaths is given, or if update_coords is given without
        grid_filepath.
    FileNotFoundError
        If a glob pattern matches no files.

    """
    # == Verify Inputs == #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("filepaths must be a list, a string or an xarray Dataset.")
    if isinstance(file, list):
        if not file:
            # Fail clearly rather than with an IndexError further down.
            raise ValueError("filepaths must contain at least one filepath.")
        for fpath in file:
            if not isinstance(fpath, str):
                raise TypeError("filepaths must be a list of strings.")
            if not fpath.endswith('.nc'):
                raise ValueError("Invalid file extension: only .nc files are supported.")
    elif isinstance(file, str):
        if not file.endswith('.nc'):
            raise ValueError("Invalid file extension: only .nc files are supported.")
    if rechunk is not None:
        if not isinstance(rechunk, dict):
            raise TypeError("rechunk must be a dictionary.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if update_coords is not None:
        if not isinstance(update_coords, dict):
            raise TypeError("update_coords must be a dictionary.")
    if grid_filepath is not None:
        if not isinstance(grid_filepath, str):
            raise TypeError("grid_filepath must be a string.")
    if attrs is not None:
        if not isinstance(attrs, dict):
            raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    if update_coords is not None and grid_filepath is None:
        raise ValueError(
            "grid_filepath must be specified to update coordinate variables."
        )

    # === Load netCDF dataset === #
    # Default to dask chunks equal to on-disk chunks. The user mapping is
    # kept separate so that "no rechunking requested" stays detectable when
    # coordinates are updated below.
    chunks = {} if rechunk is None else rechunk

    if isinstance(file, xr.Dataset):
        # Use input dataset:
        ds_filepath = file.chunk(chunks)
    else:
        if isinstance(file, str):
            # File names from str / glob pattern:
            if '*' in file:
                filepaths = sorted(glob.glob(file))
                if not filepaths:
                    # Report the pattern which matched nothing.
                    raise FileNotFoundError(f"No files found at {file}")
            else:
                filepaths = [file]
        else:
            # File names from list:
            filepaths = file

        if len(filepaths) > 1:
            # Open multi-file dataset:
            ds_filepath = xr.open_mfdataset(filepaths,
                                            engine='h5netcdf',
                                            chunks=chunks,
                                            parallel=parallel,
                                            concat_dim=append_dim,
                                            combine='nested',
                                            data_vars='minimal',
                                            coords='minimal',
                                            compat='override'
                                            )
        else:
            # Open single file dataset:
            ds_filepath = xr.open_dataset(filepaths[0], chunks=chunks)

    # === Update coordinates using model grid file === #
    if update_coords is not None:
        ds_grid = xr.open_dataset(grid_filepath)
        # Update coordinate vars using model grid file:
        for key, grid_var in update_coords.items():
            coord_data = ds_grid[grid_var].squeeze(drop=True)
            if rechunk is not None:
                # Rechunk dimensions to user specified chunks. Dimensions
                # without a user specified chunk size are skipped instead
                # of raising a KeyError.
                coord_chunks = {
                    dim: rechunk[dim] for dim in coord_data.dims if dim in rechunk
                }
                coord_data = coord_data.chunk(coord_chunks)
            ds_filepath = ds_filepath.assign_coords({key: coord_data})
        logging.info('Completed: Updated coordinate variables.')

    # === Update Attributes === #
    if attrs is not None:
        ds_filepath = ds_filepath.assign_attrs(attrs)

    return ds_filepath
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# -- Command Line Interface Utility Functions -- #
|
|
249
|
+
def list_objects(
    dest: str,
    store_credentials_json: str,
) -> list[str]:
    """
    List the objects contained inside a bucket / object.

    The listing is logged at INFO level and returned.

    Parameters
    ----------
    dest: str
        Destination path in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.

    Returns
    -------
    list[str]
        List of objects contained inside the bucket / object.
    """
    # === Initialise synchronous object store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    # === List objects === #
    objects = obj_store.ls(dest)
    logging.info(objects)

    # Return the listing, as the signature and docstring state.
    return objects
|