OceanDataStore 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. OceanDataStore/__init__.py +21 -0
  2. OceanDataStore/catalog/__init__.py +12 -0
  3. OceanDataStore/catalog/oceandatacatalog.py +1242 -0
  4. OceanDataStore/catalog/stac/README.md +34 -0
  5. OceanDataStore/catalog/stac/__init__.py +30 -0
  6. OceanDataStore/catalog/stac/create_noc_stac.py +109 -0
  7. OceanDataStore/catalog/stac/npd_era5_collection.py +364 -0
  8. OceanDataStore/catalog/stac/npd_jra55_collection.py +196 -0
  9. OceanDataStore/catalog/stac/ods_obs_collection.py +534 -0
  10. OceanDataStore/catalog/stac/rapid_evo_collection.py +309 -0
  11. OceanDataStore/catalog/stac/template_collection.py +85 -0
  12. OceanDataStore/catalog/stac/utils.py +476 -0
  13. OceanDataStore/cli/__init__.py +34 -0
  14. OceanDataStore/cli/arg_parser.py +182 -0
  15. OceanDataStore/cli/cli.py +203 -0
  16. OceanDataStore/cli/exceptions.py +83 -0
  17. OceanDataStore/cli/icechunk.py +888 -0
  18. OceanDataStore/cli/logging.py +52 -0
  19. OceanDataStore/cli/object_store.py +293 -0
  20. OceanDataStore/cli/utils.py +275 -0
  21. OceanDataStore/cli/zarr.py +870 -0
  22. OceanDataStore/data/ARMOR3D/create_ARMOR3D_P1M-m_monthly_climatology.py +135 -0
  23. OceanDataStore/data/ARMOR3D/download_ARMOR3D_0.125def_P1M-m_1993_2024.py +33 -0
  24. OceanDataStore/data/ARMOR3D/run_create_ARMOR3D_P1M-m_monthly_climatology.slurm +32 -0
  25. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_climatology_to_os.slurm +32 -0
  26. OceanDataStore/data/ARMOR3D/run_send_ARMOR3D_P1M-m_monthly_to_os.slurm +32 -0
  27. OceanDataStore/data/ARMOR3D/run_update_ARMOR3D_P1m-m_monthly_to_os.slurm +32 -0
  28. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_climatology_to_os.py +99 -0
  29. OceanDataStore/data/ARMOR3D/send_ARMOR3D_P1m-m_monthly_to_os.py +147 -0
  30. OceanDataStore/data/ARMOR3D/update_ARMOR3D_P1m-m_monthly_to_os.py +143 -0
  31. OceanDataStore/data/EN.4.2.2/create_EN4.2.2_analysis_g10_climatology.py +162 -0
  32. OceanDataStore/data/EN.4.2.2/download_EN4.2.2_analysis_g10_data.sh +51 -0
  33. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_climatology_to_os.slurm +32 -0
  34. OceanDataStore/data/EN.4.2.2/run_send_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  35. OceanDataStore/data/EN.4.2.2/run_update_EN4.2.2_analysis_g10_monthly_to_os.slurm +32 -0
  36. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_climatology_to_os.py +76 -0
  37. OceanDataStore/data/EN.4.2.2/send_EN4.2.2_analysis_g10_monthly_to_os.py +165 -0
  38. OceanDataStore/data/EN.4.2.2/update_EN4.2.2_analysis_g10_monthly_to_os.py +161 -0
  39. OceanDataStore/data/ERA5/create_ERA5_daily_climatology.py +110 -0
  40. OceanDataStore/data/ERA5/create_ERA5_daily_mean.py +69 -0
  41. OceanDataStore/data/ERA5/create_ERA5_monthly_mean.py +74 -0
  42. OceanDataStore/data/ERA5/run_create_ERA5_daily_climatology.slurm +54 -0
  43. OceanDataStore/data/ERA5/run_send_ERA5_daily_climatology_to_os.slurm +32 -0
  44. OceanDataStore/data/ERA5/run_send_ERA5_daily_to_os.slurm +32 -0
  45. OceanDataStore/data/ERA5/run_send_ERA5_monthly_to_os.slurm +32 -0
  46. OceanDataStore/data/ERA5/run_update_ERA5_daily_to_os.slurm +32 -0
  47. OceanDataStore/data/ERA5/run_update_ERA5_monthly_to_os.slurm +32 -0
  48. OceanDataStore/data/ERA5/send_ERA5_daily_climatology_to_os.py +159 -0
  49. OceanDataStore/data/ERA5/send_ERA5_daily_to_os.py +141 -0
  50. OceanDataStore/data/ERA5/send_ERA5_monthly_to_os.py +173 -0
  51. OceanDataStore/data/ERA5/update_ERA5_daily_to_os.py +141 -0
  52. OceanDataStore/data/ERA5/update_ERA5_monthly_to_os.py +169 -0
  53. OceanDataStore/data/HadISST/download_HadISST1_data.sh +43 -0
  54. OceanDataStore/data/HadISST/run_send_HadISST1_monthly_to_os.slurm +32 -0
  55. OceanDataStore/data/HadISST/send_HadISST1_monthly_to_os.py +133 -0
  56. OceanDataStore/data/NSIDC/download_NSIDC_monthly_1979_2025_data.sh +54 -0
  57. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Antarctic_data.py +130 -0
  58. OceanDataStore/data/NSIDC/process_NSIDC_SSI_Arctic_data.py +129 -0
  59. OceanDataStore/data/NSIDC/run_send_NSIDC_v4.0_to_OS.slurm +32 -0
  60. OceanDataStore/data/NSIDC/send_NSIDC_SII_v4.0_to_os.py +140 -0
  61. OceanDataStore/data/OISST/create_OISSTv2_daily_climatology.py +83 -0
  62. OceanDataStore/data/OISST/download_oisstv2_data.sh +43 -0
  63. OceanDataStore/data/OISST/run_create_OISSTv2_daily_climatology.slurm +44 -0
  64. OceanDataStore/data/OISST/run_send_OISSTv2_daily_climatology_to_os.slurm +32 -0
  65. OceanDataStore/data/OISST/run_send_OISSTv2_daily_to_os.slurm +32 -0
  66. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_climatology_to_os.slurm +32 -0
  67. OceanDataStore/data/OISST/run_send_OISSTv2_monthly_to_os.slurm +32 -0
  68. OceanDataStore/data/OISST/run_update_OISSTv2_daily_to_os.slurm +32 -0
  69. OceanDataStore/data/OISST/send_OISSTv2_daily_climatology_to_os.py +154 -0
  70. OceanDataStore/data/OISST/send_OISSTv2_daily_ltm_climatology_to_os.py +151 -0
  71. OceanDataStore/data/OISST/send_OISSTv2_daily_to_os.py +142 -0
  72. OceanDataStore/data/OISST/send_OISSTv2_monthly_climatology_to_os.py +150 -0
  73. OceanDataStore/data/OISST/send_OISSTv2_monthly_to_os.py +145 -0
  74. OceanDataStore/data/OISST/update_OISSTv2_daily_to_os.py +142 -0
  75. OceanDataStore/data/OSTIA/create_OSTIA_daily_climatology.py +120 -0
  76. OceanDataStore/data/OSTIA/download_OSTIA_NRT.py +42 -0
  77. OceanDataStore/data/OSTIA/download_OSTIA_REP_1981_2025.py +42 -0
  78. OceanDataStore/data/OSTIA/run_create_OSTIA_daily_climatology.slurm +54 -0
  79. OceanDataStore/data/OSTIA/run_send_OSTIA_daily_climatology_to_os.slurm +32 -0
  80. OceanDataStore/data/OSTIA/run_send_OSTIA_nrt_daily_to_os.slurm +32 -0
  81. OceanDataStore/data/OSTIA/run_send_OSTIA_rep_daily_to_os.slurm +32 -0
  82. OceanDataStore/data/OSTIA/run_update_OSTIA_daily_to_os.slurm +33 -0
  83. OceanDataStore/data/OSTIA/send_OSTIA_daily_climatology_to_os.py +194 -0
  84. OceanDataStore/data/OSTIA/send_OSTIA_nrt_daily_to_os.py +141 -0
  85. OceanDataStore/data/OSTIA/send_OSTIA_rep_daily_to_os.py +145 -0
  86. OceanDataStore/data/OSTIA/update_OSTIA_copernicus_nrt_daily_to_os.py +144 -0
  87. OceanDataStore/data/OSTIA/update_OSTIA_nrt_daily_to_os.py +137 -0
  88. OceanDataStore/data/WOA23/download_WOA23_climatology.sh +41 -0
  89. OceanDataStore/data/WOA23/run_send_WOA23_annual_climatology_to_os.slurm +32 -0
  90. OceanDataStore/data/WOA23/run_send_WOA23_monthly_climatology_to_os.slurm +32 -0
  91. OceanDataStore/data/WOA23/send_WOA23_annual_climatology_to_os.py +263 -0
  92. OceanDataStore/data/WOA23/send_WOA23_monthly_climatology_to_os.py +292 -0
  93. OceanDataStore/data/update_icechunk_repo_attrs.py +76 -0
  94. OceanDataStore/data/update_noc_npd_era5v1_attrs.py +172 -0
  95. OceanDataStore/data/utils.py +506 -0
  96. OceanDataStore/zarr.py +993 -0
  97. oceandatastore-0.3.0.dist-info/METADATA +184 -0
  98. oceandatastore-0.3.0.dist-info/RECORD +104 -0
  99. oceandatastore-0.3.0.dist-info/WHEEL +5 -0
  100. oceandatastore-0.3.0.dist-info/entry_points.txt +2 -0
  101. oceandatastore-0.3.0.dist-info/licenses/LICENSE +201 -0
  102. oceandatastore-0.3.0.dist-info/scm_file_list.json +154 -0
  103. oceandatastore-0.3.0.dist-info/scm_version.json +8 -0
  104. oceandatastore-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,52 @@
1
+ # ===================================================================
2
+ # Copyright 2026 National Oceanography Centre
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0.
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11
+ # implied. See the License for the specific language governing
12
+ # permissions and limitations under the License.
13
+ # ===================================================================
14
+ """
15
+ logging.py
16
+
17
+ Description:
18
+ This module defines the logging utility function for the OceanDataStore
19
+ package.
20
+
21
+ Authors:
22
+ - Ollie Tooth
23
+ """
24
+ import sys
25
+ import logging
26
+
27
+ from OceanDataStore.cli.arg_parser import __version__
28
+
29
+
30
def initialise_logging():
    """
    Set up OceanDataStore logging and log the start-up banner.

    Configures logging via ``logging.basicConfig`` so that records of
    level INFO and above are written to standard output using the
    OceanDataStore record layout, then logs a banner containing the
    package version.
    """
    record_layout = "🌐 OceanDataStore 🌐 | %(levelname)10s | %(asctime)s | %(message)s"
    timestamp_layout = "%Y-%m-%d %H:%M:%S"

    logging.basicConfig(
        level=logging.INFO,
        datefmt=timestamp_layout,
        format=record_layout,
        stream=sys.stdout,
    )

    # Banner shown once at start-up; the version is interpolated here.
    banner = f"""
.~~~.
.( ).~~~~~~.
~( ).~~~.
.( OceanDataStore ).
(___________________________).
version: {__version__}

"""
    # The `simple` flag is attached to the record as an extra attribute.
    logging.info(banner, extra={"simple": True})
@@ -0,0 +1,293 @@
1
+ # ===================================================================
2
+ # Copyright 2026 National Oceanography Centre
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0.
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11
+ # implied. See the License for the specific language governing
12
+ # permissions and limitations under the License.
13
+ # ===================================================================
14
+ """
15
+ object_store.py
16
+
17
+ Description:
18
+ This module defines the ObjectStoreS3 class, which is a subclass
19
+ of the S3FileSystem class from the s3fs library.
20
+
21
+ Authors:
22
+ - Ollie Tooth
23
+ - Joao Morado
24
+ - Tobias Ferreira
25
+ """
26
+ import json
27
+ import logging
28
+ from typing import Union
29
+
30
+ import icechunk
31
+ import s3fs
32
+
33
+
34
class ObjectStoreS3(s3fs.S3FileSystem):
    """
    S3 object store file system with Icechunk repository helpers.

    Subclass of ``s3fs.S3FileSystem`` which resolves the object store
    credentials (from a JSON file or from the arguments passed), turns
    them into s3fs storage options and re-uses the same credentials to
    create / open Icechunk repositories.

    Parameters
    ----------
    anon, bool (False)
        Whether to use anonymous connection (public buckets only).
    asynchronous, bool (False)
        Whether to use asynchronous operations (instance to be used inside coroutines).
    store_credentials_json, str (None)
        File path to object store credentials .json file. When provided,
        the `secret`, `key` and `endpoint_url` arguments are not used.
    secret, str (None)
        If not anonymous, use this secret key to access object store.
    key, str (None)
        If not anonymous, use this key to access object store.
    endpoint_url, str (None)
        Endpoint URL of object store. Needed for non-AWS S3 object stores.
    *fs_args, **fs_kwargs
        Additional arguments forwarded to ``s3fs.S3FileSystem``.
    """

    # Keys every credentials mapping is expected to provide.
    _CREDENTIAL_KEYS = ("token", "secret", "endpoint_url")

    # Defaults applied when the corresponding Icechunk kwargs are omitted.
    # Kept as class-level templates and copied per call so that no mutable
    # object is shared through a function default.
    _DEFAULT_STORAGE_CONFIG = {'region': 'us-east-1', 'force_path_style': True}
    _DEFAULT_STORAGE_SETTINGS = {
        'unsafe_use_conditional_update': False,
        'unsafe_use_conditional_create': False,
    }

    def __init__(
        self,
        anon: bool = False,
        asynchronous: bool = False,
        store_credentials_json: Union[str, None] = None,
        secret: Union[str, None] = None,
        key: Union[str, None] = None,
        endpoint_url: Union[str, None] = None,
        *fs_args,
        **fs_kwargs,
    ) -> None:

        # These are set before the parent constructor runs because
        # get_storage_options() reads them to build the parent's kwargs.
        self._anon = anon
        self._asynchronous = asynchronous

        # Get object store credentials:
        if store_credentials_json is None:
            logging.info(
                "No JSON file was provided. "
                "Object store credentials will be obtained from the arguments passed."
            )
            self._store_credentials = {
                "secret": secret,
                "token": key,
                "endpoint_url": endpoint_url,
            }
        else:
            self._store_credentials = self.load_store_credentials(store_credentials_json)

        # Configure storage options:
        self._storage_options = self.get_storage_options()

        super().__init__(*fs_args, **self._storage_options, **fs_kwargs)

    @staticmethod
    def load_store_credentials(path: str) -> dict:
        """
        Load the credentials of the object store from a JSON file.

        Parameters
        ----------
        path
            Absolute or relative filepath to the JSON file containing
            the object store credentials.

        Returns
        -------
        store_credentials
            Dictionary containing the values of the `token`,
            `secret` and `endpoint_url` keys used to access the
            object store. Keys missing from the file are added with
            a value of None (a warning is logged for each).

        Raises
        ------
        OSError
            If the file cannot be opened.
        json.JSONDecodeError
            If the file does not contain valid JSON.
        TypeError
            If the top-level JSON value is not an object (mapping).
        """
        # Errors from open() / json.load() propagate unchanged so callers
        # see the real cause (both are subclasses of Exception).
        with open(path, encoding="utf-8") as f:
            store_credentials = json.load(f)

        if not isinstance(store_credentials, dict):
            raise TypeError(
                "Object store credentials JSON must contain an object with "
                "'token', 'secret' and 'endpoint_url' keys."
            )

        for key in ObjectStoreS3._CREDENTIAL_KEYS:
            if key not in store_credentials:
                logging.warning(
                    '"%s" is not a key in the JSON file provided. Its value will be set to None.',
                    key,
                )
                # Honour the warning: without this, later lookups of the
                # missing key raise KeyError.
                store_credentials[key] = None

        return store_credentials

    def get_storage_options(
        self,
        set_async: bool = False,
    ) -> dict:
        """
        Get the storage options to access the object store.

        The options are rebuilt from the stored credentials on every call
        and also stored on the instance as ``_storage_options``.

        Parameters
        ----------
        set_async
            If True, force ``asynchronous=True`` in the returned options,
            overriding the value given at construction. Otherwise the
            construction-time value is used.

        Returns
        -------
        storage_options
            Dictionary containing the storage options to access the object store.
        """
        # Create storage options dict from credentials:
        self._storage_options = {
            "anon": self._anon,
            "secret": self._store_credentials["secret"],
            "key": self._store_credentials["token"],
            "client_kwargs": {
                "endpoint_url": self._store_credentials["endpoint_url"],
            },
            # NOTE(review): presumably required for S3-compatible stores that
            # do not support the default checksum behaviour - confirm.
            "config_kwargs": {
                "request_checksum_calculation": "when_required",
                "response_checksum_validation": "when_required",
            },
            # Optionally override the asynchronous option of ObjectStoreS3:
            "asynchronous": True if set_async else self._asynchronous,
        }

        return self._storage_options

    def create_bucket(
        self,
        bucket: str,
        **kwargs
    ) -> None:
        """
        Create a bucket in the object store.

        If the bucket already exists, this is logged and no error is raised.

        Parameters
        ----------
        bucket
            Name of bucket to create.
            Bucket names can consist only of lowercase letters,
            numbers, dots (.), and hyphens (-).
        **kwargs
            Additional keyword arguments forwarded to ``mkdir``.
        """
        try:
            return self.mkdir(bucket, **kwargs)
        except FileExistsError:
            logging.info("Bucket %s already exists.", bucket)

    def _icechunk_storage_and_config(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None],
        repository_config_kwargs: Union[dict, None],
        storage_settings_kwargs: Union[dict, None],
    ) -> tuple:
        """Build the Icechunk S3 storage and repository config shared by create / open."""
        if storage_config_kwargs is None:
            storage_config_kwargs = dict(self._DEFAULT_STORAGE_CONFIG)
        if repository_config_kwargs is None:
            repository_config_kwargs = {}
        if storage_settings_kwargs is None:
            storage_settings_kwargs = dict(self._DEFAULT_STORAGE_SETTINGS)

        # -- Define S3 storage -- #
        storage = icechunk.s3_storage(
            bucket=bucket,
            prefix=prefix,
            access_key_id=self._store_credentials["token"],
            secret_access_key=self._store_credentials["secret"],
            endpoint_url=self._store_credentials["endpoint_url"],
            **storage_config_kwargs,
        )

        # -- Define Icechunk repo config -- #
        repo_config = icechunk.RepositoryConfig(
            storage=icechunk.StorageSettings(**storage_settings_kwargs),
            **repository_config_kwargs,
        )

        return storage, repo_config

    def create_icechunk_repo(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None] = None,
        repository_config_kwargs: Union[dict, None] = None,
        storage_settings_kwargs: Union[dict, None] = None,
    ) -> icechunk.Repository:
        """
        Create a new Icechunk repository in cloud object storage.

        Parameters
        ----------
        bucket: str
            Name of bucket in s3 object store.
        prefix: str
            Name of prefix within bucket to store object.
        storage_config_kwargs
            Kwargs for icechunk.s3_storage().
            Defaults to {'region': 'us-east-1', 'force_path_style': True}.
            See: https://icechunk.io/en/latest/icechunk-python/storage/.
        repository_config_kwargs
            Kwargs for icechunk.RepositoryConfig(). Defaults to {}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/.
        storage_settings_kwargs
            Kwargs for icechunk.StorageSettings().
            Defaults to {'unsafe_use_conditional_update': False,
            'unsafe_use_conditional_create': False}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/#storage.

        Returns
        -------
        repo, icechunk.Repository
            Icechunk repository.
        """
        storage, repo_config = self._icechunk_storage_and_config(
            bucket,
            prefix,
            storage_config_kwargs,
            repository_config_kwargs,
            storage_settings_kwargs,
        )

        # -- Create Icechunk repo -- #
        return icechunk.Repository.create(
            storage=storage,
            config=repo_config,
        )

    def open_icechunk_repo(
        self,
        bucket: str,
        prefix: str,
        storage_config_kwargs: Union[dict, None] = None,
        repository_config_kwargs: Union[dict, None] = None,
        storage_settings_kwargs: Union[dict, None] = None,
    ) -> icechunk.Repository:
        """
        Open an existing Icechunk repository in cloud object storage.

        Parameters
        ----------
        bucket: str
            Name of bucket in s3 object store.
        prefix: str
            Name of prefix within bucket where the repository is stored.
        storage_config_kwargs
            Kwargs for icechunk.s3_storage().
            Defaults to {'region': 'us-east-1', 'force_path_style': True}.
            See: https://icechunk.io/en/latest/icechunk-python/storage/.
        repository_config_kwargs
            Kwargs for icechunk.RepositoryConfig(). Defaults to {}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/.
        storage_settings_kwargs
            Kwargs for icechunk.StorageSettings().
            Defaults to {'unsafe_use_conditional_update': False,
            'unsafe_use_conditional_create': False}.
            See: https://icechunk.io/en/latest/icechunk-python/configuration/#storage.

        Returns
        -------
        repo, icechunk.Repository
            Icechunk repository.
        """
        storage, repo_config = self._icechunk_storage_and_config(
            bucket,
            prefix,
            storage_config_kwargs,
            repository_config_kwargs,
            storage_settings_kwargs,
        )

        # -- Open existing Icechunk repo -- #
        return icechunk.Repository.open(
            storage=storage,
            config=repo_config,
        )
@@ -0,0 +1,275 @@
1
+ # ===================================================================
2
+ # Copyright 2026 National Oceanography Centre
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0.
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11
+ # implied. See the License for the specific language governing
12
+ # permissions and limitations under the License.
13
+ # ===================================================================
14
+ """
15
+ utils.py
16
+
17
+ Description:
18
+ This module defines utility functions and classes for the OceanDataStore CLI.
19
+
20
+
21
+ Authors:
22
+ - Ollie Tooth
23
+ """
24
+ # -- Import Python Modules -- #
25
+ import time
26
+ import logging
27
+ from typing import Optional
28
+
29
+ import glob
30
+ import xarray as xr
31
+
32
+ from dask.distributed.diagnostics.plugin import WorkerPlugin
33
+
34
+ # -- Import OceanDataStore Modules -- #
35
+ from OceanDataStore.cli.object_store import ObjectStoreS3
36
+
37
+
38
+ # -- Define Dask WorkerPlugins -- #
39
class CaptureWarningsPlugin(WorkerPlugin):
    """
    Dask worker plugin which redirects warnings into the logging
    system between worker setup and teardown.
    """

    def setup(self, worker):
        # Used to catch UserWarnings when rechunking:
        logging.captureWarnings(capture=True)

    def teardown(self, worker):
        # Restore the default handling of warnings:
        logging.captureWarnings(capture=False)
45
+
46
+
47
class CloseClientSessionPlugin(WorkerPlugin):
    """
    Dask worker plugin which closes cached s3fs client sessions and
    clears the s3fs instance cache when a worker is torn down.
    """

    async def teardown(self, worker):
        """
        Close the S3 client of every cached ``s3fs.S3FileSystem`` instance.

        Closing is best-effort: a failure to close one client is logged at
        debug level and does not prevent the remaining clients from being
        closed or the instance cache from being cleared.
        """
        import s3fs

        # Iterate over a snapshot since the cache is cleared afterwards.
        for fs in list(s3fs.S3FileSystem._cache.values()):
            try:
                client = getattr(fs, '_s3', None)
                if client is not None:
                    await client.close()
            except Exception:
                # Deliberately not re-raised: teardown must not fail because
                # a session was already closed or is otherwise unusable.
                logging.debug(
                    "Ignored error while closing S3 client session during worker teardown.",
                    exc_info=True,
                )
        s3fs.S3FileSystem.clear_instance_cache()
57
+
58
+
59
+ # -- Utility Classes & Functions -- #
60
class timer:
    """
    Timer context manager class to log the time
    taken to write variables & datasets to an
    object store.

    On successful exit of the ``with`` block, a "Completed: ..." message
    including the elapsed time is logged at INFO level. If the block
    raises, no completion message is logged and the exception propagates.

    Parameters
    ----------
    action : str
        Action to be performed. Options are 'send', 'replace' or 'append'.
    dest : str
        Destination path in the object store (with or without
        the 's3://' prefix).
    var : Optional[str], default=None
        Name of variable to be sent, replaced or appended in the store.

    Raises
    ------
    ValueError
        If `action` is not one of 'send', 'replace' or 'append'.
    """
    def __init__(self, action: str, dest: str, var: Optional[str] = None) -> None:
        # Phrase used in the completion message, with and without a variable name:
        if action == 'send':
            self.action = f'Sent {var} to' if var is not None else 'Sent dataset to'
        elif action == 'replace':
            self.action = f'Updated {var} in' if var is not None else 'Updated'
        elif action == 'append':
            self.action = f'Appended {var} to' if var is not None else 'Appended to'
        else:
            raise ValueError("Invalid action: must be 'send', 'replace' or 'append'.")
        self.dest = dest

    def __enter__(self):
        self.t_start = time.time()
        # Return the timer so `with timer(...) as t` exposes t_start / t_end.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.t_end = time.time()
        # Only report completion when the timed block did not raise.
        if exc_type is None:
            logging.info(
                "Completed: %s store s3://%s in %.2f seconds",
                self.action,
                # Strip any existing scheme so the prefix is not duplicated.
                self.dest.replace('s3://', ''),
                self.t_end - self.t_start,
            )
104
+
105
+
106
def _preprocess_dataset(file: list[str] | str | xr.Dataset,
                        rechunk: Optional[dict] = None,
                        append_dim: Optional[str] = "time_counter",
                        update_coords: Optional[dict] = None,
                        grid_filepath: Optional[str] = None,
                        attrs: Optional[dict] = None,
                        parallel: bool = False,
                        ) -> xr.Dataset:
    """
    Preprocess the dataset to be sent to the object store.

    Parameters
    ----------
    file: list | str | xarray.Dataset
        Filepath, glob pattern (containing '*') or list of filepaths
        to netCDF (.nc) file(s).
        Users can also pass a single xarray.Dataset directly.
    rechunk: Optional[dict], default=None
        Mapping to rechunk dimensions. If None, an empty mapping is
        passed as the `chunks` argument when opening / chunking.
    append_dim: str, default='time_counter'
        Name of the dimension to append multi-file datasets.
    update_coords: Optional[dict], default=None
        Mapping of coordinate variables to update using model
        grid file. Keys are coordinate variable names in the
        dataset to be sent, and values are the corresponding
        variable names in the model grid file. If None, no
        coordinates will be updated.
    grid_filepath: Optional[str], default=None
        Filepath to the model grid file to update coordinate
        variables. Required if update_coords is not None.
    attrs: Optional[dict], default=None
        Dictionary of attributes to add to the dataset.
        If None, no attributes will be added.
    parallel: bool, default=False
        Passed as the `parallel` argument of `xarray.open_mfdataset`
        when opening multiple files.

    Returns
    -------
    xr.Dataset
        Preprocessed (multifile) dataset with optionally
        updated coordinates, chunksizes and attributes.

    Raises
    ------
    TypeError
        If an argument has an unsupported type.
    ValueError
        If a filepath does not end with '.nc', if `file` is an empty
        list, or if `update_coords` is given without `grid_filepath`.
    FileNotFoundError
        If a glob pattern does not match any file.
    """
    # == Verify Inputs == #
    if not isinstance(file, (list, str, xr.Dataset)):
        raise TypeError("filepaths must be a list, a string or an xarray Dataset.")
    if isinstance(file, list):
        if not file:
            raise ValueError("filepaths must contain at least one file.")
        for fpath in file:
            if not isinstance(fpath, str):
                raise TypeError("filepaths must be a list of strings.")
            if not fpath.endswith('.nc'):
                raise ValueError("Invalid file extension: only .nc files are supported.")
    elif isinstance(file, str):
        if not file.endswith('.nc'):
            raise ValueError("Invalid file extension: only .nc files are supported.")
    if rechunk is not None and not isinstance(rechunk, dict):
        raise TypeError("rechunk must be a dictionary.")
    if not isinstance(append_dim, str):
        raise TypeError("append_dim must be a string.")
    if update_coords is not None and not isinstance(update_coords, dict):
        raise TypeError("update_coords must be a dictionary.")
    if grid_filepath is not None and not isinstance(grid_filepath, str):
        raise TypeError("grid_filepath must be a string.")
    if attrs is not None and not isinstance(attrs, dict):
        raise TypeError("attrs must be a dictionary.")
    if not isinstance(parallel, bool):
        raise TypeError("parallel must be a boolean.")
    # Fail before any file is opened if the grid file is missing:
    if update_coords is not None and grid_filepath is None:
        raise ValueError(
            "grid_filepath must be specified to update coordinate variables."
        )

    # === Load netCDF dataset === #
    if rechunk is None:
        # Default to dask chunks equal to on-disk chunks:
        rechunk = {}

    if isinstance(file, xr.Dataset):
        # Use input dataset:
        ds_filepath = file.chunk(rechunk)
    else:
        if isinstance(file, list):
            # File names from list:
            filepaths = file
        elif '*' in file:
            # File names from glob pattern:
            filepaths = sorted(glob.glob(file))
            if not filepaths:
                # Report the pattern that was searched, not the empty result.
                raise FileNotFoundError(f"No files found at {file}")
        else:
            filepaths = [file]

        if len(filepaths) > 1:
            # Open multi-file dataset:
            ds_filepath = xr.open_mfdataset(filepaths,
                                            engine='h5netcdf',
                                            chunks=rechunk,
                                            parallel=parallel,
                                            concat_dim=append_dim,
                                            combine='nested',
                                            data_vars='minimal',
                                            coords='minimal',
                                            compat='override'
                                            )
        else:
            # Open single file dataset:
            # NOTE(review): no engine is specified here, unlike the
            # multi-file branch ('h5netcdf') - confirm this is intended.
            ds_filepath = xr.open_dataset(filepaths[0], chunks=rechunk)

    # === Update coordinates using model grid file === #
    if update_coords is not None:
        ds_grid = xr.open_dataset(grid_filepath)
        # Update coordinate vars using model grid file:
        for key, grid_var in update_coords.items():
            coord_data = ds_grid[grid_var].squeeze(drop=True)
            # Rechunk only the dimensions the user specified chunks for;
            # dimensions absent from `rechunk` are left to xarray's default.
            coord_chunks = {
                dim: rechunk[dim] for dim in coord_data.dims if dim in rechunk
            }
            ds_filepath = ds_filepath.assign_coords(
                {key: coord_data.chunk(coord_chunks)}
            )
        logging.info('Completed: Updated coordinate variables.')

    # === Update Attributes === #
    if attrs is not None:
        ds_filepath = ds_filepath.assign_attrs(attrs)

    return ds_filepath
246
+
247
+
248
+ # -- Command Line Interface Utility Functions -- #
249
def list_objects(
    dest: str,
    store_credentials_json: str,
    ) -> list[str]:
    """
    List the objects contained inside a bucket / object.

    The listing is logged at INFO level and returned to the caller.

    Parameters
    ----------
    dest: str
        Destination path in the object store.
    store_credentials_json: str
        Path to the JSON file containing the object store credentials.

    Returns
    -------
    list[str]
        List of objects contained inside the bucket / object.
    """
    # === Initialise synchronous object store === #
    logging.info("Reading object store credentials from %s", store_credentials_json)
    obj_store = ObjectStoreS3(anon=False,
                              asynchronous=False,
                              store_credentials_json=store_credentials_json
                              )

    # === List objects === #
    objects = obj_store.ls(dest)
    logging.info(objects)

    # Return the listing as documented, instead of only logging it.
    return objects