climdata 0.0.5__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of climdata might be problematic. Click here for more details.

Files changed (55) hide show
  1. {climdata-0.0.5 → climdata-0.0.6}/.gitignore +3 -1
  2. {climdata-0.0.5 → climdata-0.0.6}/PKG-INFO +2 -1
  3. {climdata-0.0.5 → climdata-0.0.6}/climdata/__init__.py +2 -1
  4. {climdata-0.0.5 → climdata-0.0.6}/climdata/conf/config.yaml +1 -1
  5. climdata-0.0.6/climdata/datasets/ERA5.py +322 -0
  6. {climdata-0.0.5 → climdata-0.0.6}/climdata/datasets/MSWX.py +2 -0
  7. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/PKG-INFO +2 -1
  8. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/SOURCES.txt +2 -0
  9. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/requires.txt +1 -0
  10. climdata-0.0.6/examples/extract_dwd_loc.ipynb +310 -0
  11. climdata-0.0.6/examples/zarr_tas_data/metadata.json +1 -0
  12. {climdata-0.0.5 → climdata-0.0.6}/pyproject.toml +2 -2
  13. {climdata-0.0.5 → climdata-0.0.6}/requirements.txt +1 -0
  14. climdata-0.0.5/examples/extract_dwd_loc.ipynb +0 -1097
  15. {climdata-0.0.5 → climdata-0.0.6}/.editorconfig +0 -0
  16. {climdata-0.0.5 → climdata-0.0.6}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  17. {climdata-0.0.5 → climdata-0.0.6}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  18. {climdata-0.0.5 → climdata-0.0.6}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  19. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/docs-build.yml +0 -0
  20. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/docs.yml +0 -0
  21. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/installation.yml +0 -0
  22. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/macos.yml +0 -0
  23. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/pypi.yml +0 -0
  24. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/ubuntu.yml +0 -0
  25. {climdata-0.0.5 → climdata-0.0.6}/.github/workflows/windows.yml +0 -0
  26. {climdata-0.0.5 → climdata-0.0.6}/LICENSE +0 -0
  27. {climdata-0.0.5 → climdata-0.0.6}/MANIFEST.in +0 -0
  28. {climdata-0.0.5 → climdata-0.0.6}/README.md +0 -0
  29. {climdata-0.0.5 → climdata-0.0.6}/climdata/__main__.py +0 -0
  30. {climdata-0.0.5 → climdata-0.0.6}/climdata/conf/mappings/parameters.yaml +0 -0
  31. {climdata-0.0.5 → climdata-0.0.6}/climdata/conf/mappings/variables.yaml +0 -0
  32. {climdata-0.0.5 → climdata-0.0.6}/climdata/datasets/CMIP.py +0 -0
  33. {climdata-0.0.5 → climdata-0.0.6}/climdata/datasets/DWD.py +0 -0
  34. {climdata-0.0.5 → climdata-0.0.6}/climdata/main.py +0 -0
  35. {climdata-0.0.5 → climdata-0.0.6}/climdata/utils/__init__.py +0 -0
  36. {climdata-0.0.5 → climdata-0.0.6}/climdata/utils/config.py +0 -0
  37. {climdata-0.0.5 → climdata-0.0.6}/climdata/utils/utils_download.py +0 -0
  38. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/dependency_links.txt +0 -0
  39. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/entry_points.txt +0 -0
  40. {climdata-0.0.5 → climdata-0.0.6}/climdata.egg-info/top_level.txt +0 -0
  41. {climdata-0.0.5 → climdata-0.0.6}/docs/changelog.md +0 -0
  42. {climdata-0.0.5 → climdata-0.0.6}/docs/climdata.md +0 -0
  43. {climdata-0.0.5 → climdata-0.0.6}/docs/common.md +0 -0
  44. {climdata-0.0.5 → climdata-0.0.6}/docs/contributing.md +0 -0
  45. {climdata-0.0.5 → climdata-0.0.6}/docs/faq.md +0 -0
  46. {climdata-0.0.5 → climdata-0.0.6}/docs/index.md +0 -0
  47. {climdata-0.0.5 → climdata-0.0.6}/docs/installation.md +0 -0
  48. {climdata-0.0.5 → climdata-0.0.6}/docs/overrides/main.html +0 -0
  49. {climdata-0.0.5 → climdata-0.0.6}/docs/usage.md +0 -0
  50. {climdata-0.0.5 → climdata-0.0.6}/dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv +0 -0
  51. {climdata-0.0.5 → climdata-0.0.6}/mkdocs.yml +0 -0
  52. {climdata-0.0.5 → climdata-0.0.6}/requirements_dev.txt +0 -0
  53. {climdata-0.0.5 → climdata-0.0.6}/setup.cfg +0 -0
  54. {climdata-0.0.5 → climdata-0.0.6}/tests/__init__.py +0 -0
  55. {climdata-0.0.5 → climdata-0.0.6}/tests/test_climdata.py +0 -0
@@ -107,4 +107,6 @@ ENV/
107
107
  .vscode/
108
108
  climdata/conf/service.json
109
109
  outputs
110
- *.csv
110
+ *.csv
111
+ *.zarr
112
+ *.nc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: climdata
3
- Version: 0.0.5
3
+ Version: 0.0.6
4
4
  Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
5
5
  Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
6
6
  License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: zarr
37
37
  Requires-Dist: ipyleaflet
38
38
  Requires-Dist: wetterdienst
39
39
  Requires-Dist: pint-pandas
40
+ Requires-Dist: cdsapi
40
41
  Requires-Dist: hydra-core
41
42
  Requires-Dist: intake
42
43
  Requires-Dist: intake-esm
@@ -2,10 +2,11 @@
2
2
 
3
3
  __author__ = """Kaushik Muduchuru"""
4
4
  __email__ = "kaushik.reddy.m@gmail.com"
5
- __version__ = "0.0.5"
5
+ __version__ = "0.0.6"
6
6
 
7
7
  from .utils.utils_download import * # etc.
8
8
  from .utils.config import load_config
9
9
  from .datasets.DWD import DWDmirror as DWD
10
10
  from .datasets.MSWX import MSWXmirror as MSWX
11
+ from .datasets.ERA5 import ERA5Mirror as ERA5
11
12
 
@@ -4,7 +4,7 @@ defaults:
4
4
  - mappings/parameters
5
5
  - mappings/variables
6
6
  dataset: dwd
7
- data_dir: /beegfs/muduchuru/data
7
+ data_dir: ./data
8
8
  weather:
9
9
  parameter: tas # standardized variable name (e.g., tas, pr, rsds)
10
10
 
@@ -0,0 +1,322 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import tempfile
19
+ import cdsapi
20
+ import xarray as xr
21
+ import datetime
22
+ import json
23
+ import dask
24
+ import calendar
25
+ from dask.diagnostics import ProgressBar
26
+ from typing import List, Tuple, Dict, Union
27
+ import urllib3
28
+ import logging
29
+ import numpy as np
30
+ import fsspec
31
+
32
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
33
+
34
+
35
+ class ERA5Mirror:
36
+ """
37
+ A class to manage downloading ERA5 datasets. The datasets are downloaded from the Copernicus Climate Data Store (CDS) and stored in Zarr format.
38
+
39
+ Attributes
40
+ ----------
41
+ base_path : Path
42
+ The path to the Zarr dataset.
43
+ fs : fsspec.AbstractFileSystem
44
+ The filesystem to use for the Zarr dataset. If None, the local filesystem will be used.
45
+ """
46
+
47
def __init__(self, base_path: str, fs: fsspec.AbstractFileSystem = None):
    """Set up the mirror root and load the chunk-tracking metadata.

    Parameters
    ----------
    base_path : str
        Directory under which all Zarr stores and metadata live.
    fs : fsspec.AbstractFileSystem, optional
        Filesystem backing the store. Defaults to the local filesystem.
    """
    self.base_path = base_path
    # Fall back to the local filesystem when no explicit backend is given.
    self.fs = fsspec.filesystem("file") if fs is None else fs

    # Make sure the mirror root exists before any chunk is written.
    if not self.fs.exists(self.base_path):
        self.fs.makedirs(self.base_path)

    # metadata.json records which (variable, year, month, pressure_level)
    # chunks have already been mirrored, so repeated runs can skip them.
    self.metadata_file = os.path.join(self.base_path, "metadata.json")
    self.metadata = self.get_metadata()
61
+
62
def get_metadata(self):
    """Load the chunk-tracking metadata, or return a fresh record.

    Returns
    -------
    dict
        The parsed metadata, or ``{"chunks": []}`` when the file is
        missing or cannot be parsed.
    """
    fresh = {"chunks": []}
    # No file yet: nothing has been downloaded.
    if not self.fs.exists(self.metadata_file):
        return fresh
    with self.fs.open(self.metadata_file, "r") as f:
        try:
            return json.load(f)
        except json.decoder.JSONDecodeError:
            # A truncated/corrupt file is treated as "nothing mirrored yet".
            return fresh
73
+
74
def save_metadata(self):
    """Persist the chunk-tracking metadata to the metadata JSON file."""
    with self.fs.open(self.metadata_file, "w") as handle:
        json.dump(self.metadata, handle)
78
+
79
def chunk_exists(self, variable, year, month, pressure_level):
    """Return True when this (variable, year, month, pressure_level)
    chunk is already recorded in the mirror metadata."""
    return any(
        entry["variable"] == variable
        and entry["year"] == year
        and entry["month"] == month
        and entry["pressure_level"] == pressure_level
        for entry in self.metadata["chunks"]
    )
90
+
91
def download_chunk(
    self,
    variable: str,
    year: int,
    month: int,
    pressure_level: int = None,
):
    """
    Download one month of daily-statistics ERA5 data from the CDS.

    Parameters
    ----------
    variable : str
        The ERA5 variable to download, e.g. 'tisr' for solar radiation or 'z' for geopotential.
    year : int
        The year to download.
    month : int
        The month to download.
    pressure_level : int, optional
        A pressure level to include in the download, by default None. If None, the single-level data will be downloaded.

    Returns
    -------
    xr.Dataset
        An xarray Dataset containing the downloaded data, fully loaded
        into memory (the temporary download file is deleted on return).
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        # Request every day of the month.
        days_in_month = calendar.monthrange(year, month)[1]

        # Temporary NetCDF target; removed together with tmpdir.
        output_file = os.path.join(
            tmpdir,
            f"{variable}_{year}_{month:02d}_{str(pressure_level)}.nc",
        )

        # One short-lived CDS API client per request.
        c = cdsapi.Client(quiet=True)

        request_params = {
            "product_type": "reanalysis",
            "variable": [variable],
            "year": str(year),
            "month": str(month),
            "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],
            "time_zone": "utc+00:00",
            "frequency": "6_hourly",
            "daily_statistic": "daily_mean",
            "data_format": "netcdf"
        }
        # Pressure-level and single-level data live in different CDS datasets.
        if pressure_level:
            request_params["pressure_level"] = [str(pressure_level)]
            dataset_name = "derived-era5-pressure-levels-daily-statistics"
        else:
            dataset_name = "derived-era5-single-levels-daily-statistics"

        # Download the data (blocking call).
        c.retrieve(
            dataset_name,
            request_params,
            output_file,
        )

        ds = xr.open_dataset(output_file)
        # BUG FIX: xarray opens NetCDF files lazily, but the backing file
        # lives in tmpdir, which is deleted when this context manager
        # exits. Force the data into memory so the returned Dataset
        # remains valid after cleanup.
        ds.load()
    return ds
159
+
160
def variable_to_zarr_name(self, variable: str, pressure_level: int = None):
    """Build the Zarr store path for a variable, optionally suffixed
    with its pressure level."""
    pieces = [f"{self.base_path}/{variable}"]
    if pressure_level:
        pieces.append(f"_pressure_level_{pressure_level}")
    pieces.append(".zarr")
    return "".join(pieces)
168
+
169
def download_and_upload_chunk(
    self,
    variable: str,
    year: int,
    month: int,
    pressure_level: int = None,
):
    """
    Downloads a chunk of ERA5 data for a specific variable and date range, and uploads it to a Zarr array.
    This downloads a 1-month chunk of data.

    Parameters
    ----------
    variable : str
        The variable to download.
    year : int
        The year to download.
    month : int
        The month to download.
    pressure_level : int, optional
        Pressure level to download, if applicable.
    """

    ds = self.download_chunk(variable, year, month, pressure_level)
    # Newer CDS products name the time dimension "valid_time".
    if "valid_time" in ds.dims:
        ds = ds.rename({"valid_time": "time"})

    zarr_path = self.variable_to_zarr_name(variable, pressure_level)

    # One time step per chunk over the full 0.25-degree global grid
    # (721 x 1440) -- assumes the default ERA5 resolution; TODO confirm
    # for non-default grids.
    chunking = {"time": 1, "latitude": 721, "longitude": 1440}
    # BUG FIX: the vertical dimension is "level" in legacy products but
    # "pressure_level" in the daily-statistics datasets downloaded above;
    # chunk whichever one is present.
    for vertical_dim in ("level", "pressure_level"):
        if vertical_dim in ds.dims:
            chunking[vertical_dim] = 1

    ds = ds.chunk(chunking)

    # Append along time if the store already exists, otherwise create it.
    if self.fs.exists(zarr_path):
        mode = "a"
        append_dim = "time"
        create = False
    else:
        mode = "w"
        append_dim = None
        create = True

    mapper = self.fs.get_mapper(zarr_path, create=create)
    ds.to_zarr(mapper, mode=mode, consolidated=True, append_dim=append_dim)

    # Record the chunk so later runs skip it.
    self.metadata["chunks"].append(
        {
            "variable": variable,
            "year": year,
            "month": month,
            "pressure_level": pressure_level,
        }
    )
    self.save_metadata()
232
+
233
def download(
    self,
    variables: List[Union[str, Tuple[str, int]]],
    date_range: Tuple[datetime.date, datetime.date],
):
    """
    Start the process of mirroring the specified ERA5 variables for the given date range.

    Parameters
    ----------
    variables : List[Union[str, Tuple[str, int]]]
        A list of variables to mirror, where each element can either be a string (single-level variable)
        or a tuple (variable with pressure level).
    date_range : Tuple[datetime.date, datetime.date]
        A tuple containing the start and end dates for the data to be mirrored. This will download and store every month in the range.

    Returns
    -------
    zarr_paths : List[str]
        A list of Zarr paths for each of the variables.

    Raises
    ------
    RuntimeError
        If a mirrored Zarr store ends up with unevenly spaced time steps,
        which indicates a failed or partial download.
    """

    start_date, end_date = date_range

    # Normalize every entry to a (variable, pressure_level) tuple,
    # with pressure_level None for single-level variables.
    reformated_variables = []
    for variable in variables:
        if isinstance(variable, str):
            reformated_variables.append(tuple([variable, None]))
        else:
            reformated_variables.append(variable)

    with ProgressBar():
        # Round both endpoints down to the first of the month so the
        # loop walks whole months.
        current_date = start_date.replace(day=1)
        end_date = end_date.replace(day=1)

        while current_date <= end_date:
            # Collect one delayed download task per missing chunk.
            tasks = []
            for variable, pressure_level in reformated_variables:
                if not self.chunk_exists(
                    variable,
                    current_date.year,
                    current_date.month,
                    pressure_level,
                ):
                    task = dask.delayed(self.download_and_upload_chunk)(
                        variable,
                        current_date.year,
                        current_date.month,
                        pressure_level,
                    )
                    tasks.append(task)
                else:
                    print(
                        f"Chunk for {variable} {pressure_level} {current_date.year}-{current_date.month} already exists. Skipping."
                    )

            # Execute the tasks with Dask
            print(f"Downloading data for {current_date.year}-{current_date.month}")
            if tasks:
                dask.compute(*tasks)

            # Persist metadata after each month so progress survives crashes.
            self.save_metadata()

            # Advance to the first day of the next month.
            days_in_month = calendar.monthrange(
                year=current_date.year, month=current_date.month
            )[1]
            current_date += datetime.timedelta(days=days_in_month)

    # Collect the Zarr path for every mirrored variable.
    zarr_paths = []
    for variable, pressure_level in reformated_variables:
        zarr_path = self.variable_to_zarr_name(variable, pressure_level)
        zarr_paths.append(zarr_path)

    # Sanity-check each store: time steps must be evenly spaced.
    for zarr_path in zarr_paths:
        ds = xr.open_zarr(zarr_path)
        time_stamps = ds.time.values
        dt = time_stamps[1:] - time_stamps[:-1]
        # BUG FIX: with fewer than two time steps `dt` is empty and
        # `dt[0]` would raise IndexError; also, a plain `assert` is
        # stripped under `python -O`. Guard the size and raise explicitly.
        if dt.size > 0 and not np.all(dt == dt[0]):
            raise RuntimeError(
                f"Zarr array {zarr_path} has incorrect dt for time dimension. An error may have occurred during download. Please delete the Zarr array and try again."
            )

    return zarr_paths
@@ -23,6 +23,8 @@ from google.oauth2 import service_account
23
23
  from googleapiclient.discovery import build
24
24
  from googleapiclient.http import MediaIoBaseDownload
25
25
 
26
+ from climdata.utils.utils_download import list_drive_files, download_drive_file
27
+
26
28
  import io
27
29
  import requests
28
30
  from scipy.spatial import cKDTree
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: climdata
3
- Version: 0.0.5
3
+ Version: 0.0.6
4
4
  Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
5
5
  Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
6
6
  License: MIT License
@@ -37,6 +37,7 @@ Requires-Dist: zarr
37
37
  Requires-Dist: ipyleaflet
38
38
  Requires-Dist: wetterdienst
39
39
  Requires-Dist: pint-pandas
40
+ Requires-Dist: cdsapi
40
41
  Requires-Dist: hydra-core
41
42
  Requires-Dist: intake
42
43
  Requires-Dist: intake-esm
@@ -32,6 +32,7 @@ climdata/conf/mappings/parameters.yaml
32
32
  climdata/conf/mappings/variables.yaml
33
33
  climdata/datasets/CMIP.py
34
34
  climdata/datasets/DWD.py
35
+ climdata/datasets/ERA5.py
35
36
  climdata/datasets/MSWX.py
36
37
  climdata/utils/__init__.py
37
38
  climdata/utils/config.py
@@ -46,5 +47,6 @@ docs/installation.md
46
47
  docs/usage.md
47
48
  docs/overrides/main.html
48
49
  examples/extract_dwd_loc.ipynb
50
+ examples/zarr_tas_data/metadata.json
49
51
  tests/__init__.py
50
52
  tests/test_climdata.py
@@ -18,6 +18,7 @@ zarr
18
18
  ipyleaflet
19
19
  wetterdienst
20
20
  pint-pandas
21
+ cdsapi
21
22
  hydra-core
22
23
  intake
23
24
  intake-esm