climdata-0.0.2-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,976 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from wetterdienst import Settings
4
+ from wetterdienst.provider.dwd.observation import DwdObservationRequest
5
+ import geemap
6
+ import ee
8
+ import geopandas as gpd
9
+ from omegaconf import DictConfig
10
+ import os
11
+ import yaml
12
+ import time
13
+ from tqdm import tqdm
14
+ import warnings
15
+ from datetime import datetime, timedelta
16
+ import xarray as xr
17
+ import hydra
19
+ import pint
20
+ import pint_pandas
21
+
22
+ from google.oauth2 import service_account
23
+ from googleapiclient.discovery import build
24
+ from googleapiclient.http import MediaIoBaseDownload
25
+
26
+ import io
27
+ import requests
28
+ from scipy.spatial import cKDTree
29
+ import argparse
30
+ import re
31
+
32
+ import requests
33
+ from bs4 import BeautifulSoup
34
+ import concurrent.futures
35
+
36
+ warnings.filterwarnings("ignore", category=Warning)
37
+
38
+ def fetch_dwd_loc(cfg: DictConfig):
39
+
40
+ param_mapping = cfg.mappings
41
+ provider = cfg.dataset.lower()
42
+ parameter_key = cfg.weather.parameter
43
+ # Validate provider and parameter
44
+
45
+ if provider not in param_mapping:
46
+ raise ValueError(f"Provider '{provider}' not found in parameter map.")
47
+ if parameter_key not in param_mapping[provider]['variables']:
48
+ raise ValueError(f"Parameter '{parameter_key}' not defined for provider '{provider}'.")
49
+
50
+ param_info = param_mapping[provider]['variables'][parameter_key]
51
+ resolution = param_info["resolution"]
52
+ dataset = param_info["dataset"]
53
+ variable_name = param_info["name"]
54
+ units = param_info.get("unit", None)
55
+
56
+ lat = cfg.location.lat
57
+ lon = cfg.location.lon
58
+ distance = cfg.location.buffer_km
59
+ start_date = cfg.time_range.start_date
60
+ end_date = cfg.time_range.end_date
61
+ output_file = cfg.output.filename
62
+
63
+ settings = Settings(
64
+ ts_shape="long",
65
+ ts_humanize=True,
66
+ # ts_si_units=False
67
+ )
68
+
69
+ request = DwdObservationRequest(
70
+ parameters=(resolution, dataset, variable_name),
71
+ start_date=start_date,
72
+ end_date=end_date,
73
+ settings=settings
74
+ ).filter_by_distance(
75
+ latlon=(lat, lon),
76
+ distance=distance,
77
+ unit="km"
78
+ )
79
+
80
+ df = request.values.all().df.to_pandas()
81
+
82
+ df['date'] = pd.to_datetime(df['date'])
83
+ df = df.groupby(['date']).agg({
84
+ 'value': 'mean',
85
+ 'station_id': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
86
+ 'resolution': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
87
+ 'dataset': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
88
+ 'parameter': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
89
+ 'quality': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
90
+ }).reset_index()
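+ # average the observed value across all stations inside the buffer for each timestamp; the metadata columns keep their most frequent entry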
91
+
94
+
95
+ # Standardize column names
96
+ df = df.rename(columns={
97
+ "date": "time",
98
+ "value": "value",
99
+ "station_id": "frequent_station",
100
+
101
+ })
102
+ df["variable"] = parameter_key
103
+ df["latitude"] = lat
104
+ df["longitude"] = lon
105
+ df['source'] = 'DWD'
106
+ df['units'] = units
107
+ df = df[["latitude", "longitude", "time", "source", "variable", "value","units"]]
108
+
109
+ out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
110
+ os.makedirs(out_dir, exist_ok=True)
111
+ out_path = os.path.join(out_dir, cfg.output.filename)
112
+
113
+ df.to_csv(out_path, index=False)
114
+ print(f"✅ Saved time series to: {out_path}")
115
+ return df
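+ # A minimal driving sketch (hypothetical config names; the keys mirror the reads above):
+ #   from hydra import compose, initialize
+ #   with initialize(config_path="conf"):
+ #       cfg = compose(config_name="config")
+ #   df = fetch_dwd_loc(cfg)  # tidy frame: latitude, longitude, time, source, variable, value, units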
116
+ def fetch_ee_loc(cfg: DictConfig):
117
+ ee.Initialize(project='earthengine-462007')
118
+
119
+ provider = cfg.dataset.lower()
120
+ variable_name = cfg.weather.parameter
121
+ ee_image_collection = cfg.mappings[provider].params.collection
122
+
123
+ # Prepare the image collection
124
+ sd = cfg.time_range.start_date
125
+ ed = cfg.time_range.end_date
126
+ var_name = cfg.mappings[provider].variables[variable_name].name
127
+ units = cfg.mappings[provider].variables[variable_name].unit
128
+ if provider=='gddp':
129
+ model = cfg.mappings[provider].params.model
130
+ scenario = cfg.mappings[provider].params.scenario
131
+ dataset = ee.ImageCollection(ee_image_collection)\
132
+ .filter(ee.Filter.date(sd, ed))\
133
+ .filter(ee.Filter.eq('model', model))\
134
+ .filter(ee.Filter.eq('scenario', scenario))
135
+ elif provider=='era5-land':
136
+ dataset = ee.ImageCollection(ee_image_collection)\
137
+ .filter(ee.Filter.date(sd, ed))
138
+ else:
139
+ raise ValueError(f"Provider '{provider}' is not supported for Earth Engine data fetching.")
140
+ image_var = dataset.select(var_name)
141
+
142
+
143
+ lat = cfg.location.lat
144
+ lon = cfg.location.lon
145
+ # identifier = cfg.location.id
146
+ out_dir = cfg.output.out_dir
147
+ # buffer = cfg.location.buffer_km
148
+ buffer = None
149
+ scale = cfg.mappings[provider].params.scale
150
+ # retry_delay = cfg.download.retry_delay
151
+
152
+ os.makedirs(out_dir, exist_ok=True)
153
+
154
+ df = pd.DataFrame([{ "lat": lat, "lon": lon, "id": 0}])
155
+ gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326")
156
+
157
+ try:
158
+ gdf_ee = geemap.gdf_to_ee(gdf)
159
+
160
+ # if buffer:
161
+ # pixel_values = gdf_ee.map(
162
+ # lambda f: f.set('ts', image_var.getRegion(
163
+ # f.buffer(buffer*1e3).bounds().geometry(), scale))
164
+ # )
165
+ # else:
166
+ pixel_values = gdf_ee.map(
167
+ lambda f: f.set('ts', image_var.getRegion(
168
+ f.geometry(), scale))
169
+ )
170
+
171
+ pixel_values_info = pixel_values.getInfo()
172
+
173
+ for feature in pixel_values_info['features']:
174
+ data = feature['properties']['ts']
175
+ data_id = feature['properties']['id']
176
+
177
+ if data:
178
+ columns = data[0]
179
+ rows = data[1:]
180
+ df_out = pd.DataFrame(rows, columns=columns)
181
+
182
+ out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
183
+ os.makedirs(out_dir, exist_ok=True)
184
+ out_path = os.path.join(out_dir, cfg.output.filename)
185
+
186
+ df_out["variable"] = variable_name
187
+ df_out["latitude"] = lat
188
+ df_out["longitude"] = lon
189
+ df_out['source'] = provider.upper()
190
+ df_out['units'] = units
191
+ df_out['time'] = pd.to_datetime(df_out['time'], unit='ms')
192
+ df_out.rename(columns={var_name: 'value'}, inplace=True)  # getRegion labels the data column with the EE band name (var_name)
193
+ df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value","units"]]
194
+
195
+ df_out.to_csv(out_path, index=False)
196
+ print(f"[\u2713] Saved: {out_path}")
197
+
198
+ return df_out
199
+ else:
200
+ print(f"[!] No data for ID {data_id}")
201
+
202
+ except Exception as e:
203
+ print(f"[\u2717] Error: {e}")
204
+ # time.sleep(retry_delay)
205
+ raise RuntimeError("Failed to download data.")
206
+
207
+ def fetch_ee_loc_mod(cfg: DictConfig):
208
+ # Initialize Earth Engine
209
+ ee.Initialize(project='earthengine-462007')
210
+
211
+ provider = cfg.dataset.lower()
212
+ variable_name = cfg.weather.parameter
213
+ ee_image_collection = cfg.mappings[provider].params.collection
214
+
215
+ sd = cfg.time_range.start_date
216
+ ed = cfg.time_range.end_date
217
+ var_name = cfg.mappings[provider].variables[variable_name].name
218
+ units = cfg.mappings[provider].variables[variable_name].unit
219
+ scale = cfg.mappings[provider].params.scale
220
+ out_dir = cfg.output.out_dir
221
+
222
+ lat = cfg.location.lat
223
+ lon = cfg.location.lon
224
+
225
+ # Handle model/scenario if needed
226
+ if provider == 'gddp':
227
+ model = cfg.mappings[provider].params.model
228
+ scenario = cfg.mappings[provider].params.scenario
229
+ dataset = ee.ImageCollection(ee_image_collection) \
230
+ .filter(ee.Filter.date(sd, ed)) \
231
+ .filter(ee.Filter.eq('model', model)) \
232
+ .filter(ee.Filter.eq('scenario', scenario))
233
+ elif provider == 'era5-land':
234
+ dataset = ee.ImageCollection(ee_image_collection) \
235
+ .filter(ee.Filter.date(sd, ed))
236
+ else:
237
+ raise ValueError(f"Provider '{provider}' is not supported.")
238
+
239
+ image_var = dataset.select(var_name)
240
+ point = ee.Geometry.Point(lon, lat)
241
+
242
+ os.makedirs(out_dir, exist_ok=True)
243
+ results = []
244
+
245
+ print(f"[i] Fetching time series for point: ({lat}, {lon})")
246
+
247
+ # Use a client-side list of images
248
+ image_list = image_var.toList(image_var.size())
249
+ n_images = image_var.size().getInfo()
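+ # each loop iteration below makes two server round-trips (one getInfo for the date, one for the value), so runtime grows linearly with n_images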
250
+
251
+ for i in tqdm(range(n_images), desc="Processing images"):
252
+ try:
253
+ img = ee.Image(image_list.get(i))
254
+ date = img.date().format('YYYY-MM-dd').getInfo()
255
+
256
+ value = img.reduceRegion(
257
+ reducer=ee.Reducer.first(),
258
+ geometry=point,
259
+ scale=scale,
260
+ bestEffort=True
261
+ ).get(var_name)
262
+
263
+ value = value.getInfo() if value else None
264
+ results.append({"date": date, var_name: value})
265
+ except Exception as e:
266
+ print(f"[!] Skipping image {i} due to error: {e}")
267
+ continue
268
+
269
+ df_out = pd.DataFrame(results)
270
+ out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
271
+ os.makedirs(out_dir, exist_ok=True)
272
+ out_path = os.path.join(out_dir, cfg.output.filename)
273
+
274
+ df_out["variable"] = variable_name
275
+ df_out["latitude"] = lat
276
+ df_out["longitude"] = lon
277
+ df_out['units'] = units
278
+ df_out['source'] = provider.upper()
279
+ df_out.rename(columns={var_name: 'value', "date": 'time'}, inplace=True)
280
+ df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value",'units']]
281
+
282
+ # ureg = pint.UnitRegistry()
283
+ # pint_pandas.PintType.ureg = ureg
284
+ # df_out['temperature'] = df['temperature'].astype('pint[C]')
285
+
286
+ df_out.to_csv(out_path, index=False)
287
+ print(f"[✓] Saved timeseries to: {out_path}")
288
+ return df_out
289
+ def list_drive_files(folder_id, service):
290
+ """
291
+ List all files in a Google Drive folder, handling pagination.
292
+ """
293
+ files = []
294
+ page_token = None
295
+
296
+ while True:
297
+ results = service.files().list(
298
+ q=f"'{folder_id}' in parents and trashed = false",
299
+ fields="files(id, name), nextPageToken",
300
+ pageToken=page_token
301
+ ).execute()
302
+
303
+ files.extend(results.get("files", []))
304
+ page_token = results.get("nextPageToken", None)
305
+
306
+ if not page_token:
307
+ break
308
+
309
+ return files
310
+ def download_drive_file(file_id, local_path, service):
311
+ """
312
+ Download a single file from Drive to a local path.
313
+ """
314
+ request = service.files().get_media(fileId=file_id)
315
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
316
+
317
+ with io.FileIO(local_path, 'wb') as fh:
318
+ downloader = MediaIoBaseDownload(fh, request)
319
+
320
+ done = False
321
+ while not done:
322
+ status, done = downloader.next_chunk()
323
+ print(f" → Download {int(status.progress() * 100)}% complete")
324
+ def fetch_MSWX(var_cfg):
325
+ param_mapping = var_cfg.mappings
326
+ provider = var_cfg.dataset.lower()
327
+ parameter_key = var_cfg.weather.parameter
328
+
329
+ param_info = param_mapping[provider]['variables'][parameter_key]
330
+ folder_id = param_info["folder_id"]
331
+
332
+ start_date = var_cfg.time_range.start_date
333
+ end_date = var_cfg.time_range.end_date
334
+
335
+ # === 1) Generate expected filenames ===
336
+ start = datetime.fromisoformat(start_date)
337
+ end = datetime.fromisoformat(end_date)
338
+
339
+ expected_files = []
340
+ current = start
341
+ while current <= end:
342
+ doy = current.timetuple().tm_yday
343
+ basename = f"{current.year}{doy:03d}.nc"
344
+ expected_files.append(basename)
345
+ current += timedelta(days=1)
346
+
347
+ output_dir = var_cfg.data_dir
348
+ local_files = []
349
+ missing_files = []
350
+
351
+ for basename in expected_files:
352
+ local_path = os.path.join(output_dir, provider, parameter_key, basename)
353
+ if os.path.exists(local_path):
354
+ local_files.append(basename)
355
+ else:
356
+ missing_files.append(basename)
357
+
358
+ if not missing_files:
359
+ print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
360
+ return local_files
361
+
362
+ print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
363
+
364
+ # === 2) Connect to Drive ===
365
+ SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
366
+ creds = service_account.Credentials.from_service_account_file(
367
+ param_mapping[provider].params.google_service_account, scopes=SCOPES
368
+ )
369
+ service = build('drive', 'v3', credentials=creds)
370
+
371
+ # === 3) List all Drive files ===
372
+ drive_files = list_drive_files(folder_id, service)
373
+ valid_filenames = set(missing_files)
374
+
375
+ files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
376
+
377
+ if not files_to_download:
378
+ print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
379
+ return local_files
380
+
381
+ # === 4) Download missing ===
382
+ for file in files_to_download:
383
+ filename = file['name']
384
+ local_path = os.path.join(output_dir, provider, parameter_key, filename)
385
+ print(f"⬇️ Downloading {filename} ...")
386
+ download_drive_file(file['id'], local_path, service)
387
+ local_files.append(filename)
388
+
389
+ return local_files
390
+
391
+
392
+ def fetch_dwd(var_cfg):
393
+ """Download HYRAS data for one variable and a list of years."""
394
+ param_mapping = var_cfg.mappings
395
+ provider = var_cfg.dataset.lower()
396
+ parameter_key = var_cfg.weather.parameter
397
+ # Validate provider and parameter
398
+
399
+ param_info = param_mapping[provider]['variables'][parameter_key]
400
+ base_url = param_info["base_url"]
401
+ prefix = param_info["prefix"]
402
+ version = param_info["version"]
403
+
404
+ start_date = var_cfg.time_range.start_date
405
+ end_date = var_cfg.time_range.end_date
406
+
407
+ # Parse dates & extract unique years
408
+ start_year = datetime.fromisoformat(start_date).year
409
+ end_year = datetime.fromisoformat(end_date).year
410
+ years = list(range(start_year, end_year + 1))
411
+
412
+ # output_file = cfg.output.filename
413
+ os.makedirs(parameter_key, exist_ok=True)
414
+
415
+ for year in years:
416
+ file_name = f"{prefix}_{year}_{version}_de.nc"
417
+ file_url = f"{base_url}{file_name}"
418
+ local_path = os.path.join(var_cfg.data_dir,provider,parameter_key.upper(), file_name)
419
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
420
+ print(f"⬇️ Checking: {file_url}")
421
+
422
+ # Check if file exists on server first (HEAD request)
423
+ head = requests.head(file_url)
424
+ if head.status_code != 200:
425
+ raise FileNotFoundError(f"❌ Not found on server: {file_url} (HTTP {head.status_code})")
426
+
427
+ if os.path.exists(local_path):
428
+ print(f"✔️ Exists locally: {local_path}")
429
+ continue
430
+
431
+ print(f"⬇️ Downloading: {file_url}")
432
+ try:
433
+ response = requests.get(file_url, stream=True)
434
+ response.raise_for_status()
435
+ with open(local_path, "wb") as f:
436
+ for chunk in response.iter_content(chunk_size=8192):
437
+ f.write(chunk)
438
+ print(f"✅ Saved: {local_path}")
439
+ except requests.HTTPError as e:
440
+ raise RuntimeError(f"❌ Failed download: {file_url} — {e}")
441
+
442
+ def find_nearest_xy(ds, target_lat, target_lon):
443
+ """
444
+ Given a dataset with curvilinear grid, find the nearest x,y index.
445
+ """
446
+ lat = ds['lat'].values # shape (y,x) or (x,y)
447
+ lon = ds['lon'].values
448
+
449
+ # Flatten to 1D for k-d tree
450
+ lat_flat = lat.flatten()
451
+ lon_flat = lon.flatten()
452
+
453
+ tree = cKDTree(np.column_stack((lat_flat, lon_flat)))
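+ # nearest neighbour is computed in raw lat/lon degrees, which is a reasonable approximation for a dense national grid such as HYRAS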
454
+ _, idx = tree.query([target_lat, target_lon])
455
+ iy, ix = np.unravel_index(idx, lat.shape)
456
+
457
+ return iy, ix
458
+
459
+ def extract_ts_dwd(cfg: DictConfig):
460
+ param_mapping = cfg.mappings
461
+ provider = cfg.dataset.lower()
462
+ parameter_key = cfg.weather.parameter
463
+ # Validate provider and parameter
464
+
465
+ param_info = param_mapping[provider]['variables'][parameter_key]
466
+ prefix = param_info["prefix"]
467
+ version = param_info["version"]
468
+
469
+ start_date = cfg.time_range.start_date
470
+ end_date = cfg.time_range.end_date
471
+
472
+ # Parse dates & extract unique years
473
+ start_year = datetime.fromisoformat(start_date).year
474
+ end_year = datetime.fromisoformat(end_date).year
475
+ years = list(range(start_year, end_year + 1))
476
+ files=[]
477
+ for year in years:
478
+ file_name = f"{prefix}_{year}_{version}_de.nc"
479
+ files.append(os.path.join(cfg.data_dir,provider,parameter_key.upper(), file_name))
480
+
481
+ if not files:
482
+ raise FileNotFoundError(f"No NetCDF files found for {parameter_key}")
483
+
484
+ target_lat = cfg.location.lat
485
+ target_lon = cfg.location.lon
486
+
487
+ ts_list = []
488
+
489
+ for f in files:
490
+ print(f"📂 Opening: {f}")
491
+ ds = xr.open_dataset(f)
492
+
493
+ # Dimensions: (time, y, x) or (time, x, y)
494
+ # lat/lon: 2D
495
+ time_name = [x for x in ds.coords if "time" in x.lower()][0]
496
+
497
+ iy, ix = find_nearest_xy(ds, target_lat, target_lon)
498
+
499
+ print(f"📌 Nearest grid point at (y,x)=({iy},{ix})")
500
+
501
+ ts = ds[parameter_key].isel(x=ix, y=iy) # watch order: dims must match
502
+
503
+ df = ts.to_dataframe().reset_index()[[time_name, parameter_key]]
504
+ ts_list.append(df)
505
+
506
+ # Combine all time series
507
+ ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
508
+
509
+ # Slice on combined DataFrame
510
+ ts_all[time_name] = pd.to_datetime(ts_all[time_name])
511
+ mask = (ts_all[time_name] >= start_date) & (ts_all[time_name] <= end_date)
512
+ ts_all = ts_all.loc[mask].reset_index(drop=True)
513
+
514
+ out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
515
+ os.makedirs(out_dir, exist_ok=True)
516
+ out_path = os.path.join(out_dir, cfg.output.filename)
517
+
518
+ ts_all["variable"] = param_info['name']
519
+ ts_all["latitude"] = target_lat
520
+ ts_all["longitude"] = target_lon
521
+ ts_all['source'] = provider.upper()
522
+ ts_all['units'] = ts.attrs['units']
523
+ ts_all.rename(columns={parameter_key: 'value', time_name: 'time'}, inplace=True)  # the data column carries the variable name selected from the NetCDF file
524
+ ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value",'units']]
525
+ ts_all.to_csv(out_path, index=False)
526
+ print(f"✅ Saved time series to: {out_path}")
527
+
528
+ return ts_all
529
+ def extract_ts_MSWX(cfg: DictConfig):
530
+ parameter = cfg.weather.parameter
531
+ param_mapping = cfg.mappings
532
+ provider = cfg.dataset.lower()
533
+ parameter_key = cfg.weather.parameter
534
+ # Validate provider and parameter
535
+
536
+ param_info = param_mapping[provider]['variables'][parameter_key]
537
+
538
+ base_dir = cfg.data_dir
539
+
540
+ target_lat = cfg.location.lat
541
+ target_lon = cfg.location.lon
542
+
543
+ start_date = pd.to_datetime(cfg.time_range.start_date)
544
+ end_date = pd.to_datetime(cfg.time_range.end_date)
545
+
546
+ # === 1) Rebuild exact basenames ===
547
+ current = start_date
548
+ basenames = []
549
+ while current <= end_date:
550
+ doy = current.timetuple().tm_yday
551
+ basename = f"{current.year}{doy:03d}.nc"
552
+ basenames.append(basename)
553
+ current += timedelta(days=1)
554
+
555
+ # === 2) Process only those files ===
556
+ ts_list = []
557
+ missing = []
558
+
559
+ for basename in basenames:
560
+ file_path = os.path.join(base_dir, provider, parameter, basename)
561
+
562
+ if not os.path.exists(file_path):
563
+ missing.append(basename)
564
+ continue
565
+
566
+ print(f"📂 Opening: {file_path}")
567
+ ds = xr.open_dataset(file_path)
568
+
569
+ time_name = [x for x in ds.coords if "time" in x.lower()][0]
570
+ data_var = [v for v in ds.data_vars][0]
571
+
572
+ ts = ds[data_var].sel(
573
+ lat=target_lat,
574
+ lon=target_lon,
575
+ method='nearest'
576
+ )
577
+
578
+ df = ts.to_dataframe().reset_index()[[time_name, data_var]]
579
+ ts_list.append(df)
580
+
581
+ if missing:
582
+ print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
583
+ for m in missing:
584
+ print(f" - {m}")
585
+
586
+ if not ts_list:
587
+ raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
588
+
589
+ # === 3) Combine and slice (for safety) ===
590
+ ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
591
+
592
+ ts_all[time_name] = pd.to_datetime(ts_all[time_name])
593
+ ts_all = ts_all[
594
+ (ts_all[time_name] >= start_date) &
595
+ (ts_all[time_name] <= end_date)
596
+ ].reset_index(drop=True)
597
+
598
+ out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
599
+ os.makedirs(out_dir, exist_ok=True)
600
+ out_path = os.path.join(out_dir, cfg.output.filename)
601
+
602
+ ts_all["variable"] = param_info['name']
603
+ ts_all["latitude"] = target_lat
604
+ ts_all["longitude"] = target_lon
605
+ ts_all['source'] = provider.upper()
606
+ ts_all['units'] = ts.attrs['units']
607
+ ts_all.rename(columns={data_var: 'value', time_name: 'time'}, inplace=True)  # data_var is the variable actually present in the MSWX file
608
+ ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value",'units']]
609
+
610
+ ts_all.to_csv(out_path, index=False)
611
+ print(f"✅ Saved MSWX time series to: {out_path}")
612
+
613
+ return ts_all
614
+
615
+ import os
616
+ from omegaconf import DictConfig
617
+
618
+ def build_output_filename(cfg: DictConfig) -> str:
619
+ """Generate full output file path from pattern and config."""
620
+ provider = cfg.dataset.lower()
621
+ parameter = cfg.weather.parameter
622
+ lat = cfg.location.lat
623
+ lon = cfg.location.lon
624
+ start = cfg.time_range.start_date
625
+ end = cfg.time_range.end_date
626
+
627
+ pattern = cfg.output.get("filename", "{provider}_{parameter}_{start}_{end}.csv")
628
+ filename = pattern.format(
629
+ provider=provider,
630
+ parameter=parameter,
631
+ lat=lat,
632
+ lon=lon,
633
+ start=start,
634
+ end=end
635
+ )
636
+
637
+ out_dir = cfg.output.out_dir
638
+ fmt = cfg.output.fmt  # 'format' would shadow the built-in, so use 'fmt'
639
+
640
+ # return os.path.join(out_dir, fmt, filename)
641
+ return filename
642
+
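+ # A small illustration of the pattern above (values are illustrative; unused keys such as lat/lon are simply ignored by str.format):
+ #   "{provider}_{parameter}_{start}_{end}.csv".format(
+ #       provider="era5-land", parameter="precipitation", lat=52.5, lon=13.4,
+ #       start="2020-01-01", end="2020-12-31")
+ #   -> "era5-land_precipitation_2020-01-01_2020-12-31.csv"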
643
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
644
+ # SPDX-FileCopyrightText: All rights reserved.
645
+ # SPDX-License-Identifier: Apache-2.0
646
+ #
647
+ # Licensed under the Apache License, Version 2.0 (the "License");
648
+ # you may not use this file except in compliance with the License.
649
+ # You may obtain a copy of the License at
650
+ #
651
+ # http://www.apache.org/licenses/LICENSE-2.0
652
+ #
653
+ # Unless required by applicable law or agreed to in writing, software
654
+ # distributed under the License is distributed on an "AS IS" BASIS,
655
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
656
+ # See the License for the specific language governing permissions and
657
+ # limitations under the License.
658
+ '''
659
+ import os
660
+ import tempfile
661
+ import cdsapi
662
+ import xarray as xr
663
+ import datetime
664
+ import json
665
+ import dask
666
+ import calendar
667
+ from dask.diagnostics import ProgressBar
668
+ from typing import List, Tuple, Dict, Union
669
+ import urllib3
670
+ import logging
671
+ import numpy as np
672
+ import fsspec
673
+
674
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
675
+
676
+
677
+ class ERA5Mirror:
678
+ """
679
+ A class to manage downloading ERA5 datasets. The datasets are downloaded from the Copernicus Climate Data Store (CDS) and stored in Zarr format.
680
+
681
+ Attributes
682
+ ----------
683
+ base_path : Path
684
+ The path to the Zarr dataset.
685
+ fs : fsspec.AbstractFileSystem
686
+ The filesystem to use for the Zarr dataset. If None, the local filesystem will be used.
687
+ """
688
+
689
+ def __init__(self, base_path: str, fs: fsspec.AbstractFileSystem = None):
690
+ # Get parameters
691
+ self.base_path = base_path
692
+ if fs is None:
693
+ fs = fsspec.filesystem("file")
694
+ self.fs = fs
695
+
696
+ # Create the base path if it doesn't exist
697
+ if not self.fs.exists(self.base_path):
698
+ self.fs.makedirs(self.base_path)
699
+
700
+ # Create metadata that will be used to track which chunks have been downloaded
701
+ self.metadata_file = os.path.join(self.base_path, "metadata.json")
702
+ self.metadata = self.get_metadata()
703
+
704
+ def get_metadata(self):
705
+ """Get metadata"""
706
+ if self.fs.exists(self.metadata_file):
707
+ with self.fs.open(self.metadata_file, "r") as f:
708
+ try:
709
+ metadata = json.load(f)
710
+ except json.decoder.JSONDecodeError:
711
+ metadata = {"chunks": []}
712
+ else:
713
+ metadata = {"chunks": []}
714
+ return metadata
715
+
716
+ def save_metadata(self):
717
+ """Save metadata"""
718
+ with self.fs.open(self.metadata_file, "w") as f:
719
+ json.dump(self.metadata, f)
720
+
721
+ def chunk_exists(self, variable, year, month, hours, pressure_level):
722
+ """Check if chunk exists"""
723
+ for chunk in self.metadata["chunks"]:
724
+ if (
725
+ chunk["variable"] == variable
726
+ and chunk["year"] == year
727
+ and chunk["month"] == month
728
+ and chunk["hours"] == hours
729
+ and chunk["pressure_level"] == pressure_level
730
+ ):
731
+ return True
732
+ return False
733
+
734
+ def download_chunk(
735
+ self,
736
+ variable: str,
737
+ year: int,
738
+ month: int,
739
+ hours: List[int],
740
+ pressure_level: int = None,
741
+ ):
742
+ """
743
+ Download ERA5 data for the specified variable, date range, hours, and pressure levels.
744
+
745
+ Parameters
746
+ ----------
747
+ variable : str
748
+ The ERA5 variable to download, e.g. 'tisr' for solar radiation or 'z' for geopotential.
749
+ year : int
750
+ The year to download.
751
+ month : int
752
+ The month to download.
753
+ hours : List[int]
754
+ A list of hours (0-23) for which data should be downloaded.
755
+ pressure_level : int, optional
756
+ A pressure level to include in the download, by default None. If None, the single-level data will be downloaded.
757
+
758
+ Returns
759
+ -------
760
+ xr.Dataset
761
+ An xarray Dataset containing the downloaded data.
762
+ """
763
+
764
+ with tempfile.TemporaryDirectory() as tmpdir:
765
+ # Get all days in the month
766
+ days_in_month = calendar.monthrange(year, month)[1]
767
+
768
+ # Make tmpfile to store the data
769
+ output_file = os.path.join(
770
+ tmpdir,
771
+ f"{variable}_{year}_{month:02d}_{str(hours)}_{str(pressure_level)}.nc",
772
+ )
773
+
774
+ # start the CDS API client (maybe need to move this outside the loop?)
775
+ c = cdsapi.Client(quiet=True)
776
+
777
+ # Setup the request parameters
778
+ request_params = {
779
+ "product_type": "reanalysis",
780
+ "variable": variable,
781
+ "year": str(year),
782
+ "month": str(month),
783
+ "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],
784
+ "time": [f"{hour:02d}:00" for hour in hours],
785
+ "format": "netcdf",
786
+ }
787
+ if pressure_level:
788
+ request_params["pressure_level"] = [str(pressure_level)]
789
+ dataset_name = "reanalysis-era5-pressure-levels"
790
+ else:
791
+ dataset_name = "reanalysis-era5-single-levels"
792
+
793
+ # Download the data
794
+ c.retrieve(
795
+ dataset_name,
796
+ request_params,
797
+ output_file,
798
+ )
799
+
800
+ # Open the downloaded data
801
+ ds = xr.open_dataset(output_file)
802
+ return ds
803
+
804
+ def variable_to_zarr_name(self, variable: str, pressure_level: int = None):
805
+ """convert variable to zarr name"""
806
+ # create zarr path for variable
807
+ zarr_path = f"{self.base_path}/{variable}"
808
+ if pressure_level:
809
+ zarr_path += f"_pressure_level_{pressure_level}"
810
+ zarr_path += ".zarr"
811
+ return zarr_path
812
+
813
+ def download_and_upload_chunk(
814
+ self,
815
+ variable: str,
816
+ year: int,
817
+ month: int,
818
+ hours: List[int],
819
+ pressure_level: int = None,
820
+ ):
821
+ """
822
+ Downloads a chunk of ERA5 data for a specific variable and date range, and uploads it to a Zarr array.
823
+ This downloads a 1-month chunk of data.
824
+
825
+ Parameters
826
+ ----------
827
+ variable : str
828
+ The variable to download.
829
+ year : int
830
+ The year to download.
831
+ month : int
832
+ The month to download.
833
+ hours : List[int]
834
+ A list of hours to download.
835
+ pressure_level : int, optional
836
+ Pressure levels to download, if applicable.
837
+ """
838
+
839
+ # Download the data
840
+ ds = self.download_chunk(variable, year, month, hours, pressure_level)
841
+ if "valid_time" in ds.dims:
842
+ ds = ds.rename({"valid_time": "time"})
843
+
844
+ # Create the Zarr path
845
+ zarr_path = self.variable_to_zarr_name(variable, pressure_level)
846
+
847
+ # Specify the chunking options
848
+ chunking = {"time": 1, "latitude": 721, "longitude": 1440}
849
+ if "level" in ds.dims:
850
+ chunking["level"] = 1
851
+
852
+ # Re-chunk the dataset
853
+ ds = ds.chunk(chunking)
854
+
855
+ # Check if the Zarr dataset exists
856
+ if self.fs.exists(zarr_path):
857
+ mode = "a"
858
+ append_dim = "time"
859
+ create = False
860
+ else:
861
+ mode = "w"
862
+ append_dim = None
863
+ create = True
864
+
865
+ # Upload the data to the Zarr dataset
866
+ mapper = self.fs.get_mapper(zarr_path, create=create)
867
+ ds.to_zarr(mapper, mode=mode, consolidated=True, append_dim=append_dim)
868
+
869
+ # Update the metadata
870
+ self.metadata["chunks"].append(
871
+ {
872
+ "variable": variable,
873
+ "year": year,
874
+ "month": month,
875
+ "hours": hours,
876
+ "pressure_level": pressure_level,
877
+ }
878
+ )
879
+ self.save_metadata()
880
+
881
+ def download(
882
+ self,
883
+ variables: List[Union[str, Tuple[str, int]]],
884
+ date_range: Tuple[datetime.date, datetime.date],
885
+ hours: List[int],
886
+ ):
887
+ """
888
+ Start the process of mirroring the specified ERA5 variables for the given date range and hours.
889
+
890
+ Parameters
891
+ ----------
892
+ variables : List[Union[str, Tuple[str, int]]]
893
+ A list of variables to mirror, where each element can either be a string (single-level variable)
894
+ or a tuple (variable with pressure level).
895
+ date_range : Tuple[datetime.date, datetime.date]
896
+ A tuple containing the start and end dates for the data to be mirrored. This will download and store every month in the range.
897
+ hours : List[int]
898
+ A list of hours for which to download the data.
899
+
900
+ Returns
901
+ -------
902
+ zarr_paths : List[str]
903
+ A list of Zarr paths for each of the variables.
904
+ """
905
+
906
+ start_date, end_date = date_range
907
+
908
+ # Reformat the variables list so all elements are tuples
909
+ reformated_variables = []
910
+ for variable in variables:
911
+ if isinstance(variable, str):
912
+ reformated_variables.append(tuple([variable, None]))
913
+ else:
914
+ reformated_variables.append(variable)
915
+
916
+ # Start Downloading
917
+ with ProgressBar():
918
+ # Round dates to months
919
+ current_date = start_date.replace(day=1)
920
+ end_date = end_date.replace(day=1)
921
+
922
+ while current_date <= end_date:
923
+ # Create a list of tasks to download the data
924
+ tasks = []
925
+ for variable, pressure_level in reformated_variables:
926
+ if not self.chunk_exists(
927
+ variable,
928
+ current_date.year,
929
+ current_date.month,
930
+ hours,
931
+ pressure_level,
932
+ ):
933
+ task = dask.delayed(self.download_and_upload_chunk)(
934
+ variable,
935
+ current_date.year,
936
+ current_date.month,
937
+ hours,
938
+ pressure_level,
939
+ )
940
+ tasks.append(task)
941
+ else:
942
+ print(
943
+ f"Chunk for {variable} {pressure_level} {current_date.year}-{current_date.month} already exists. Skipping."
944
+ )
945
+
946
+ # Execute the tasks with Dask
947
+ print(f"Downloading data for {current_date.year}-{current_date.month}")
948
+ if tasks:
949
+ dask.compute(*tasks)
950
+
951
+ # Update the metadata
952
+ self.save_metadata()
953
+
954
+ # Update the current date
955
+ days_in_month = calendar.monthrange(
956
+ year=current_date.year, month=current_date.month
957
+ )[1]
958
+ current_date += datetime.timedelta(days=days_in_month)
959
+
960
+ # Return the Zarr paths
961
+ zarr_paths = []
962
+ for variable, pressure_level in reformated_variables:
963
+ zarr_path = self.variable_to_zarr_name(variable, pressure_level)
964
+ zarr_paths.append(zarr_path)
965
+
966
+ # Check that Zarr arrays have correct dt for time dimension
967
+ for zarr_path in zarr_paths:
968
+ ds = xr.open_zarr(zarr_path)
969
+ time_stamps = ds.time.values
970
+ dt = time_stamps[1:] - time_stamps[:-1]
971
+ assert np.all(
972
+ dt == dt[0]
973
+ ), f"Zarr array {zarr_path} has incorrect dt for time dimension. An error may have occurred during download. Please delete the Zarr array and try again."
974
+
975
+ return zarr_paths
976
+ '''
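+ # A usage sketch for the ERA5Mirror class kept in the string literal above (assumes working
+ # CDS credentials in ~/.cdsapirc; paths, variables and dates are illustrative):
+ #   import datetime
+ #   mirror = ERA5Mirror(base_path="./era5_zarr")
+ #   zarr_paths = mirror.download(
+ #       variables=["2m_temperature", ("z", 500)],
+ #       date_range=(datetime.date(2020, 1, 1), datetime.date(2020, 3, 31)),
+ #       hours=[0, 6, 12, 18],
+ #   )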