climdata 0.1.2__py2.py3-none-any.whl → 0.1.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of climdata might be problematic.

@@ -1,9 +1,5 @@
  import pandas as pd
  import numpy as np
- from wetterdienst import Settings
- from wetterdienst.provider.dwd.observation import DwdObservationRequest
- import geemap
- import ee
  import geopandas as gpd
  from omegaconf import DictConfig
  import os
@@ -15,8 +11,6 @@ from datetime import datetime, timedelta
  import xarray as xr
  import hydra

- import pint
- import pint_pandas

  from google.oauth2 import service_account
  from googleapiclient.discovery import build
@@ -34,257 +28,6 @@ import concurrent.futures

  warnings.filterwarnings("ignore", category=Warning)

- def fetch_dwd_loc(cfg: DictConfig):
-
-     param_mapping = cfg.mappings
-     provider = cfg.dataset.lower()
-     parameter_key = cfg.weather.parameter
-     # Validate provider and parameter
-
-     if provider not in param_mapping:
-         raise ValueError(f"Provider '{provider}' not found in parameter map.")
-     if parameter_key not in param_mapping[provider]['variables']:
-         raise ValueError(f"Parameter '{parameter_key}' not defined for provider '{provider}'.")
-
-     param_info = param_mapping[provider]['variables'][parameter_key]
-     resolution = param_info["resolution"]
-     dataset = param_info["dataset"]
-     variable_name = param_info["name"]
-     units = param_info.get("unit", None)
-
-     lat = cfg.location.lat
-     lon = cfg.location.lon
-     distance = cfg.location.buffer_km
-     start_date = cfg.time_range.start_date
-     end_date = cfg.time_range.end_date
-     output_file = cfg.output.filename
-
-     settings = Settings(
-         ts_shape="long",
-         ts_humanize=True,
-         # ts_si_units=False
-     )
-
-     request = DwdObservationRequest(
-         parameters=(resolution, dataset, variable_name),
-         start_date=start_date,
-         end_date=end_date,
-         settings=settings
-     ).filter_by_distance(
-         latlon=(lat, lon),
-         distance=distance,
-         unit="km"
-     )
-
-     df = request.values.all().df.to_pandas()
-
-     df['date'] = pd.to_datetime(df['date'])
-     df = df.groupby(['date']).agg({
-         'value': 'mean',
-         'station_id': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-         'resolution': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-         'dataset': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-         'parameter': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-         'quality': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-     }).reset_index()
-
-     df.set_index("date", inplace=True)
-     df.reset_index(inplace=True)
-
-     # Standardize column names
-     df = df.rename(columns={
-         "date": "time",
-         "value": "value",
-         "station_id": "frequent_station",
-
-     })
-     df["variable"] = parameter_key
-     df["latitude"] = lat
-     df["longitude"] = lon
-     df['source'] = 'DWD'
-     df['units'] = units
-     df = df[["latitude", "longitude", "time", "source", "variable", "value","units"]]
-
-     out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-     os.makedirs(out_dir, exist_ok=True)
-     out_path = os.path.join(out_dir, cfg.output.filename)
-
-     df.to_csv(out_path, index=False)
-     print(f"✅ Saved time series to: {out_path}")
-     return df
- def fetch_ee_loc(cfg: DictConfig):
-     ee.Initialize(project='earthengine-462007')
-
-     provider = cfg.dataset.lower()
-     variable_name = cfg.weather.parameter
-     ee_image_collection = cfg.mappings[provider].params.collection
-
-     # Prepare the image collection
-     sd = cfg.time_range.start_date
-     ed = cfg.time_range.end_date
-     var_name = cfg.mappings[provider].variables[variable_name].name
-     units = cfg.mappings[provider].variables[variable_name].unit
-     if provider=='gddp':
-         model = cfg.mappings[provider].params.model
-         scenario = cfg.mappings[provider].params.scenario
-         dataset = ee.ImageCollection(ee_image_collection)\
-             .filter(ee.Filter.date(sd, ed))\
-             .filter(ee.Filter.eq('model', model))\
-             .filter(ee.Filter.eq('scenario', scenario))
-     elif provider=='era5-land':
-         dataset = ee.ImageCollection(ee_image_collection)\
-             .filter(ee.Filter.date(sd, ed))
-     else:
-         raise ValueError(f"Provider '{provider}' is not supported for Earth Engine data fetching.")
-     image_var = dataset.select(var_name)
-
-
-     lat = cfg.location.lat
-     lon = cfg.location.lon
-     # identifier = cfg.location.id
-     out_dir = cfg.output.out_dir
-     # buffer = cfg.location.buffer_km
-     buffer = None
-     scale = cfg.mappings[provider].params.scale
-     # retry_delay = cfg.download.retry_delay
-
-     os.makedirs(out_dir, exist_ok=True)
-
-     df = pd.DataFrame([{ "lat": lat, "lon": lon, "id": 0}])
-     gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326")
-
-     try:
-         gdf_ee = geemap.gdf_to_ee(gdf)
-
-         # if buffer:
-         #     pixel_values = gdf_ee.map(
-         #         lambda f: f.set('ts', image_var.getRegion(
-         #             f.buffer(buffer*1e3).bounds().geometry(), scale))
-         #     )
-         # else:
-         pixel_values = gdf_ee.map(
-             lambda f: f.set('ts', image_var.getRegion(
-                 f.geometry(), scale))
-         )
-
-         pixel_values_info = pixel_values.getInfo()
-
-         for feature in pixel_values_info['features']:
-             data = feature['properties']['ts']
-             data_id = feature['properties']['id']
-
-             if data:
-                 columns = data[0]
-                 rows = data[1:]
-                 df_out = pd.DataFrame(rows, columns=columns)
-
-                 out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-                 os.makedirs(out_dir, exist_ok=True)
-                 out_path = os.path.join(out_dir, cfg.output.filename)
-
-                 df_out["variable"] = variable_name
-                 df_out["latitude"] = lat
-                 df_out["longitude"] = lon
-                 df_out['source'] = provider.upper()
-                 df_out['units'] = units
-                 df_out['time'] = pd.to_datetime(df_out['time'], unit='ms')
-                 df_out.rename(columns={variable_name: 'value'}, inplace=True)
-                 df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value","units"]]
-
-                 df_out.to_csv(out_path, index=False)
-                 print(f"[\u2713] Saved: {out_path}")
-
-                 return df_out
-             else:
-                 print(f"[!] No data for ID {data_id}")
-
-     except Exception as e:
-         print(f"[\u2717] Error: {e}")
-         # time.sleep(retry_delay)
-         raise RuntimeError("Failed to download data.")
-
- def fetch_ee_loc_mod(cfg: DictConfig):
-     # Initialize Earth Engine
-     ee.Initialize(project='earthengine-462007')
-
-     provider = cfg.dataset.lower()
-     variable_name = cfg.weather.parameter
-     ee_image_collection = cfg.mappings[provider].params.collection
-
-     sd = cfg.time_range.start_date
-     ed = cfg.time_range.end_date
-     var_name = cfg.mappings[provider].variables[variable_name].name
-     units = cfg.mappings[provider].variables[variable_name].unit
-     scale = cfg.mappings[provider].params.scale
-     out_dir = cfg.output.out_dir
-
-     lat = cfg.location.lat
-     lon = cfg.location.lon
-
-     # Handle model/scenario if needed
-     if provider == 'gddp':
-         model = cfg.mappings[provider].params.model
-         scenario = cfg.mappings[provider].params.scenario
-         dataset = ee.ImageCollection(ee_image_collection) \
-             .filter(ee.Filter.date(sd, ed)) \
-             .filter(ee.Filter.eq('model', model)) \
-             .filter(ee.Filter.eq('scenario', scenario))
-     elif provider == 'era5-land':
-         dataset = ee.ImageCollection(ee_image_collection) \
-             .filter(ee.Filter.date(sd, ed))
-     else:
-         raise ValueError(f"Provider '{provider}' is not supported.")
-
-     image_var = dataset.select(var_name)
-     point = ee.Geometry.Point(lon, lat)
-
-     os.makedirs(out_dir, exist_ok=True)
-     results = []
-
-     print(f"[i] Fetching time series for point: ({lat}, {lon})")
-
-     # Use a client-side list of images
-     image_list = image_var.toList(image_var.size())
-     n_images = image_var.size().getInfo()
-
-     for i in tqdm(range(n_images), desc="Processing images"):
-         try:
-             img = ee.Image(image_list.get(i))
-             date = img.date().format('YYYY-MM-dd').getInfo()
-
-             value = img.reduceRegion(
-                 reducer=ee.Reducer.first(),
-                 geometry=point,
-                 scale=scale,
-                 bestEffort=True
-             ).get(var_name)
-
-             value = value.getInfo() if value else None
-             results.append({"date": date, var_name: value})
-         except Exception as e:
-             print(f"[!] Skipping image {i} due to error: {e}")
-             continue
-
-     df_out = pd.DataFrame(results)
-     out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-     os.makedirs(out_dir, exist_ok=True)
-     out_path = os.path.join(out_dir, cfg.output.filename)
-
-     df_out["variable"] = variable_name
-     df_out["latitude"] = lat
-     df_out["longitude"] = lon
-     df_out['units'] = units
-     df_out['source'] = provider.upper()
-     df_out.rename(columns={var_name: 'value', "date": 'time'}, inplace=True)
-     df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value",'units']]
-
-     # ureg = pint.UnitRegistry()
-     # pint_pandas.PintType.ureg = ureg
-     # df_out['temperature'] = df['temperature'].astype('pint[C]')
-
-     df_out.to_csv(out_path, index=False)
-     print(f"[✓] Saved timeseries to: {out_path}")
-     return df_out
  def list_drive_files(folder_id, service):
      """
      List all files in a Google Drive folder, handling pagination.
@@ -320,79 +63,12 @@ def download_drive_file(file_id, local_path, service):
      while not done:
          status, done = downloader.next_chunk()
          print(f" → Download {int(status.progress() * 100)}% complete")
- def fetch_MSWX(var_cfg):
-     param_mapping = var_cfg.mappings
-     provider = var_cfg.dataset.lower()
-     parameter_key = var_cfg.weather.parameter
-
-     param_info = param_mapping[provider]['variables'][parameter_key]
-     folder_id = param_info["folder_id"]
-
-     start_date = var_cfg.time_range.start_date
-     end_date = var_cfg.time_range.end_date
-
-     # === 1) Generate expected filenames ===
-     start = datetime.fromisoformat(start_date)
-     end = datetime.fromisoformat(end_date)
-
-     expected_files = []
-     current = start
-     while current <= end:
-         doy = current.timetuple().tm_yday
-         basename = f"{current.year}{doy:03d}.nc"
-         expected_files.append(basename)
-         current += timedelta(days=1)
-
-     output_dir = var_cfg.data_dir
-     local_files = []
-     missing_files = []
-
-     for basename in expected_files:
-         local_path = os.path.join(output_dir, provider, parameter_key, basename)
-         if os.path.exists(local_path):
-             local_files.append(basename)
-         else:
-             missing_files.append(basename)
-
-     if not missing_files:
-         print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
-         return local_files
-
-     print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
-
-     # === 2) Connect to Drive ===
-     SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
-     creds = service_account.Credentials.from_service_account_file(
-         param_mapping[provider].params.google_service_account, scopes=SCOPES
-     )
-     service = build('drive', 'v3', credentials=creds)

-     # === 3) List all Drive files ===
-     drive_files = list_drive_files(folder_id, service)
-     valid_filenames = set(missing_files)
-
-     files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
-
-     if not files_to_download:
-         print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
-         return local_files
-
-     # === 4) Download missing ===
-     for file in files_to_download:
-         filename = file['name']
-         local_path = os.path.join(output_dir, provider, parameter_key, filename)
-         print(f"⬇️ Downloading {filename} ...")
-         download_drive_file(file['id'], local_path, service)
-         local_files.append(filename)
-
-     return local_files
-
-
- def fetch_dwd(var_cfg):
+ def fetch_dwd(var_cfg,var):
      """Download HYRAS data for one variable and a list of years."""
      param_mapping = var_cfg.mappings
      provider = var_cfg.dataset.lower()
-     parameter_key = var_cfg.weather.parameter
+     parameter_key = var
      # Validate provider and parameter

      param_info = param_mapping[provider]['variables'][parameter_key]
@@ -524,452 +200,42 @@ def extract_ts_dwd(cfg: DictConfig):
      ts_all.to_csv(out_path, index=False)
      print(f"✅ Saved time series to: {out_path}")

-     return ts_all
- def extract_ts_MSWX(cfg: DictConfig):
-     parameter = cfg.weather.parameter
-     param_mapping = cfg.mappings
-     provider = cfg.dataset.lower()
-     parameter_key = cfg.weather.parameter
-     # Validate provider and parameter
-
-     param_info = param_mapping[provider]['variables'][parameter_key]
-
-     base_dir = cfg.data_dir
-
-     target_lat = cfg.location.lat
-     target_lon = cfg.location.lon
-
-     start_date = pd.to_datetime(cfg.time_range.start_date)
-     end_date = pd.to_datetime(cfg.time_range.end_date)
-
-     # === 1) Rebuild exact basenames ===
-     current = start_date
-     basenames = []
-     while current <= end_date:
-         doy = current.timetuple().tm_yday
-         basename = f"{current.year}{doy:03d}.nc"
-         basenames.append(basename)
-         current += timedelta(days=1)
-
-     # === 2) Process only those files ===
-     ts_list = []
-     missing = []
-
-     for basename in basenames:
-         file_path = os.path.join(base_dir, provider, parameter, basename)
-
-         if not os.path.exists(file_path):
-             missing.append(basename)
-             continue
-
-         print(f"📂 Opening: {file_path}")
-         ds = xr.open_dataset(file_path)
-
-         time_name = [x for x in ds.coords if "time" in x.lower()][0]
-         data_var = [v for v in ds.data_vars][0]
-
-         ts = ds[data_var].sel(
-             lat=target_lat,
-             lon=target_lon,
-             method='nearest'
-         )
-
-         df = ts.to_dataframe().reset_index()[[time_name, data_var]]
-         ts_list.append(df)
-
-     if missing:
-         print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
-         for m in missing:
-             print(f" - {m}")
-
-     if not ts_list:
-         raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
-
-     # === 3) Combine and slice (for safety) ===
-     ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
-
-     ts_all[time_name] = pd.to_datetime(ts_all[time_name])
-     ts_all = ts_all[
-         (ts_all[time_name] >= start_date) &
-         (ts_all[time_name] <= end_date)
-     ].reset_index(drop=True)
-
-     out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-     os.makedirs(out_dir, exist_ok=True)
-     out_path = os.path.join(out_dir, cfg.output.filename)
-
-     ts_all["variable"] = param_info['name']
-     ts_all["latitude"] = target_lat
-     ts_all["longitude"] = target_lon
-     ts_all['source'] = provider.upper()
-     ts_all['units'] = ts.attrs['units']
-     ts_all.rename(columns={param_info['name']: 'value'}, inplace=True)
-     ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value",'units']]
-
-     ts_all.to_csv(out_path, index=False)
-     print(f"✅ Saved MSWX time series to: {out_path}")
-
      return ts_all

  import os
  from omegaconf import DictConfig

- def build_output_filename(cfg: DictConfig) -> str:
-     """Generate full output file path from pattern and config."""
-     provider = cfg.dataset.lower()
-     parameter = cfg.weather.parameter
-     lat = cfg.location.lat
-     lon = cfg.location.lon
-     start = cfg.time_range.start_date
-     end = cfg.time_range.end_date
-
-     pattern = cfg.output.get("filename", "{provider}_{parameter}_{start}_{end}.csv")
-     filename = pattern.format(
-         provider=provider,
-         parameter=parameter,
-         lat=lat,
-         lon=lon,
-         start=start,
-         end=end
-     )
-
-     out_dir = cfg.output.out_dir
-     fmt = cfg.output.fmt # format is a reserved word in Python, so use 'fmt'
-
-     # return os.path.join(out_dir, fmt, filename)
-     return filename
-
- # SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
- # SPDX-FileCopyrightText: All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- '''
- import os
- import tempfile
- import cdsapi
- import xarray as xr
- import datetime
- import json
- import dask
- import calendar
- from dask.diagnostics import ProgressBar
- from typing import List, Tuple, Dict, Union
- import urllib3
- import logging
- import numpy as np
- import fsspec
-
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-
- class ERA5Mirror:
+ def get_output_filename(cfg, output_type="nc", lat=None, lon=None):
      """
-     A class to manage downloading ERA5 datasets. The datasets are downloaded from the Copernicus Climate Data Store (CDS) and stored in Zarr format.
-
-     Attributes
-     ----------
-     base_path : Path
-         The path to the Zarr dataset.
-     fs : fsspec.AbstractFileSystem
-         The filesystem to use for the Zarr dataset. If None, the local filesystem will be used.
+     Generate output filename based on config, output type, and extraction mode.
+     output_type: "nc", "csv", or "zarr"
      """
-
-     def __init__(self, base_path: str, fs: fsspec.AbstractFileSystem = None):
-         # Get parameters
-         self.base_path = base_path
-         if fs is None:
-             fs = fsspec.filesystem("file")
-         self.fs = fs
-
-         # Create the base path if it doesn't exist
-         if not self.fs.exists(self.base_path):
-             self.fs.makedirs(self.base_path)
-
-         # Create metadata that will be used to track which chunks have been downloaded
-         self.metadata_file = os.path.join(self.base_path, "metadata.json")
-         self.metadata = self.get_metadata()
-
-     def get_metadata(self):
-         """Get metadata"""
-         if self.fs.exists(self.metadata_file):
-             with self.fs.open(self.metadata_file, "r") as f:
-                 try:
-                     metadata = json.load(f)
-                 except json.decoder.JSONDecodeError:
-                     metadata = {"chunks": []}
-         else:
-             metadata = {"chunks": []}
-         return metadata
-
-     def save_metadata(self):
-         """Save metadata"""
-         with self.fs.open(self.metadata_file, "w") as f:
-             json.dump(self.metadata, f)
-
-     def chunk_exists(self, variable, year, month, hours, pressure_level):
-         """Check if chunk exists"""
-         for chunk in self.metadata["chunks"]:
-             if (
-                 chunk["variable"] == variable
-                 and chunk["year"] == year
-                 and chunk["month"] == month
-                 and chunk["hours"] == hours
-                 and chunk["pressure_level"] == pressure_level
-             ):
-                 return True
-         return False
-
-     def download_chunk(
-         self,
-         variable: str,
-         year: int,
-         month: int,
-         hours: List[int],
-         pressure_level: int = None,
-     ):
-         """
-         Download ERA5 data for the specified variable, date range, hours, and pressure levels.
-
-         Parameters
-         ----------
-         variable : str
-             The ERA5 variable to download, e.g. 'tisr' for solar radiation or 'z' for geopotential.
-         year : int
-             The year to download.
-         month : int
-             The month to download.
-         hours : List[int]
-             A list of hours (0-23) for which data should be downloaded.
-         pressure_level : int, optional
-             A pressure level to include in the download, by default None. If None, the single-level data will be downloaded.
-
-         Returns
-         -------
-         xr.Dataset
-             An xarray Dataset containing the downloaded data.
-         """
-
-         with tempfile.TemporaryDirectory() as tmpdir:
-             # Get all days in the month
-             days_in_month = calendar.monthrange(year, month)[1]
-
-             # Make tmpfile to store the data
-             output_file = os.path.join(
-                 tmpdir,
-                 f"{variable}_{year}_{month:02d}_{str(hours)}_{str(pressure_level)}.nc",
-             )
-
-             # start the CDS API client (maybe need to move this outside the loop?)
-             c = cdsapi.Client(quiet=True)
-
-             # Setup the request parameters
-             request_params = {
-                 "product_type": "reanalysis",
-                 "variable": variable,
-                 "year": str(year),
-                 "month": str(month),
-                 "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],
-                 "time": [f"{hour:02d}:00" for hour in hours],
-                 "format": "netcdf",
-             }
-             if pressure_level:
-                 request_params["pressure_level"] = [str(pressure_level)]
-                 dataset_name = "reanalysis-era5-pressure-levels"
-             else:
-                 dataset_name = "reanalysis-era5-single-levels"
-
-             # Download the data
-             c.retrieve(
-                 dataset_name,
-                 request_params,
-                 output_file,
-             )
-
-             # Open the downloaded data
-             ds = xr.open_dataset(output_file)
-             return ds
-
-     def variable_to_zarr_name(self, variable: str, pressure_level: int = None):
-         """convert variable to zarr name"""
-         # create zarr path for variable
-         zarr_path = f"{self.base_path}/{variable}"
-         if pressure_level:
-             zarr_path += f"_pressure_level_{pressure_level}"
-         zarr_path += ".zarr"
-         return zarr_path
-
-     def download_and_upload_chunk(
-         self,
-         variable: str,
-         year: int,
-         month: int,
-         hours: List[int],
-         pressure_level: int = None,
-     ):
-         """
-         Downloads a chunk of ERA5 data for a specific variable and date range, and uploads it to a Zarr array.
-         This downloads a 1-month chunk of data.
-
-         Parameters
-         ----------
-         variable : str
-             The variable to download.
-         year : int
-             The year to download.
-         month : int
-             The month to download.
-         hours : List[int]
-             A list of hours to download.
-         pressure_level : int, optional
-             Pressure levels to download, if applicable.
-         """
-
-         # Download the data
-         ds = self.download_chunk(variable, year, month, hours, pressure_level)
-         if "valid_time" in ds.dims:
-             ds = ds.rename({"valid_time": "time"})
-
-         # Create the Zarr path
-         zarr_path = self.variable_to_zarr_name(variable, pressure_level)
-
-         # Specify the chunking options
-         chunking = {"time": 1, "latitude": 721, "longitude": 1440}
-         if "level" in ds.dims:
-             chunking["level"] = 1
-
-         # Re-chunk the dataset
-         ds = ds.chunk(chunking)
-
-         # Check if the Zarr dataset exists
-         if self.fs.exists(zarr_path):
-             mode = "a"
-             append_dim = "time"
-             create = False
-         else:
-             mode = "w"
-             append_dim = None
-             create = True
-
-         # Upload the data to the Zarr dataset
-         mapper = self.fs.get_mapper(zarr_path, create=create)
-         ds.to_zarr(mapper, mode=mode, consolidated=True, append_dim=append_dim)
-
-         # Update the metadata
-         self.metadata["chunks"].append(
-             {
-                 "variable": variable,
-                 "year": year,
-                 "month": month,
-                 "hours": hours,
-                 "pressure_level": pressure_level,
-             }
+     if output_type == "csv":
+         template = cfg.output.filename_csv
+     elif output_type == "zarr":
+         template = cfg.output.filename_zarr
+     else:
+         template = cfg.output.filename_nc
+
+     # If lat/lon are provided, use point template
+     if lat is not None and lon is not None:
+         filename = template.format(
+             provider=cfg.dataset,
+             parameter="surface",
+             lat=f"{lat}",
+             lon=f"{lon}",
+             start=cfg.time_range.start_date.replace("-", ""),
+             end=cfg.time_range.end_date.replace("-", ""),
+         )
+     else:
+         # Use region bounds
+         region_bounds = cfg.bounds[cfg.region]
+         filename = template.format(
+             provider=cfg.dataset,
+             parameter="surface",
+             lat_range=f"{region_bounds['lat_min']}-{region_bounds['lat_max']}",
+             lon_range=f"{region_bounds['lon_min']}-{region_bounds['lon_max']}",
+             start=cfg.time_range.start_date.replace("-", ""),
+             end=cfg.time_range.end_date.replace("-", ""),
          )
-         self.save_metadata()
-
-     def download(
-         self,
-         variables: List[Union[str, Tuple[str, int]]],
-         date_range: Tuple[datetime.date, datetime.date],
-         hours: List[int],
-     ):
-         """
-         Start the process of mirroring the specified ERA5 variables for the given date range and hours.
-
-         Parameters
-         ----------
-         variables : List[Union[str, Tuple[str, List[int]]]]
-             A list of variables to mirror, where each element can either be a string (single-level variable)
-             or a tuple (variable with pressure level).
-         date_range : Tuple[datetime.date, datetime.date]
-             A tuple containing the start and end dates for the data to be mirrored. This will download and store every month in the range.
-         hours : List[int]
-             A list of hours for which to download the data.
-
-         Returns
-         -------
-         zarr_paths : List[str]
-             A list of Zarr paths for each of the variables.
-         """
-
-         start_date, end_date = date_range
-
-         # Reformat the variables list so all elements are tuples
-         reformated_variables = []
-         for variable in variables:
-             if isinstance(variable, str):
-                 reformated_variables.append(tuple([variable, None]))
-             else:
-                 reformated_variables.append(variable)
-
-         # Start Downloading
-         with ProgressBar():
-             # Round dates to months
-             current_date = start_date.replace(day=1)
-             end_date = end_date.replace(day=1)
-
-             while current_date <= end_date:
-                 # Create a list of tasks to download the data
-                 tasks = []
-                 for variable, pressure_level in reformated_variables:
-                     if not self.chunk_exists(
-                         variable,
-                         current_date.year,
-                         current_date.month,
-                         hours,
-                         pressure_level,
-                     ):
-                         task = dask.delayed(self.download_and_upload_chunk)(
-                             variable,
-                             current_date.year,
-                             current_date.month,
-                             hours,
-                             pressure_level,
-                         )
-                         tasks.append(task)
-                     else:
-                         print(
-                             f"Chunk for {variable} {pressure_level} {current_date.year}-{current_date.month} already exists. Skipping."
-                         )
-
-                 # Execute the tasks with Dask
-                 print(f"Downloading data for {current_date.year}-{current_date.month}")
-                 if tasks:
-                     dask.compute(*tasks)
-
-                 # Update the metadata
-                 self.save_metadata()
-
-                 # Update the current date
-                 days_in_month = calendar.monthrange(
-                     year=current_date.year, month=current_date.month
-                 )[1]
-                 current_date += datetime.timedelta(days=days_in_month)
-
-         # Return the Zarr paths
-         zarr_paths = []
-         for variable, pressure_level in reformated_variables:
-             zarr_path = self.variable_to_zarr_name(variable, pressure_level)
-             zarr_paths.append(zarr_path)
-
-         # Check that Zarr arrays have correct dt for time dimension
-         for zarr_path in zarr_paths:
-             ds = xr.open_zarr(zarr_path)
-             time_stamps = ds.time.values
-             dt = time_stamps[1:] - time_stamps[:-1]
-             assert np.all(
-                 dt == dt[0]
-             ), f"Zarr array {zarr_path} has incorrect dt for time dimension. An error may have occurred during download. Please delete the Zarr array and try again."
-
-         return zarr_paths
- '''
+     return filename