climdata 0.1.2__py2.py3-none-any.whl → 0.1.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of climdata might be problematic.
- climdata/__init__.py +2 -1
- climdata/conf/config.yaml +17 -20
- climdata/conf/mappings/parameters.yaml +2 -2
- climdata/datasets/CMIPCloud.py +54 -32
- climdata/datasets/DWD.py +12 -20
- climdata/datasets/HYRAS.py +133 -0
- climdata/datasets/MSWX.py +107 -190
- climdata/utils/utils_download.py +33 -767
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/METADATA +1 -1
- climdata-0.1.3.dist-info/RECORD +19 -0
- climdata/__main__.py +0 -5
- climdata/main.py +0 -56
- climdata-0.1.2.dist-info/RECORD +0 -20
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/WHEEL +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/entry_points.txt +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/top_level.txt +0 -0
climdata/utils/utils_download.py
CHANGED
@@ -1,9 +1,5 @@
 import pandas as pd
 import numpy as np
-from wetterdienst import Settings
-from wetterdienst.provider.dwd.observation import DwdObservationRequest
-import geemap
-import ee
 import geopandas as gpd
 from omegaconf import DictConfig
 import os
@@ -15,8 +11,6 @@ from datetime import datetime, timedelta
 import xarray as xr
 import hydra
 
-import pint
-import pint_pandas
 
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
@@ -34,257 +28,6 @@ import concurrent.futures
 
 warnings.filterwarnings("ignore", category=Warning)
 
-def fetch_dwd_loc(cfg: DictConfig):
-
-    param_mapping = cfg.mappings
-    provider = cfg.dataset.lower()
-    parameter_key = cfg.weather.parameter
-    # Validate provider and parameter
-
-    if provider not in param_mapping:
-        raise ValueError(f"Provider '{provider}' not found in parameter map.")
-    if parameter_key not in param_mapping[provider]['variables']:
-        raise ValueError(f"Parameter '{parameter_key}' not defined for provider '{provider}'.")
-
-    param_info = param_mapping[provider]['variables'][parameter_key]
-    resolution = param_info["resolution"]
-    dataset = param_info["dataset"]
-    variable_name = param_info["name"]
-    units = param_info.get("unit", None)
-
-    lat = cfg.location.lat
-    lon = cfg.location.lon
-    distance = cfg.location.buffer_km
-    start_date = cfg.time_range.start_date
-    end_date = cfg.time_range.end_date
-    output_file = cfg.output.filename
-
-    settings = Settings(
-        ts_shape="long",
-        ts_humanize=True,
-        # ts_si_units=False
-    )
-
-    request = DwdObservationRequest(
-        parameters=(resolution, dataset, variable_name),
-        start_date=start_date,
-        end_date=end_date,
-        settings=settings
-    ).filter_by_distance(
-        latlon=(lat, lon),
-        distance=distance,
-        unit="km"
-    )
-
-    df = request.values.all().df.to_pandas()
-
-    df['date'] = pd.to_datetime(df['date'])
-    df = df.groupby(['date']).agg({
-        'value': 'mean',
-        'station_id': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-        'resolution': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-        'dataset': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-        'parameter': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-        'quality': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
-    }).reset_index()
-
-    df.set_index("date", inplace=True)
-    df.reset_index(inplace=True)
-
-    # Standardize column names
-    df = df.rename(columns={
-        "date": "time",
-        "value": "value",
-        "station_id": "frequent_station",
-
-    })
-    df["variable"] = parameter_key
-    df["latitude"] = lat
-    df["longitude"] = lon
-    df['source'] = 'DWD'
-    df['units'] = units
-    df = df[["latitude", "longitude", "time", "source", "variable", "value","units"]]
-
-    out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-    os.makedirs(out_dir, exist_ok=True)
-    out_path = os.path.join(out_dir, cfg.output.filename)
-
-    df.to_csv(out_path, index=False)
-    print(f"✅ Saved time series to: {out_path}")
-    return df
-def fetch_ee_loc(cfg: DictConfig):
-    ee.Initialize(project='earthengine-462007')
-
-    provider = cfg.dataset.lower()
-    variable_name = cfg.weather.parameter
-    ee_image_collection = cfg.mappings[provider].params.collection
-
-    # Prepare the image collection
-    sd = cfg.time_range.start_date
-    ed = cfg.time_range.end_date
-    var_name = cfg.mappings[provider].variables[variable_name].name
-    units = cfg.mappings[provider].variables[variable_name].unit
-    if provider=='gddp':
-        model = cfg.mappings[provider].params.model
-        scenario = cfg.mappings[provider].params.scenario
-        dataset = ee.ImageCollection(ee_image_collection)\
-            .filter(ee.Filter.date(sd, ed))\
-            .filter(ee.Filter.eq('model', model))\
-            .filter(ee.Filter.eq('scenario', scenario))
-    elif provider=='era5-land':
-        dataset = ee.ImageCollection(ee_image_collection)\
-            .filter(ee.Filter.date(sd, ed))
-    else:
-        raise ValueError(f"Provider '{provider}' is not supported for Earth Engine data fetching.")
-    image_var = dataset.select(var_name)
-
-
-    lat = cfg.location.lat
-    lon = cfg.location.lon
-    # identifier = cfg.location.id
-    out_dir = cfg.output.out_dir
-    # buffer = cfg.location.buffer_km
-    buffer = None
-    scale = cfg.mappings[provider].params.scale
-    # retry_delay = cfg.download.retry_delay
-
-    os.makedirs(out_dir, exist_ok=True)
-
-    df = pd.DataFrame([{ "lat": lat, "lon": lon, "id": 0}])
-    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326")
-
-    try:
-        gdf_ee = geemap.gdf_to_ee(gdf)
-
-        # if buffer:
-        #     pixel_values = gdf_ee.map(
-        #         lambda f: f.set('ts', image_var.getRegion(
-        #             f.buffer(buffer*1e3).bounds().geometry(), scale))
-        #     )
-        # else:
-        pixel_values = gdf_ee.map(
-            lambda f: f.set('ts', image_var.getRegion(
-                f.geometry(), scale))
-        )
-
-        pixel_values_info = pixel_values.getInfo()
-
-        for feature in pixel_values_info['features']:
-            data = feature['properties']['ts']
-            data_id = feature['properties']['id']
-
-            if data:
-                columns = data[0]
-                rows = data[1:]
-                df_out = pd.DataFrame(rows, columns=columns)
-
-                out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-                os.makedirs(out_dir, exist_ok=True)
-                out_path = os.path.join(out_dir, cfg.output.filename)
-
-                df_out["variable"] = variable_name
-                df_out["latitude"] = lat
-                df_out["longitude"] = lon
-                df_out['source'] = provider.upper()
-                df_out['units'] = units
-                df_out['time'] = pd.to_datetime(df_out['time'], unit='ms')
-                df_out.rename(columns={variable_name: 'value'}, inplace=True)
-                df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value","units"]]
-
-                df_out.to_csv(out_path, index=False)
-                print(f"[\u2713] Saved: {out_path}")
-
-                return df_out
-            else:
-                print(f"[!] No data for ID {data_id}")
-
-    except Exception as e:
-        print(f"[\u2717] Error: {e}")
-        # time.sleep(retry_delay)
-        raise RuntimeError("Failed to download data.")
-
-def fetch_ee_loc_mod(cfg: DictConfig):
-    # Initialize Earth Engine
-    ee.Initialize(project='earthengine-462007')
-
-    provider = cfg.dataset.lower()
-    variable_name = cfg.weather.parameter
-    ee_image_collection = cfg.mappings[provider].params.collection
-
-    sd = cfg.time_range.start_date
-    ed = cfg.time_range.end_date
-    var_name = cfg.mappings[provider].variables[variable_name].name
-    units = cfg.mappings[provider].variables[variable_name].unit
-    scale = cfg.mappings[provider].params.scale
-    out_dir = cfg.output.out_dir
-
-    lat = cfg.location.lat
-    lon = cfg.location.lon
-
-    # Handle model/scenario if needed
-    if provider == 'gddp':
-        model = cfg.mappings[provider].params.model
-        scenario = cfg.mappings[provider].params.scenario
-        dataset = ee.ImageCollection(ee_image_collection) \
-            .filter(ee.Filter.date(sd, ed)) \
-            .filter(ee.Filter.eq('model', model)) \
-            .filter(ee.Filter.eq('scenario', scenario))
-    elif provider == 'era5-land':
-        dataset = ee.ImageCollection(ee_image_collection) \
-            .filter(ee.Filter.date(sd, ed))
-    else:
-        raise ValueError(f"Provider '{provider}' is not supported.")
-
-    image_var = dataset.select(var_name)
-    point = ee.Geometry.Point(lon, lat)
-
-    os.makedirs(out_dir, exist_ok=True)
-    results = []
-
-    print(f"[i] Fetching time series for point: ({lat}, {lon})")
-
-    # Use a client-side list of images
-    image_list = image_var.toList(image_var.size())
-    n_images = image_var.size().getInfo()
-
-    for i in tqdm(range(n_images), desc="Processing images"):
-        try:
-            img = ee.Image(image_list.get(i))
-            date = img.date().format('YYYY-MM-dd').getInfo()
-
-            value = img.reduceRegion(
-                reducer=ee.Reducer.first(),
-                geometry=point,
-                scale=scale,
-                bestEffort=True
-            ).get(var_name)
-
-            value = value.getInfo() if value else None
-            results.append({"date": date, var_name: value})
-        except Exception as e:
-            print(f"[!] Skipping image {i} due to error: {e}")
-            continue
-
-    df_out = pd.DataFrame(results)
-    out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-    os.makedirs(out_dir, exist_ok=True)
-    out_path = os.path.join(out_dir, cfg.output.filename)
-
-    df_out["variable"] = variable_name
-    df_out["latitude"] = lat
-    df_out["longitude"] = lon
-    df_out['units'] = units
-    df_out['source'] = provider.upper()
-    df_out.rename(columns={var_name: 'value', "date": 'time'}, inplace=True)
-    df_out = df_out[["latitude", "longitude", "time", "source", "variable", "value",'units']]
-
-    # ureg = pint.UnitRegistry()
-    # pint_pandas.PintType.ureg = ureg
-    # df_out['temperature'] = df['temperature'].astype('pint[C]')
-
-    df_out.to_csv(out_path, index=False)
-    print(f"[✓] Saved timeseries to: {out_path}")
-    return df_out
 def list_drive_files(folder_id, service):
     """
     List all files in a Google Drive folder, handling pagination.
@@ -320,79 +63,12 @@ def download_drive_file(file_id, local_path, service):
     while not done:
         status, done = downloader.next_chunk()
         print(f" → Download {int(status.progress() * 100)}% complete")
-def fetch_MSWX(var_cfg):
-    param_mapping = var_cfg.mappings
-    provider = var_cfg.dataset.lower()
-    parameter_key = var_cfg.weather.parameter
-
-    param_info = param_mapping[provider]['variables'][parameter_key]
-    folder_id = param_info["folder_id"]
-
-    start_date = var_cfg.time_range.start_date
-    end_date = var_cfg.time_range.end_date
-
-    # === 1) Generate expected filenames ===
-    start = datetime.fromisoformat(start_date)
-    end = datetime.fromisoformat(end_date)
-
-    expected_files = []
-    current = start
-    while current <= end:
-        doy = current.timetuple().tm_yday
-        basename = f"{current.year}{doy:03d}.nc"
-        expected_files.append(basename)
-        current += timedelta(days=1)
-
-    output_dir = var_cfg.data_dir
-    local_files = []
-    missing_files = []
-
-    for basename in expected_files:
-        local_path = os.path.join(output_dir, provider, parameter_key, basename)
-        if os.path.exists(local_path):
-            local_files.append(basename)
-        else:
-            missing_files.append(basename)
-
-    if not missing_files:
-        print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
-        return local_files
-
-    print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
-
-    # === 2) Connect to Drive ===
-    SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
-    creds = service_account.Credentials.from_service_account_file(
-        param_mapping[provider].params.google_service_account, scopes=SCOPES
-    )
-    service = build('drive', 'v3', credentials=creds)
 
-
-    drive_files = list_drive_files(folder_id, service)
-    valid_filenames = set(missing_files)
-
-    files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
-
-    if not files_to_download:
-        print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
-        return local_files
-
-    # === 4) Download missing ===
-    for file in files_to_download:
-        filename = file['name']
-        local_path = os.path.join(output_dir, provider, parameter_key, filename)
-        print(f"⬇️ Downloading {filename} ...")
-        download_drive_file(file['id'], local_path, service)
-        local_files.append(filename)
-
-    return local_files
-
-
-def fetch_dwd(var_cfg):
+def fetch_dwd(var_cfg,var):
     """Download HYRAS data for one variable and a list of years."""
     param_mapping = var_cfg.mappings
     provider = var_cfg.dataset.lower()
-    parameter_key =
+    parameter_key = var
     # Validate provider and parameter
 
     param_info = param_mapping[provider]['variables'][parameter_key]
@@ -524,452 +200,42 @@ def extract_ts_dwd(cfg: DictConfig):
     ts_all.to_csv(out_path, index=False)
     print(f"✅ Saved time series to: {out_path}")
 
-    return ts_all
-def extract_ts_MSWX(cfg: DictConfig):
-    parameter = cfg.weather.parameter
-    param_mapping = cfg.mappings
-    provider = cfg.dataset.lower()
-    parameter_key = cfg.weather.parameter
-    # Validate provider and parameter
-
-    param_info = param_mapping[provider]['variables'][parameter_key]
-
-    base_dir = cfg.data_dir
-
-    target_lat = cfg.location.lat
-    target_lon = cfg.location.lon
-
-    start_date = pd.to_datetime(cfg.time_range.start_date)
-    end_date = pd.to_datetime(cfg.time_range.end_date)
-
-    # === 1) Rebuild exact basenames ===
-    current = start_date
-    basenames = []
-    while current <= end_date:
-        doy = current.timetuple().tm_yday
-        basename = f"{current.year}{doy:03d}.nc"
-        basenames.append(basename)
-        current += timedelta(days=1)
-
-    # === 2) Process only those files ===
-    ts_list = []
-    missing = []
-
-    for basename in basenames:
-        file_path = os.path.join(base_dir, provider, parameter, basename)
-
-        if not os.path.exists(file_path):
-            missing.append(basename)
-            continue
-
-        print(f"📂 Opening: {file_path}")
-        ds = xr.open_dataset(file_path)
-
-        time_name = [x for x in ds.coords if "time" in x.lower()][0]
-        data_var = [v for v in ds.data_vars][0]
-
-        ts = ds[data_var].sel(
-            lat=target_lat,
-            lon=target_lon,
-            method='nearest'
-        )
-
-        df = ts.to_dataframe().reset_index()[[time_name, data_var]]
-        ts_list.append(df)
-
-    if missing:
-        print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
-        for m in missing:
-            print(f" - {m}")
-
-    if not ts_list:
-        raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
-
-    # === 3) Combine and slice (for safety) ===
-    ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
-
-    ts_all[time_name] = pd.to_datetime(ts_all[time_name])
-    ts_all = ts_all[
-        (ts_all[time_name] >= start_date) &
-        (ts_all[time_name] <= end_date)
-    ].reset_index(drop=True)
-
-    out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-    os.makedirs(out_dir, exist_ok=True)
-    out_path = os.path.join(out_dir, cfg.output.filename)
-
-    ts_all["variable"] = param_info['name']
-    ts_all["latitude"] = target_lat
-    ts_all["longitude"] = target_lon
-    ts_all['source'] = provider.upper()
-    ts_all['units'] = ts.attrs['units']
-    ts_all.rename(columns={param_info['name']: 'value'}, inplace=True)
-    ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value",'units']]
-
-    ts_all.to_csv(out_path, index=False)
-    print(f"✅ Saved MSWX time series to: {out_path}")
-
     return ts_all
 
 import os
 from omegaconf import DictConfig
 
-def
-    """Generate full output file path from pattern and config."""
-    provider = cfg.dataset.lower()
-    parameter = cfg.weather.parameter
-    lat = cfg.location.lat
-    lon = cfg.location.lon
-    start = cfg.time_range.start_date
-    end = cfg.time_range.end_date
-
-    pattern = cfg.output.get("filename", "{provider}_{parameter}_{start}_{end}.csv")
-    filename = pattern.format(
-        provider=provider,
-        parameter=parameter,
-        lat=lat,
-        lon=lon,
-        start=start,
-        end=end
-    )
-
-    out_dir = cfg.output.out_dir
-    fmt = cfg.output.fmt  # format is a reserved word in Python, so use 'fmt'
-
-    # return os.path.join(out_dir, fmt, filename)
-    return filename
-
-# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-FileCopyrightText: All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-import os
-import tempfile
-import cdsapi
-import xarray as xr
-import datetime
-import json
-import dask
-import calendar
-from dask.diagnostics import ProgressBar
-from typing import List, Tuple, Dict, Union
-import urllib3
-import logging
-import numpy as np
-import fsspec
-
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-
-class ERA5Mirror:
+def get_output_filename(cfg, output_type="nc", lat=None, lon=None):
     """
-
-
-    Attributes
-    ----------
-    base_path : Path
-        The path to the Zarr dataset.
-    fs : fsspec.AbstractFileSystem
-        The filesystem to use for the Zarr dataset. If None, the local filesystem will be used.
+    Generate output filename based on config, output type, and extraction mode.
+    output_type: "nc", "csv", or "zarr"
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def save_metadata(self):
-        """Save metadata"""
-        with self.fs.open(self.metadata_file, "w") as f:
-            json.dump(self.metadata, f)
-
-    def chunk_exists(self, variable, year, month, hours, pressure_level):
-        """Check if chunk exists"""
-        for chunk in self.metadata["chunks"]:
-            if (
-                chunk["variable"] == variable
-                and chunk["year"] == year
-                and chunk["month"] == month
-                and chunk["hours"] == hours
-                and chunk["pressure_level"] == pressure_level
-            ):
-                return True
-        return False
-
-    def download_chunk(
-        self,
-        variable: str,
-        year: int,
-        month: int,
-        hours: List[int],
-        pressure_level: int = None,
-    ):
-        """
-        Download ERA5 data for the specified variable, date range, hours, and pressure levels.
-
-        Parameters
-        ----------
-        variable : str
-            The ERA5 variable to download, e.g. 'tisr' for solar radiation or 'z' for geopotential.
-        year : int
-            The year to download.
-        month : int
-            The month to download.
-        hours : List[int]
-            A list of hours (0-23) for which data should be downloaded.
-        pressure_level : int, optional
-            A pressure level to include in the download, by default None. If None, the single-level data will be downloaded.
-
-        Returns
-        -------
-        xr.Dataset
-            An xarray Dataset containing the downloaded data.
-        """
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Get all days in the month
-            days_in_month = calendar.monthrange(year, month)[1]
-
-            # Make tmpfile to store the data
-            output_file = os.path.join(
-                tmpdir,
-                f"{variable}_{year}_{month:02d}_{str(hours)}_{str(pressure_level)}.nc",
-            )
-
-            # start the CDS API client (maybe need to move this outside the loop?)
-            c = cdsapi.Client(quiet=True)
-
-            # Setup the request parameters
-            request_params = {
-                "product_type": "reanalysis",
-                "variable": variable,
-                "year": str(year),
-                "month": str(month),
-                "day": [f"{day:02d}" for day in range(1, days_in_month + 1)],
-                "time": [f"{hour:02d}:00" for hour in hours],
-                "format": "netcdf",
-            }
-            if pressure_level:
-                request_params["pressure_level"] = [str(pressure_level)]
-                dataset_name = "reanalysis-era5-pressure-levels"
-            else:
-                dataset_name = "reanalysis-era5-single-levels"
-
-            # Download the data
-            c.retrieve(
-                dataset_name,
-                request_params,
-                output_file,
-            )
-
-            # Open the downloaded data
-            ds = xr.open_dataset(output_file)
-            return ds
-
-    def variable_to_zarr_name(self, variable: str, pressure_level: int = None):
-        """convert variable to zarr name"""
-        # create zarr path for variable
-        zarr_path = f"{self.base_path}/{variable}"
-        if pressure_level:
-            zarr_path += f"_pressure_level_{pressure_level}"
-        zarr_path += ".zarr"
-        return zarr_path
-
-    def download_and_upload_chunk(
-        self,
-        variable: str,
-        year: int,
-        month: int,
-        hours: List[int],
-        pressure_level: int = None,
-    ):
-        """
-        Downloads a chunk of ERA5 data for a specific variable and date range, and uploads it to a Zarr array.
-        This downloads a 1-month chunk of data.
-
-        Parameters
-        ----------
-        variable : str
-            The variable to download.
-        year : int
-            The year to download.
-        month : int
-            The month to download.
-        hours : List[int]
-            A list of hours to download.
-        pressure_level : int, optional
-            Pressure levels to download, if applicable.
-        """
-
-        # Download the data
-        ds = self.download_chunk(variable, year, month, hours, pressure_level)
-        if "valid_time" in ds.dims:
-            ds = ds.rename({"valid_time": "time"})
-
-        # Create the Zarr path
-        zarr_path = self.variable_to_zarr_name(variable, pressure_level)
-
-        # Specify the chunking options
-        chunking = {"time": 1, "latitude": 721, "longitude": 1440}
-        if "level" in ds.dims:
-            chunking["level"] = 1
-
-        # Re-chunk the dataset
-        ds = ds.chunk(chunking)
-
-        # Check if the Zarr dataset exists
-        if self.fs.exists(zarr_path):
-            mode = "a"
-            append_dim = "time"
-            create = False
-        else:
-            mode = "w"
-            append_dim = None
-            create = True
-
-        # Upload the data to the Zarr dataset
-        mapper = self.fs.get_mapper(zarr_path, create=create)
-        ds.to_zarr(mapper, mode=mode, consolidated=True, append_dim=append_dim)
-
-        # Update the metadata
-        self.metadata["chunks"].append(
-            {
-                "variable": variable,
-                "year": year,
-                "month": month,
-                "hours": hours,
-                "pressure_level": pressure_level,
-            }
+    if output_type == "csv":
+        template = cfg.output.filename_csv
+    elif output_type == "zarr":
+        template = cfg.output.filename_zarr
+    else:
+        template = cfg.output.filename_nc
+
+    # If lat/lon are provided, use point template
+    if lat is not None and lon is not None:
+        filename = template.format(
+            provider=cfg.dataset,
+            parameter="surface",
+            lat=f"{lat}",
+            lon=f"{lon}",
+            start=cfg.time_range.start_date.replace("-", ""),
+            end=cfg.time_range.end_date.replace("-", ""),
+        )
+    else:
+        # Use region bounds
+        region_bounds = cfg.bounds[cfg.region]
+        filename = template.format(
+            provider=cfg.dataset,
+            parameter="surface",
+            lat_range=f"{region_bounds['lat_min']}-{region_bounds['lat_max']}",
+            lon_range=f"{region_bounds['lon_min']}-{region_bounds['lon_max']}",
+            start=cfg.time_range.start_date.replace("-", ""),
+            end=cfg.time_range.end_date.replace("-", ""),
         )
-
-
-    def download(
-        self,
-        variables: List[Union[str, Tuple[str, int]]],
-        date_range: Tuple[datetime.date, datetime.date],
-        hours: List[int],
-    ):
-        """
-        Start the process of mirroring the specified ERA5 variables for the given date range and hours.
-
-        Parameters
-        ----------
-        variables : List[Union[str, Tuple[str, List[int]]]]
-            A list of variables to mirror, where each element can either be a string (single-level variable)
-            or a tuple (variable with pressure level).
-        date_range : Tuple[datetime.date, datetime.date]
-            A tuple containing the start and end dates for the data to be mirrored. This will download and store every month in the range.
-        hours : List[int]
-            A list of hours for which to download the data.
-
-        Returns
-        -------
-        zarr_paths : List[str]
-            A list of Zarr paths for each of the variables.
-        """
-
-        start_date, end_date = date_range
-
-        # Reformat the variables list so all elements are tuples
-        reformated_variables = []
-        for variable in variables:
-            if isinstance(variable, str):
-                reformated_variables.append(tuple([variable, None]))
-            else:
-                reformated_variables.append(variable)
-
-        # Start Downloading
-        with ProgressBar():
-            # Round dates to months
-            current_date = start_date.replace(day=1)
-            end_date = end_date.replace(day=1)
-
-            while current_date <= end_date:
-                # Create a list of tasks to download the data
-                tasks = []
-                for variable, pressure_level in reformated_variables:
-                    if not self.chunk_exists(
-                        variable,
-                        current_date.year,
-                        current_date.month,
-                        hours,
-                        pressure_level,
-                    ):
-                        task = dask.delayed(self.download_and_upload_chunk)(
-                            variable,
-                            current_date.year,
-                            current_date.month,
-                            hours,
-                            pressure_level,
-                        )
-                        tasks.append(task)
-                    else:
-                        print(
-                            f"Chunk for {variable} {pressure_level} {current_date.year}-{current_date.month} already exists. Skipping."
-                        )
-
-                # Execute the tasks with Dask
-                print(f"Downloading data for {current_date.year}-{current_date.month}")
-                if tasks:
-                    dask.compute(*tasks)
-
-                # Update the metadata
-                self.save_metadata()
-
-                # Update the current date
-                days_in_month = calendar.monthrange(
-                    year=current_date.year, month=current_date.month
-                )[1]
-                current_date += datetime.timedelta(days=days_in_month)
-
-        # Return the Zarr paths
-        zarr_paths = []
-        for variable, pressure_level in reformated_variables:
-            zarr_path = self.variable_to_zarr_name(variable, pressure_level)
-            zarr_paths.append(zarr_path)
-
-        # Check that Zarr arrays have correct dt for time dimension
-        for zarr_path in zarr_paths:
-            ds = xr.open_zarr(zarr_path)
-            time_stamps = ds.time.values
-            dt = time_stamps[1:] - time_stamps[:-1]
-            assert np.all(
-                dt == dt[0]
-            ), f"Zarr array {zarr_path} has incorrect dt for time dimension. An error may have occurred during download. Please delete the Zarr array and try again."
-
-        return zarr_paths
-'''
+    return filename
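
For orientation, the sketch below shows how the new get_output_filename helper introduced in 0.1.3 appears intended to be called. It is not part of the diff above: the OmegaConf config values and the filename templates are illustrative assumptions (the real templates presumably live in climdata/conf/config.yaml), chosen only so that both branches, point extraction with lat/lon and region extraction via cfg.bounds[cfg.region], can be exercised.

# Minimal usage sketch, assuming hypothetical template strings and config values.
from omegaconf import OmegaConf
from climdata.utils.utils_download import get_output_filename

cfg = OmegaConf.create({
    "dataset": "HYRAS",
    "region": "germany",
    "time_range": {"start_date": "2020-01-01", "end_date": "2020-12-31"},
    "bounds": {"germany": {"lat_min": 47.0, "lat_max": 55.0,
                           "lon_min": 5.5, "lon_max": 15.5}},
    "output": {
        # Placeholder templates; the shipped config.yaml defines its own.
        "filename_csv": "{provider}_{parameter}_{lat}_{lon}_{start}_{end}.csv",
        "filename_nc": "{provider}_{parameter}_{lat_range}_{lon_range}_{start}_{end}.nc",
        "filename_zarr": "{provider}_{parameter}_{lat_range}_{lon_range}_{start}_{end}.zarr",
    },
})

# Point extraction: lat/lon given, so the point-style template is formatted.
print(get_output_filename(cfg, output_type="csv", lat=52.52, lon=13.41))
# HYRAS_surface_52.52_13.41_20200101_20201231.csv

# Region extraction: no lat/lon, so cfg.bounds[cfg.region] supplies the ranges.
print(get_output_filename(cfg, output_type="nc"))
# HYRAS_surface_47.0-55.0_5.5-15.5_20200101_20201231.nc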