climdata 0.1.2-py2.py3-none-any.whl → 0.1.3-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of climdata might be problematic.
- climdata/__init__.py +2 -1
- climdata/conf/config.yaml +17 -20
- climdata/conf/mappings/parameters.yaml +2 -2
- climdata/datasets/CMIPCloud.py +54 -32
- climdata/datasets/DWD.py +12 -20
- climdata/datasets/HYRAS.py +133 -0
- climdata/datasets/MSWX.py +107 -190
- climdata/utils/utils_download.py +33 -767
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/METADATA +1 -1
- climdata-0.1.3.dist-info/RECORD +19 -0
- climdata/__main__.py +0 -5
- climdata/main.py +0 -56
- climdata-0.1.2.dist-info/RECORD +0 -20
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/WHEEL +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/entry_points.txt +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {climdata-0.1.2.dist-info → climdata-0.1.3.dist-info}/top_level.txt +0 -0
climdata/datasets/MSWX.py
CHANGED
@@ -1,96 +1,42 @@
 import pandas as pd
-import numpy as np
-from wetterdienst import Settings
-from wetterdienst.provider.dwd.observation import DwdObservationRequest
-import geemap
-import ee
-import ipdb
 import geopandas as gpd
-from omegaconf import DictConfig
 import os
-import yaml
-import time
 from tqdm import tqdm
 import warnings
 from datetime import datetime, timedelta
 import xarray as xr
-import hydra
 from omegaconf import DictConfig
-import pint
-import pint_pandas
 
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
-from googleapiclient.http import MediaIoBaseDownload
 
 from climdata.utils.utils_download import list_drive_files, download_drive_file
-
-import io
-import requests
-from scipy.spatial import cKDTree
-import argparse
-import re
-
-import requests
-from bs4 import BeautifulSoup
-import concurrent.futures
-
-import gzip
-# from utils.utils import *
-# from datasets.datasets import *
-import rioxarray
 from shapely.geometry import mapping
+import cf_xarray
 
 warnings.filterwarnings("ignore", category=Warning)
 
-import cf_xarray
 
 class MSWXmirror:
-    def __init__(self,
-        self.
-        self.files = []
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
         self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
 
-    def _fix_coords(self, ds: xr.Dataset | xr.DataArray)
-        """
-        Ensure latitude is ascending and longitude is in the range [0, 360].
-
-        Parameters
-        ----------
-        ds : xr.Dataset or xr.DataArray
-            Input dataset or dataarray with latitude and longitude coordinates.
-
-        Returns
-        -------
-        xr.Dataset or xr.DataArray
-            Dataset with latitude ascending and longitude wrapped to [0, 360].
-        """
-        # Flip latitude to ascending
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+        """Ensure latitude is ascending and longitude is in the range [0, 360]."""
         ds = ds.cf.sortby("latitude")
-
-        # Wrap longitude into [0, 360]
         lon_name = ds.cf["longitude"].name
         ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+        return ds.sortby(lon_name)
 
-
-
-
-
-
-
-    def fetch(self):
-        param_mapping = self.var_cfg.mappings
-        provider = self.var_cfg.dataset.lower()
-        parameter_key = self.var_cfg.weather.parameter
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        folder_id = param_info["folder_id"]
-
-        start_date = self.var_cfg.time_range.start_date
-        end_date = self.var_cfg.time_range.end_date
-
-        start = datetime.fromisoformat(start_date)
-        end = datetime.fromisoformat(end_date)
+    def fetch(self, folder_id: str, variable: str):
+        """
+        Fetch MSWX files from Google Drive for a given variable.
+        """
+        start = datetime.fromisoformat(self.cfg.time_range.start_date)
+        end = datetime.fromisoformat(self.cfg.time_range.end_date)
 
         expected_files = []
         current = start
@@ -100,29 +46,25 @@ class MSWXmirror:
             expected_files.append(basename)
             current += timedelta(days=1)
 
-        output_dir = self.
-
-        parameter_key = self.var_cfg.weather.parameter
-        local_files = []
-        missing_files = []
+        output_dir = self.cfg.data_dir
+        local_files, missing_files = [], []
 
         for basename in expected_files:
-            local_path = os.path.join(output_dir,
+            local_path = os.path.join(output_dir,self.cfg.dataset, variable, basename)
            if os.path.exists(local_path):
                 local_files.append(basename)
             else:
                 missing_files.append(basename)
 
         if not missing_files:
-            print(f"✅ All {len(expected_files)} files already exist locally.")
-            self.files = local_files
+            print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
             return local_files
 
-        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")
 
         SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
         creds = service_account.Credentials.from_service_account_file(
-
+            self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
         )
         service = build('drive', 'v3', credentials=creds)
 
@@ -131,86 +73,63 @@ class MSWXmirror:
         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
 
         if not files_to_download:
-            print(f"⚠️
-            self.files = local_files
+            print(f"⚠️ No {variable} files found in Drive for requested dates.")
             return local_files
 
         for file in files_to_download:
             filename = file['name']
-            local_path = os.path.join(output_dir,
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
             print(f"⬇️ Downloading {filename} ...")
             download_drive_file(file['id'], local_path, service)
             local_files.append(filename)
 
-        self.files = local_files
         return local_files
 
-    def load(self):
-
-
-
-
-
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        output_dir = self.var_cfg.data_dir
-        valid_dsets = []
+    def load(self, variable: str):
+        """
+        Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+        """
+        folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+        files = self.fetch(folder_id, variable)
+        datasets = []
 
-        for f in
-            local_path = os.path.join(
+        for f in files:
+            local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
             try:
-                ds = xr.open_dataset(local_path, chunks=
-
-
-                valid_dsets.append(ds)
+                ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                ds = ds.rename(variable)
+                datasets.append(ds)
             except Exception as e:
-                print(f"Skipping file due to error: {
+                print(f"Skipping file {f} due to error: {e}")
+
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+
+        dset = xr.concat(datasets, dim="time")
+        dset = dset.transpose("time", "lat", "lon")
+        dset = self._fix_coords(dset)
 
-
-        dset = dset.transpose('time', 'lat', 'lon')
-        self.dataset = self._fix_coords(dset)
+        self.dataset = dset
         return self.dataset
 
-    def to_zarr(self, zarr_filename):
+    def to_zarr(self, zarr_filename: str):
         if self.dataset is None:
-            raise ValueError("No dataset loaded. Call `load()`
-
-        var_name = self.var_cfg.weather.parameter
-        dataset_name = self.var_cfg.dataset
-        region = self.var_cfg.region
+            raise ValueError("No dataset loaded. Call `load()` first.")
 
-
+        var_name = self.dataset.name
         if var_name == 'pr':
             self.dataset.attrs['units'] = 'mm/day'
         elif var_name in ['tas', 'tasmax', 'tasmin']:
             self.dataset.attrs['units'] = 'degC'
 
-        zarr_path = os.path.join("data/MSWX
+        zarr_path = os.path.join("data/MSWX", zarr_filename)
         os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
 
         print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
         self.dataset.to_zarr(zarr_path, mode="w")
 
     def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
-        """
-        Extract a subset of the dataset by point, bounding box, or shapefile.
-
-        Parameters
-        ----------
-        point : tuple(float, float), optional
-            (lon, lat) coordinates for a single point.
-        box : tuple(float, float, float, float), optional
-            (min_lon, min_lat, max_lon, max_lat) bounding box.
-        shapefile : str or geopandas.GeoDataFrame, optional
-            Path to shapefile or a GeoDataFrame.
-        buffer_km : float, optional
-            Buffer distance in kilometers (for point or shapefile).
-
-        Returns
-        -------
-        xarray.Dataset or xarray.DataArray
-            Subset of the dataset.
-        """
         if self.dataset is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
 
@@ -219,20 +138,18 @@ class MSWXmirror:
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
-
-                buffer_deg = buffer_km / 111 # rough conversion km→degrees
+                buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
                     lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lat=slice(lat-buffer_deg, lat+buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box[
-                lat=slice(box[
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )
 
         elif shapefile is not None:
@@ -241,71 +158,71 @@ class MSWXmirror:
             else:
                 gdf = shapefile
             if buffer_km > 0:
-                gdf = gdf.to_crs(epsg=3857)
+                gdf = gdf.to_crs(epsg=3857)
                 gdf["geometry"] = gdf.buffer(buffer_km * 1000)
                 gdf = gdf.to_crs(epsg=4326)
-
             geom = [mapping(g) for g in gdf.geometry]
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
         else:
             raise ValueError("Must provide either point, box, or shapefile.")
-
-        self.dataset =
+
+        self.dataset = ds_subset.to_dataset()
         return ds_subset
-
-    def to_dataframe(self, ds=None):
-        """
-        Convert extracted xarray dataset to a tidy dataframe.
 
-
-
-
-
+    # def to_dataframe(self, ds=None):
+    #     if ds is None:
+    #         if self.dataset is None:
+    #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+    #         ds = self.dataset
+
+    #     if isinstance(ds, xr.Dataset):
+    #         if len(ds.data_vars) != 1:
+    #             raise ValueError("Dataset has multiple variables. Please select one.")
+    #         ds = ds[list(ds.data_vars)[0]]
+
+    #     df = ds.to_dataframe().reset_index()
+    #     df = df[["time", "lat", "lon", ds.name]]
+    #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+    #     return df
+
+    def _format(self, df):
+        """Format dataframe for standardized output."""
+        value_vars = [v for v in self.variables if v in df.columns]
+        id_vars = [c for c in df.columns if c not in value_vars]
+
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
 
-
-
-
-
-
-        if self.dataset is None:
-            raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
-        ds = self.dataset
-
-        # If Dataset, pick first variable
-        if isinstance(ds, xr.Dataset):
-            if len(ds.data_vars) != 1:
-                raise ValueError("Dataset has multiple variables. Please select one.")
-            ds = ds[list(ds.data_vars)[0]]
-
-        df = ds.to_dataframe().reset_index()
-
-        # Keep only relevant cols
-        df = df[["time", "lat", "lon", ds.name]]
-
-        # Rename
-        df = df.rename(columns={
-            "lat": "latitude",
-            "lon": "longitude",
-            ds.name: "value"
-        })
-        return df
-    def save_netcdf(self, filename):
-        if self.dataset is not None:
-            if "time" in self.dataset.variables:
-                self.dataset["time"].encoding.clear()
-            self.dataset.to_netcdf(filename)
-            # print(f"Saved NetCDF to {filename}")
+        df_long["units"] = df_long["variable"].map(
+            lambda v: self.dataset[v].attrs.get("units", "unknown")
+            if v in self.dataset.data_vars
+            else "unknown"
+        )
 
-
-        """
-        Format dataframe into standard schema.
-        """
-        df = df.copy()
-        df["variable"] = self.var_cfg.weather.parameter
-        df["source"] = self.var_cfg.dataset.upper()
-        df["units"] = self.dataset.attrs.get("units", "unknown")
+        df_long["source"] = self.cfg.dataset
 
-
-
+        cols = [
+            "source",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
+        df_long = df_long[[c for c in cols if c in df_long.columns]]
 
+        return df_long
+
+    def save_csv(self, filename):
+        if self.dataset is not None:
+            df = self.dataset.to_dataframe().reset_index()
+            df = self._format(df)
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")