climdata 0.1.1__py2.py3-none-any.whl → 0.1.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of climdata might be problematic.
- climdata/__init__.py +2 -1
- climdata/conf/config.yaml +17 -20
- climdata/conf/mappings/parameters.yaml +2 -2
- climdata/datasets/CMIPCloud.py +55 -32
- climdata/datasets/DWD.py +12 -20
- climdata/datasets/HYRAS.py +133 -0
- climdata/datasets/MSWX.py +110 -184
- climdata/utils/utils_download.py +33 -767
- {climdata-0.1.1.dist-info → climdata-0.1.3.dist-info}/METADATA +1 -1
- climdata-0.1.3.dist-info/RECORD +19 -0
- climdata/__main__.py +0 -5
- climdata/main.py +0 -56
- climdata-0.1.1.dist-info/RECORD +0 -20
- {climdata-0.1.1.dist-info → climdata-0.1.3.dist-info}/WHEEL +0 -0
- {climdata-0.1.1.dist-info → climdata-0.1.3.dist-info}/entry_points.txt +0 -0
- {climdata-0.1.1.dist-info → climdata-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {climdata-0.1.1.dist-info → climdata-0.1.3.dist-info}/top_level.txt +0 -0
climdata/datasets/MSWX.py
CHANGED
@@ -1,96 +1,42 @@
 import pandas as pd
-import numpy as np
-from wetterdienst import Settings
-from wetterdienst.provider.dwd.observation import DwdObservationRequest
-import geemap
-import ee
-import ipdb
 import geopandas as gpd
-from omegaconf import DictConfig
 import os
-import yaml
-import time
 from tqdm import tqdm
 import warnings
 from datetime import datetime, timedelta
 import xarray as xr
-import hydra
 from omegaconf import DictConfig
-import pint
-import pint_pandas

 from google.oauth2 import service_account
 from googleapiclient.discovery import build
-from googleapiclient.http import MediaIoBaseDownload

 from climdata.utils.utils_download import list_drive_files, download_drive_file
-
-import io
-import requests
-from scipy.spatial import cKDTree
-import argparse
-import re
-
-import requests
-from bs4 import BeautifulSoup
-import concurrent.futures
-
-import gzip
-# from utils.utils import *
-# from datasets.datasets import *
-import rioxarray
 from shapely.geometry import mapping
+import cf_xarray

 warnings.filterwarnings("ignore", category=Warning)

-import cf_xarray

 class MSWXmirror:
-    def __init__(self,
-        self.
-        self.files = []
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
         self.dataset = None
+        self.variables = cfg.variables
+        self.files = []

-    def _fix_coords(self, ds: xr.Dataset | xr.DataArray)
-        """
-        Ensure latitude is ascending and longitude is in the range [0, 360].
-
-        Parameters
-        ----------
-        ds : xr.Dataset or xr.DataArray
-            Input dataset or dataarray with latitude and longitude coordinates.
-
-        Returns
-        -------
-        xr.Dataset or xr.DataArray
-            Dataset with latitude ascending and longitude wrapped to [0, 360].
-        """
-        # Flip latitude to ascending
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+        """Ensure latitude is ascending and longitude is in the range [0, 360]."""
         ds = ds.cf.sortby("latitude")
-
-        # Wrap longitude into [0, 360]
         lon_name = ds.cf["longitude"].name
         ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+        return ds.sortby(lon_name)

-
-
-
-
-
-
-    def fetch(self):
-        param_mapping = self.var_cfg.mappings
-        provider = self.var_cfg.dataset.lower()
-        parameter_key = self.var_cfg.weather.parameter
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        folder_id = param_info["folder_id"]
-
-        start_date = self.var_cfg.time_range.start_date
-        end_date = self.var_cfg.time_range.end_date
-
-        start = datetime.fromisoformat(start_date)
-        end = datetime.fromisoformat(end_date)
+    def fetch(self, folder_id: str, variable: str):
+        """
+        Fetch MSWX files from Google Drive for a given variable.
+        """
+        start = datetime.fromisoformat(self.cfg.time_range.start_date)
+        end = datetime.fromisoformat(self.cfg.time_range.end_date)

         expected_files = []
         current = start
@@ -100,29 +46,25 @@ class MSWXmirror:
             expected_files.append(basename)
             current += timedelta(days=1)

-        output_dir = self.
-
-        parameter_key = self.var_cfg.weather.parameter
-        local_files = []
-        missing_files = []
+        output_dir = self.cfg.data_dir
+        local_files, missing_files = [], []

         for basename in expected_files:
-            local_path = os.path.join(output_dir,
+            local_path = os.path.join(output_dir,self.cfg.dataset, variable, basename)
             if os.path.exists(local_path):
                 local_files.append(basename)
             else:
                 missing_files.append(basename)

         if not missing_files:
-            print(f"✅ All {len(expected_files)} files already exist locally.
-            self.files = local_files
+            print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
             return local_files

-        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")

         SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
         creds = service_account.Credentials.from_service_account_file(
-
+            self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
         )
         service = build('drive', 'v3', credentials=creds)

@@ -131,84 +73,63 @@ class MSWXmirror:
         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]

         if not files_to_download:
-            print(f"⚠️
-            self.files = local_files
+            print(f"⚠️ No {variable} files found in Drive for requested dates.")
             return local_files

         for file in files_to_download:
             filename = file['name']
-            local_path = os.path.join(output_dir,
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
             print(f"⬇️ Downloading {filename} ...")
             download_drive_file(file['id'], local_path, service)
             local_files.append(filename)

-        self.files = local_files
         return local_files

-    def load(self):
-
-
-
-
-
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        output_dir = self.var_cfg.data_dir
-        valid_dsets = []
+    def load(self, variable: str):
+        """
+        Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+        """
+        folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+        files = self.fetch(folder_id, variable)
+        datasets = []

-        for f in
-            local_path = os.path.join(
+        for f in files:
+            local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
             try:
-                ds = xr.open_dataset(local_path, chunks=
-
+                ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                ds = ds.rename(variable)
+                datasets.append(ds)
             except Exception as e:
-                print(f"Skipping file due to error: {
+                print(f"Skipping file {f} due to error: {e}")

-
-
-        self.dataset = self._fix_coords(dset)
-        return dset
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")

-
-
-
+        dset = xr.concat(datasets, dim="time")
+        dset = dset.transpose("time", "lat", "lon")
+        dset = self._fix_coords(dset)

-
-
-        region = self.var_cfg.region
+        self.dataset = dset
+        return self.dataset

-
+    def to_zarr(self, zarr_filename: str):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        var_name = self.dataset.name
         if var_name == 'pr':
             self.dataset.attrs['units'] = 'mm/day'
         elif var_name in ['tas', 'tasmax', 'tasmin']:
             self.dataset.attrs['units'] = 'degC'

-        zarr_path = os.path.join("data/MSWX
+        zarr_path = os.path.join("data/MSWX", zarr_filename)
         os.makedirs(os.path.dirname(zarr_path), exist_ok=True)

         print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
         self.dataset.to_zarr(zarr_path, mode="w")

     def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
-        """
-        Extract a subset of the dataset by point, bounding box, or shapefile.
-
-        Parameters
-        ----------
-        point : tuple(float, float), optional
-            (lon, lat) coordinates for a single point.
-        box : tuple(float, float, float, float), optional
-            (min_lon, min_lat, max_lon, max_lat) bounding box.
-        shapefile : str or geopandas.GeoDataFrame, optional
-            Path to shapefile or a GeoDataFrame.
-        buffer_km : float, optional
-            Buffer distance in kilometers (for point or shapefile).
-
-        Returns
-        -------
-        xarray.Dataset or xarray.DataArray
-            Subset of the dataset.
-        """
         if self.dataset is None:
             raise ValueError("No dataset loaded. Call `load()` first.")

@@ -217,20 +138,18 @@ class MSWXmirror:
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
-
-                buffer_deg = buffer_km / 111  # rough conversion km→degrees
+                buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
                     lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lat=slice(lat-buffer_deg, lat+buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")

         elif box is not None:
-            min_lon, min_lat, max_lon, max_lat = box
             ds_subset = ds.sel(
-                lon=slice(
-                lat=slice(
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )

         elif shapefile is not None:
@@ -239,64 +158,71 @@ class MSWXmirror:
             else:
                 gdf = shapefile
             if buffer_km > 0:
-                gdf = gdf.to_crs(epsg=3857)
+                gdf = gdf.to_crs(epsg=3857)
                 gdf["geometry"] = gdf.buffer(buffer_km * 1000)
                 gdf = gdf.to_crs(epsg=4326)
-
             geom = [mapping(g) for g in gdf.geometry]
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)

         else:
             raise ValueError("Must provide either point, box, or shapefile.")

+        self.dataset = ds_subset.to_dataset()
         return ds_subset
-
-    def to_dataframe(self, ds=None):
-        """
-        Convert extracted xarray dataset to a tidy dataframe.

-
-
-
-
+    # def to_dataframe(self, ds=None):
+    #     if ds is None:
+    #         if self.dataset is None:
+    #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+    #         ds = self.dataset
+
+    #     if isinstance(ds, xr.Dataset):
+    #         if len(ds.data_vars) != 1:
+    #             raise ValueError("Dataset has multiple variables. Please select one.")
+    #         ds = ds[list(ds.data_vars)[0]]
+
+    #     df = ds.to_dataframe().reset_index()
+    #     df = df[["time", "lat", "lon", ds.name]]
+    #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+    #     return df
+
+    def _format(self, df):
+        """Format dataframe for standardized output."""
+        value_vars = [v for v in self.variables if v in df.columns]
+        id_vars = [c for c in df.columns if c not in value_vars]
+
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def format(self, df):
-        """
-        Format dataframe into standard schema.
-        """
-        df = df.copy()
-        df["variable"] = self.var_cfg.weather.parameter
-        df["source"] = self.var_cfg.dataset.upper()
-        df["units"] = self.dataset.attrs.get("units", "unknown")
-
-        df = df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
-        return df
-
+        df_long["units"] = df_long["variable"].map(
+            lambda v: self.dataset[v].attrs.get("units", "unknown")
+            if v in self.dataset.data_vars
+            else "unknown"
+        )
+
+        df_long["source"] = self.cfg.dataset
+
+        cols = [
+            "source",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
+        df_long = df_long[[c for c in cols if c in df_long.columns]]
+
+        return df_long
+
+    def save_csv(self, filename):
+        if self.dataset is not None:
+            df = self.dataset.to_dataframe().reset_index()
+            df = self._format(df)
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")