climdata 0.1.2-py2.py3-none-any.whl → 0.1.4-py2.py3-none-any.whl

climdata/datasets/MSWX.py CHANGED
@@ -1,96 +1,42 @@
 import pandas as pd
-import numpy as np
-from wetterdienst import Settings
-from wetterdienst.provider.dwd.observation import DwdObservationRequest
-import geemap
-import ee
-import ipdb
 import geopandas as gpd
-from omegaconf import DictConfig
 import os
-import yaml
-import time
 from tqdm import tqdm
 import warnings
 from datetime import datetime, timedelta
 import xarray as xr
-import hydra
 from omegaconf import DictConfig
-import pint
-import pint_pandas

 from google.oauth2 import service_account
 from googleapiclient.discovery import build
-from googleapiclient.http import MediaIoBaseDownload

 from climdata.utils.utils_download import list_drive_files, download_drive_file
-
-import io
-import requests
-from scipy.spatial import cKDTree
-import argparse
-import re
-
-import requests
-from bs4 import BeautifulSoup
-import concurrent.futures
-
-import gzip
-# from utils.utils import *
-# from datasets.datasets import *
-import rioxarray
 from shapely.geometry import mapping
+import cf_xarray

 warnings.filterwarnings("ignore", category=Warning)

-import cf_xarray

 class MSWXmirror:
-    def __init__(self, var_cfg: DictConfig):
-        self.var_cfg = var_cfg
-        self.files = []
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
         self.dataset = None
+        self.variables = cfg.variables
+        self.files = []

-    def _fix_coords(self, ds: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray:
-        """
-        Ensure latitude is ascending and longitude is in the range [0, 360].
-
-        Parameters
-        ----------
-        ds : xr.Dataset or xr.DataArray
-            Input dataset or dataarray with latitude and longitude coordinates.
-
-        Returns
-        -------
-        xr.Dataset or xr.DataArray
-            Dataset with latitude ascending and longitude wrapped to [0, 360].
-        """
-        # Flip latitude to ascending
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+        """Ensure latitude is ascending and longitude is in the range [0, 360]."""
         ds = ds.cf.sortby("latitude")
-
-        # Wrap longitude into [0, 360]
         lon_name = ds.cf["longitude"].name
         ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+        return ds.sortby(lon_name)

-        # Sort by longitude
-        ds = ds.sortby(lon_name)
-
-        return ds
-
-
-    def fetch(self):
-        param_mapping = self.var_cfg.mappings
-        provider = self.var_cfg.dataset.lower()
-        parameter_key = self.var_cfg.weather.parameter
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        folder_id = param_info["folder_id"]
-
-        start_date = self.var_cfg.time_range.start_date
-        end_date = self.var_cfg.time_range.end_date
-
-        start = datetime.fromisoformat(start_date)
-        end = datetime.fromisoformat(end_date)
+    def fetch(self, folder_id: str, variable: str):
+        """
+        Fetch MSWX files from Google Drive for a given variable.
+        """
+        start = datetime.fromisoformat(self.cfg.time_range.start_date)
+        end = datetime.fromisoformat(self.cfg.time_range.end_date)

         expected_files = []
         current = start
@@ -100,29 +46,25 @@ class MSWXmirror:
             expected_files.append(basename)
             current += timedelta(days=1)

-        output_dir = self.var_cfg.data_dir
-        provider = self.var_cfg.dataset.lower()
-        parameter_key = self.var_cfg.weather.parameter
-        local_files = []
-        missing_files = []
+        output_dir = self.cfg.data_dir
+        local_files, missing_files = [], []

         for basename in expected_files:
-            local_path = os.path.join(output_dir, provider, parameter_key, basename)
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, basename)
             if os.path.exists(local_path):
                 local_files.append(basename)
             else:
                 missing_files.append(basename)

         if not missing_files:
-            print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
-            self.files = local_files
+            print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
             return local_files

-        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")

         SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
         creds = service_account.Credentials.from_service_account_file(
-            param_mapping[provider].params.google_service_account, scopes=SCOPES
+            self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
         )
         service = build('drive', 'v3', credentials=creds)

@@ -131,86 +73,63 @@ class MSWXmirror:
         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]

         if not files_to_download:
-            print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
-            self.files = local_files
+            print(f"⚠️ No {variable} files found in Drive for requested dates.")
             return local_files

         for file in files_to_download:
             filename = file['name']
-            local_path = os.path.join(output_dir, provider, parameter_key, filename)
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
             print(f"⬇️ Downloading {filename} ...")
             download_drive_file(file['id'], local_path, service)
             local_files.append(filename)

-        self.files = local_files
         return local_files

-    def load(self):
-        param_mapping = self.var_cfg.mappings
-        provider = self.var_cfg.dataset.lower()
-        parameter_key = self.var_cfg.weather.parameter
-        region = self.var_cfg.region
-        bounds = self.var_cfg.bounds[region]
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-        output_dir = self.var_cfg.data_dir
-        valid_dsets = []
+    def load(self, variable: str):
+        """
+        Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+        """
+        folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+        files = self.fetch(folder_id, variable)
+        datasets = []

-        for f in self.files:
-            local_path = os.path.join(output_dir, provider, parameter_key, f)
+        for f in files:
+            local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
             try:
-                ds = xr.open_dataset(local_path, chunks='auto', engine='netcdf4')[param_info.name]
-                # Rename DataArray to parameter_key
-                ds = ds.rename(parameter_key)
-                valid_dsets.append(ds)
+                ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                ds = ds.rename(variable)
+                datasets.append(ds)
             except Exception as e:
-                print(f"Skipping file due to error: {f}\n{e}")
+                print(f"Skipping file {f} due to error: {e}")
+
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+
+        dset = xr.concat(datasets, dim="time")
+        dset = dset.transpose("time", "lat", "lon")
+        dset = self._fix_coords(dset)

-        dset = xr.concat(valid_dsets, dim='time')
-        dset = dset.transpose('time', 'lat', 'lon')
-        self.dataset = self._fix_coords(dset)
+        self.dataset = dset
         return self.dataset

-    def to_zarr(self, zarr_filename):
+    def to_zarr(self, zarr_filename: str):
         if self.dataset is None:
-            raise ValueError("No dataset loaded. Call `load()` before `to_zarr()`.")
-
-        var_name = self.var_cfg.weather.parameter
-        dataset_name = self.var_cfg.dataset
-        region = self.var_cfg.region
+            raise ValueError("No dataset loaded. Call `load()` first.")

-        # Add standard units metadata
+        var_name = self.dataset.name
         if var_name == 'pr':
             self.dataset.attrs['units'] = 'mm/day'
         elif var_name in ['tas', 'tasmax', 'tasmin']:
             self.dataset.attrs['units'] = 'degC'

-        zarr_path = os.path.join("data/MSWX/", zarr_filename)
+        zarr_path = os.path.join("data/MSWX", zarr_filename)
         os.makedirs(os.path.dirname(zarr_path), exist_ok=True)

         print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
         self.dataset.to_zarr(zarr_path, mode="w")

     def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
-        """
-        Extract a subset of the dataset by point, bounding box, or shapefile.
-
-        Parameters
-        ----------
-        point : tuple(float, float), optional
-            (lon, lat) coordinates for a single point.
-        box : tuple(float, float, float, float), optional
-            (min_lon, min_lat, max_lon, max_lat) bounding box.
-        shapefile : str or geopandas.GeoDataFrame, optional
-            Path to shapefile or a GeoDataFrame.
-        buffer_km : float, optional
-            Buffer distance in kilometers (for point or shapefile).
-
-        Returns
-        -------
-        xarray.Dataset or xarray.DataArray
-            Subset of the dataset.
-        """
         if self.dataset is None:
             raise ValueError("No dataset loaded. Call `load()` first.")

@@ -219,20 +138,18 @@ class MSWXmirror:
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
-                # buffer around point
-                buffer_deg = buffer_km / 111  # rough conversion km→degrees
+                buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
                     lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lat=slice(lat-buffer_deg, lat+buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")

         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box['lon_min'], box['lon_max']),
-                lat=slice(box['lat_min'], box['lat_max'])
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )

         elif shapefile is not None:
@@ -241,71 +158,71 @@ class MSWXmirror:
             else:
                 gdf = shapefile
             if buffer_km > 0:
-                gdf = gdf.to_crs(epsg=3857)  # project to meters
+                gdf = gdf.to_crs(epsg=3857)
                 gdf["geometry"] = gdf.buffer(buffer_km * 1000)
                 gdf = gdf.to_crs(epsg=4326)
-
             geom = [mapping(g) for g in gdf.geometry]
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)

         else:
             raise ValueError("Must provide either point, box, or shapefile.")
-        self.dataset = ds_subset
-        self.dataset = self.dataset.to_dataset()
+
+        self.dataset = ds_subset.to_dataset()
         return ds_subset
-
-    def to_dataframe(self, ds=None):
-        """
-        Convert extracted xarray dataset to a tidy dataframe.

-        Parameters
-        ----------
-        ds : xr.DataArray or xr.Dataset, optional
-            Dataset to convert. If None, use self.dataset.
+    # def to_dataframe(self, ds=None):
+    #     if ds is None:
+    #         if self.dataset is None:
+    #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+    #         ds = self.dataset
+
+    #     if isinstance(ds, xr.Dataset):
+    #         if len(ds.data_vars) != 1:
+    #             raise ValueError("Dataset has multiple variables. Please select one.")
+    #         ds = ds[list(ds.data_vars)[0]]
+
+    #     df = ds.to_dataframe().reset_index()
+    #     df = df[["time", "lat", "lon", ds.name]]
+    #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+    #     return df
+
+    def _format(self, df):
+        """Format dataframe for standardized output."""
+        value_vars = [v for v in self.variables if v in df.columns]
+        id_vars = [c for c in df.columns if c not in value_vars]
+
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )

-        Returns
-        -------
-        pd.DataFrame
-        """
-        if ds is None:
-            if self.dataset is None:
-                raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
-            ds = self.dataset
-
-        # If Dataset, pick first variable
-        if isinstance(ds, xr.Dataset):
-            if len(ds.data_vars) != 1:
-                raise ValueError("Dataset has multiple variables. Please select one.")
-            ds = ds[list(ds.data_vars)[0]]
-
-        df = ds.to_dataframe().reset_index()
-
-        # Keep only relevant cols
-        df = df[["time", "lat", "lon", ds.name]]
-
-        # Rename
-        df = df.rename(columns={
-            "lat": "latitude",
-            "lon": "longitude",
-            ds.name: "value"
-        })
-        return df
-    def save_netcdf(self, filename):
-        if self.dataset is not None:
-            if "time" in self.dataset.variables:
-                self.dataset["time"].encoding.clear()
-            self.dataset.to_netcdf(filename)
-            # print(f"Saved NetCDF to {filename}")
+        df_long["units"] = df_long["variable"].map(
+            lambda v: self.dataset[v].attrs.get("units", "unknown")
+            if v in self.dataset.data_vars
+            else "unknown"
+        )

-    def format(self, df):
-        """
-        Format dataframe into standard schema.
-        """
-        df = df.copy()
-        df["variable"] = self.var_cfg.weather.parameter
-        df["source"] = self.var_cfg.dataset.upper()
-        df["units"] = self.dataset.attrs.get("units", "unknown")
+        df_long["source"] = self.cfg.dataset

-        df = df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
-        return df
+        cols = [
+            "source",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
+        df_long = df_long[[c for c in cols if c in df_long.columns]]

+        return df_long
+
+    def save_csv(self, filename):
+        if self.dataset is not None:
+            df = self.dataset.to_dataframe().reset_index()
+            df = self._format(df)
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")