climdata 0.1.1__py2.py3-none-any.whl → 0.1.3__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

climdata/datasets/MSWX.py CHANGED
@@ -1,96 +1,42 @@
  import pandas as pd
- import numpy as np
- from wetterdienst import Settings
- from wetterdienst.provider.dwd.observation import DwdObservationRequest
- import geemap
- import ee
- import ipdb
  import geopandas as gpd
- from omegaconf import DictConfig
  import os
- import yaml
- import time
  from tqdm import tqdm
  import warnings
  from datetime import datetime, timedelta
  import xarray as xr
- import hydra
  from omegaconf import DictConfig
- import pint
- import pint_pandas
 
  from google.oauth2 import service_account
  from googleapiclient.discovery import build
- from googleapiclient.http import MediaIoBaseDownload
 
  from climdata.utils.utils_download import list_drive_files, download_drive_file
-
- import io
- import requests
- from scipy.spatial import cKDTree
- import argparse
- import re
-
- import requests
- from bs4 import BeautifulSoup
- import concurrent.futures
-
- import gzip
- # from utils.utils import *
- # from datasets.datasets import *
- import rioxarray
  from shapely.geometry import mapping
+ import cf_xarray
 
  warnings.filterwarnings("ignore", category=Warning)
 
- import cf_xarray
 
  class MSWXmirror:
-     def __init__(self, var_cfg: DictConfig):
-         self.var_cfg = var_cfg
-         self.files = []
+     def __init__(self, cfg: DictConfig):
+         self.cfg = cfg
          self.dataset = None
+         self.variables = cfg.variables
+         self.files = []
 
-     def _fix_coords(self, ds: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray:
-         """
-         Ensure latitude is ascending and longitude is in the range [0, 360].
-
-         Parameters
-         ----------
-         ds : xr.Dataset or xr.DataArray
-             Input dataset or dataarray with latitude and longitude coordinates.
-
-         Returns
-         -------
-         xr.Dataset or xr.DataArray
-             Dataset with latitude ascending and longitude wrapped to [0, 360].
-         """
-         # Flip latitude to ascending
+     def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+         """Ensure latitude is ascending and longitude is in the range [0, 360]."""
          ds = ds.cf.sortby("latitude")
-
-         # Wrap longitude into [0, 360]
         lon_name = ds.cf["longitude"].name
          ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+         return ds.sortby(lon_name)
 
-         # Sort by longitude
-         ds = ds.sortby(lon_name)
-
-         return ds
-
-
-     def fetch(self):
-         param_mapping = self.var_cfg.mappings
-         provider = self.var_cfg.dataset.lower()
-         parameter_key = self.var_cfg.weather.parameter
-
-         param_info = param_mapping[provider]['variables'][parameter_key]
-         folder_id = param_info["folder_id"]
-
-         start_date = self.var_cfg.time_range.start_date
-         end_date = self.var_cfg.time_range.end_date
-
-         start = datetime.fromisoformat(start_date)
-         end = datetime.fromisoformat(end_date)
+     def fetch(self, folder_id: str, variable: str):
+         """
+         Fetch MSWX files from Google Drive for a given variable.
+         """
+         start = datetime.fromisoformat(self.cfg.time_range.start_date)
+         end = datetime.fromisoformat(self.cfg.time_range.end_date)
 
          expected_files = []
          current = start
@@ -100,29 +46,25 @@ class MSWXmirror:
              expected_files.append(basename)
              current += timedelta(days=1)
 
-         output_dir = self.var_cfg.data_dir
-         provider = self.var_cfg.dataset.lower()
-         parameter_key = self.var_cfg.weather.parameter
-         local_files = []
-         missing_files = []
+         output_dir = self.cfg.data_dir
+         local_files, missing_files = [], []
 
          for basename in expected_files:
-             local_path = os.path.join(output_dir, provider, parameter_key, basename)
+             local_path = os.path.join(output_dir,self.cfg.dataset, variable, basename)
              if os.path.exists(local_path):
                  local_files.append(basename)
             else:
                  missing_files.append(basename)
 
          if not missing_files:
-             print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
-             self.files = local_files
+             print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
              return local_files
 
-         print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+         print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")
 
          SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
          creds = service_account.Credentials.from_service_account_file(
-             param_mapping[provider].params.google_service_account, scopes=SCOPES
+             self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
          )
          service = build('drive', 'v3', credentials=creds)
 
@@ -131,84 +73,63 @@ class MSWXmirror:
          files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
 
          if not files_to_download:
-             print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
-             self.files = local_files
+             print(f"⚠️ No {variable} files found in Drive for requested dates.")
              return local_files
 
          for file in files_to_download:
              filename = file['name']
-             local_path = os.path.join(output_dir, provider, parameter_key, filename)
+             local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+             os.makedirs(os.path.dirname(local_path), exist_ok=True)
              print(f"⬇️ Downloading {filename} ...")
              download_drive_file(file['id'], local_path, service)
              local_files.append(filename)
 
-         self.files = local_files
          return local_files
 
-     def load(self):
-         param_mapping = self.var_cfg.mappings
-         provider = self.var_cfg.dataset.lower()
-         parameter_key = self.var_cfg.weather.parameter
-         region = self.var_cfg.region
-         bounds = self.var_cfg.bounds[region]
-
-         param_info = param_mapping[provider]['variables'][parameter_key]
-         output_dir = self.var_cfg.data_dir
-         valid_dsets = []
+     def load(self, variable: str):
+         """
+         Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+         """
+         folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+         files = self.fetch(folder_id, variable)
+         datasets = []
 
-         for f in self.files:
-             local_path = os.path.join(output_dir, provider, parameter_key, f)
+         for f in files:
+             local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
              try:
-                 ds = xr.open_dataset(local_path, chunks='auto', engine='netcdf4')[param_info.name]
-                 valid_dsets.append(ds)
+                 ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                 ds = ds.rename(variable)
+                 datasets.append(ds)
              except Exception as e:
-                 print(f"Skipping file due to error: {f}\n{e}")
+                 print(f"Skipping file {f} due to error: {e}")
 
-         dset = xr.concat(valid_dsets, dim='time')
-         dset = dset.transpose('time', 'lat', 'lon')
-         self.dataset = self._fix_coords(dset)
-         return dset
+         if not datasets:
+             raise RuntimeError(f"No datasets could be loaded for {variable}.")
 
-     def to_zarr(self, zarr_filename):
-         if self.dataset is None:
-             raise ValueError("No dataset loaded. Call `load()` before `to_zarr()`.")
+         dset = xr.concat(datasets, dim="time")
+         dset = dset.transpose("time", "lat", "lon")
+         dset = self._fix_coords(dset)
 
-         var_name = self.var_cfg.weather.parameter
-         dataset_name = self.var_cfg.dataset
-         region = self.var_cfg.region
+         self.dataset = dset
+         return self.dataset
 
-         # Add standard units metadata
+     def to_zarr(self, zarr_filename: str):
+         if self.dataset is None:
+             raise ValueError("No dataset loaded. Call `load()` first.")
+
+         var_name = self.dataset.name
          if var_name == 'pr':
              self.dataset.attrs['units'] = 'mm/day'
          elif var_name in ['tas', 'tasmax', 'tasmin']:
              self.dataset.attrs['units'] = 'degC'
 
-         zarr_path = os.path.join("data/MSWX/", zarr_filename)
+         zarr_path = os.path.join("data/MSWX", zarr_filename)
          os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
 
          print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
          self.dataset.to_zarr(zarr_path, mode="w")
 
      def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
-         """
-         Extract a subset of the dataset by point, bounding box, or shapefile.
-
-         Parameters
-         ----------
-         point : tuple(float, float), optional
-             (lon, lat) coordinates for a single point.
-         box : tuple(float, float, float, float), optional
-             (min_lon, min_lat, max_lon, max_lat) bounding box.
-         shapefile : str or geopandas.GeoDataFrame, optional
-             Path to shapefile or a GeoDataFrame.
-         buffer_km : float, optional
-             Buffer distance in kilometers (for point or shapefile).
-
-         Returns
-         -------
-         xarray.Dataset or xarray.DataArray
-             Subset of the dataset.
-         """
          if self.dataset is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
 
@@ -217,20 +138,18 @@ class MSWXmirror:
          if point is not None:
              lon, lat = point
              if buffer_km > 0:
-                 # buffer around point
-                 buffer_deg = buffer_km / 111 # rough conversion km→degrees
+                 buffer_deg = buffer_km / 111
                  ds_subset = ds.sel(
                      lon=slice(lon-buffer_deg, lon+buffer_deg),
-                     lat=slice(lat-buffer_deg, lat+buffer_deg)
+                     lat=slice(lat-buffer_deg, lat+buffer_deg),
                  )
              else:
                  ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
          elif box is not None:
-             min_lon, min_lat, max_lon, max_lat = box
              ds_subset = ds.sel(
-                 lon=slice(min_lon, max_lon),
-                 lat=slice(min_lat, max_lat)
+                 lon=slice(box["lon_min"], box["lon_max"]),
+                 lat=slice(box["lat_min"], box["lat_max"]),
              )
 
          elif shapefile is not None:
@@ -239,64 +158,71 @@ class MSWXmirror:
              else:
                  gdf = shapefile
              if buffer_km > 0:
-                 gdf = gdf.to_crs(epsg=3857) # project to meters
+                 gdf = gdf.to_crs(epsg=3857)
                  gdf["geometry"] = gdf.buffer(buffer_km * 1000)
                  gdf = gdf.to_crs(epsg=4326)
-
              geom = [mapping(g) for g in gdf.geometry]
              ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
          else:
              raise ValueError("Must provide either point, box, or shapefile.")
 
+         self.dataset = ds_subset.to_dataset()
          return ds_subset
-
-     def to_dataframe(self, ds=None):
-         """
-         Convert extracted xarray dataset to a tidy dataframe.
 
-         Parameters
-         ----------
-         ds : xr.DataArray or xr.Dataset, optional
-             Dataset to convert. If None, use self.dataset.
+     # def to_dataframe(self, ds=None):
+     #     if ds is None:
+     #         if self.dataset is None:
+     #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+     #         ds = self.dataset
+
+     #     if isinstance(ds, xr.Dataset):
+     #         if len(ds.data_vars) != 1:
+     #             raise ValueError("Dataset has multiple variables. Please select one.")
+     #         ds = ds[list(ds.data_vars)[0]]
+
+     #     df = ds.to_dataframe().reset_index()
+     #     df = df[["time", "lat", "lon", ds.name]]
+     #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+     #     return df
+
+     def _format(self, df):
+         """Format dataframe for standardized output."""
+         value_vars = [v for v in self.variables if v in df.columns]
+         id_vars = [c for c in df.columns if c not in value_vars]
+
+         df_long = df.melt(
+             id_vars=id_vars,
+             value_vars=value_vars,
+             var_name="variable",
+             value_name="value",
+         )
 
-         Returns
-         -------
-         pd.DataFrame
-         """
-         if ds is None:
-             if self.dataset is None:
-                 raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
-             ds = self.dataset
-
-         # If Dataset, pick first variable
-         if isinstance(ds, xr.Dataset):
-             if len(ds.data_vars) != 1:
-                 raise ValueError("Dataset has multiple variables. Please select one.")
-             ds = ds[list(ds.data_vars)[0]]
-
-         df = ds.to_dataframe().reset_index()
-
-         # Keep only relevant cols
-         df = df[["time", "lat", "lon", ds.name]]
-
-         # Rename
-         df = df.rename(columns={
-             "lat": "latitude",
-             "lon": "longitude",
-             ds.name: "value"
-         })
-         return df
-
-     def format(self, df):
-         """
-         Format dataframe into standard schema.
-         """
-         df = df.copy()
-         df["variable"] = self.var_cfg.weather.parameter
-         df["source"] = self.var_cfg.dataset.upper()
-         df["units"] = self.dataset.attrs.get("units", "unknown")
-
-         df = df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
-         return df
-
+         df_long["units"] = df_long["variable"].map(
+             lambda v: self.dataset[v].attrs.get("units", "unknown")
+             if v in self.dataset.data_vars
+             else "unknown"
+         )
+
+         df_long["source"] = self.cfg.dataset
+
+         cols = [
+             "source",
+             "table",
+             "time",
+             "lat",
+             "lon",
+             "variable",
+             "value",
+             "units",
+         ]
+         df_long = df_long[[c for c in cols if c in df_long.columns]]
+
+         return df_long
+
+     def save_csv(self, filename):
+         if self.dataset is not None:
+             df = self.dataset.to_dataframe().reset_index()
+             df = self._format(df)
+             df.to_csv(filename, index=False)
+             print(f"Saved CSV to {filename}")