climdata-0.0.2-py2.py3-none-any.whl → climdata-0.0.3-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of climdata has been flagged as possibly problematic.

climdata/__init__.py CHANGED
@@ -2,7 +2,10 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.0.2"
+__version__ = "0.0.3"
 
 from .utils.utils_download import * # etc.
+from .utils.config import load_config
 from .datasets.DWD import DWDmirror as DWD
+from .datasets.MSWX import MSWXmirror as MSWX
+
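
The new top-level imports make load_config and the MSWX mirror reachable straight from the package namespace. A minimal sketch of how they might be combined — the overrides are illustrative, and it assumes a writable working directory (for the copied conf/) plus valid MSWX Drive credentials in the mappings:

    import climdata

    cfg = climdata.load_config(overrides=["dataset=mswx", "weather.parameter=tas"])
    mswx = climdata.MSWX(cfg)   # MSWXmirror re-exported as MSWX
    files = mswx.fetch()        # download any daily NetCDF files missing locally
    ds = mswx.load()            # concatenate them into a single xarray object
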
climdata/conf/config.yaml CHANGED
@@ -2,12 +2,26 @@
 defaults:
   - _self_
   - mappings/parameters
-
+  - mappings/variables
 dataset: dwd
 data_dir: /beegfs/muduchuru/data
 weather:
   parameter: tas # standardized variable name (e.g., tas, pr, rsds)
 
+region: europe
+
+bounds:
+  global:
+    lat_min: -90.0
+    lat_max: 90.0
+    lon_min: -180.0
+    lon_max: 180.0
+  europe:
+    lat_min: 34.0 # Southern Europe (e.g., southern Greece)
+    lat_max: 71.0 # Northern Europe (e.g., northern Norway)
+    lon_min: -25.0 # Western Europe (e.g., Azores)
+    lon_max: 45.0 # Eastern Europe (Ural Mountains, excludes most of Russia)
+
 location:
   lat: 52.5070
   lon: 14.1372
@@ -107,7 +107,7 @@ mswx:
   type: image
   subsetting: None
   params:
-    google_service_account: /beegfs/muduchuru/codes/python/download/conf/service.json
+    google_service_account: None
   variables:
     tasmin:
       name: air_temperature
climdata/conf/mappings/variables.yaml ADDED
@@ -0,0 +1,77 @@
+info:
+  tas:
+    cf_name: air_temperature
+    long_name: Near-surface air temperature
+    units: degC
+  tasmax:
+    cf_name: air_temperature
+    long_name: Daily maximum near-surface air temperature
+    units: degC
+  tasmin:
+    cf_name: air_temperature
+    long_name: Daily minimum near-surface air temperature
+    units: degC
+  pr:
+    cf_name: precipitation_flux
+    long_name: Precipitation
+    units: mm/day
+  pracc:
+    cf_name: precipitation_amount
+    long_name: Accumulated precipitation
+    units: mm
+  ps:
+    cf_name: surface_air_pressure
+    long_name: Surface air pressure
+    units: Pa
+  hurs:
+    cf_name: relative_humidity
+    long_name: Near-surface relative humidity
+    units: '%'
+  huss:
+    cf_name: specific_humidity
+    long_name: Near-surface specific humidity
+    units: 1 # kg/kg
+  uas:
+    cf_name: eastward_wind
+    long_name: Eastward near-surface wind
+    units: m s-1
+  vas:
+    cf_name: northward_wind
+    long_name: Northward near-surface wind
+    units: m s-1
+  sfcWind:
+    cf_name: wind_speed
+    long_name: Near-surface wind speed
+    units: m s-1
+  rsds:
+    cf_name: surface_downwelling_shortwave_flux_in_air
+    long_name: Surface downwelling shortwave radiation
+    units: W m-2
+  rlds:
+    cf_name: surface_downwelling_longwave_flux_in_air
+    long_name: Surface downwelling longwave radiation
+    units: W m-2
+  rlus:
+    cf_name: surface_upwelling_longwave_flux_in_air
+    long_name: Surface upwelling longwave radiation
+    units: W m-2
+  rlut:
+    cf_name: toa_outgoing_longwave_flux
+    long_name: Top-of-atmosphere outgoing longwave radiation
+    units: W m-2
+  psml:
+    cf_name: mean_sea_level_pressure
+    long_name: Mean sea level pressure
+    units: Pa
+  evspsbl:
+    cf_name: water_evapotranspiration_flux
+    long_name: Evaporation including sublimation and transpiration
+    units: mm/day
+  snd:
+    cf_name: surface_snow_thickness
+    long_name: Snow depth
+    units: m
+  snw:
+    cf_name: surface_snow_amount
+    long_name: Snow water equivalent
+    units: mm
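
The new mappings/variables.yaml centralises CF metadata for the standardized variable names. A speculative sketch of using it to decorate an xarray object — it assumes Hydra merges the file under cfg.mappings the same way it merges mappings/parameters.yaml, which is an assumption, not something the package documents:

    import xarray as xr
    from climdata import load_config

    cfg = load_config()
    meta = cfg.mappings.info["tas"]   # assumed location of the merged info block
    da = xr.DataArray([12.3, 13.1], dims="time", name="tas")
    da = da.assign_attrs(standard_name=meta.cf_name,
                         long_name=meta.long_name,
                         units=meta.units)
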
climdata/datasets/CMIP.py ADDED
@@ -0,0 +1,224 @@
+import os
+import glob
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from typing import Optional, Dict, Union
+from omegaconf import DictConfig
+import warnings
+from pathlib import Path
+from tqdm.notebook import tqdm
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
+from xclim.core import units
+warnings.filterwarnings("ignore", category=Warning)
+
+
+class CMIP:
+    def __init__(self, var_cfg: DictConfig, experiments):
+        self.var_cfg = var_cfg
+        self.files = []
+        self.dataset = None
+        self.experiments = experiments
+
+    def _subset_by_bounds(self, ds, bounds, lat_name='lat', lon_name='lon'):
+        return ds.sel(
+            **{
+                lat_name: slice(bounds['lat_min'], bounds['lat_max']),
+                lon_name: slice(bounds['lon_min'], bounds['lon_max'])
+            }
+        )
+
+    def _check_lat_lon(self, ds: xr.Dataset) -> xr.Dataset:
+        # Fix latitude ascending order
+        if "lat" in ds.coords:
+            lat = ds["lat"]
+            if lat.values[0] > lat.values[-1]:  # descending
+                ds = ds.sortby("lat")
+
+        # Fix longitude range to -180 to 180
+        if "lon" in ds.coords:
+            lon = ds["lon"]
+            lon_vals = lon.values
+            if lon_vals.max() > 180:
+                lon_fixed = ((lon_vals + 180) % 360) - 180
+                ds = ds.assign_coords(lon=lon_fixed)
+                ds = ds.sortby("lon")
+        return ds
+
+    def fetch(self, base_dir, tbl_id):
+        nc_files = [
+            f
+            for exp in self.experiments
+            for f in glob.glob(
+                os.path.join(base_dir, "*/*/*", exp, f"*/{tbl_id}/*/*/*/*.nc"),
+                recursive=True
+            )
+        ]
+        rows = []
+        for file_path in tqdm(nc_files, desc="Indexing CMIP6 files"):
+            parts = file_path.split(os.sep)
+            try:
+                activity_id = parts[6]
+                institution_id = parts[7]
+                source_id = parts[8]
+                experiment_id = parts[9]
+                member_id = parts[10]
+                table_id = parts[11]
+                variable_id = parts[12]
+                grid_label = parts[13]
+                version = parts[14]
+            except IndexError:
+                continue
+
+            # Extract start and end date from filename
+            fname = os.path.basename(file_path)
+            # Example: pr_day_MIROC6_ssp245-nat_r8i1p1f1_gn_20210101-20301231.nc
+            date_part = fname.split("_")[-1].replace(".nc", "")
+            start_str, end_str = date_part.split("-")
+
+            if tbl_id == 'Amon':
+                start_date = pd.to_datetime(start_str, format="%Y%m")
+                end_date = pd.to_datetime(end_str, format="%Y%m")
+            elif tbl_id == 'day':
+                start_date = pd.to_datetime(start_str, format="%Y%m%d")
+                end_date = pd.to_datetime(end_str, format="%Y%m%d")
+            rows.append({
+                "path": file_path,
+                "activity_id": activity_id,
+                "institution_id": institution_id,
+                "source_id": source_id,
+                "experiment_id": experiment_id,
+                "member_id": member_id,
+                "table_id": table_id,
+                "variable_id": variable_id,
+                "grid_label": grid_label,
+                "version": version,
+                "start_date": start_date,
+                "end_date": end_date
+            })
+
+        df = pd.DataFrame(rows)
+        # import ipdb; ipdb.set_trace()
+        # keep only experiments that match all requested
+        grouped = df.groupby(["institution_id", "source_id"])["experiment_id"].unique()
+        valid_pairs = grouped[grouped.apply(lambda exps: set(self.experiments).issubset(set(exps)))].index
+        df = df[df.set_index(["institution_id", "source_id"]).index.isin(valid_pairs)]
+
+        # keep only versions with "v"
+        df = df[df['version'].str.contains('v')]
+
+        # compute file-level duration
+        df["years"] = (df["end_date"] - df["start_date"]).dt.days / 365.25
+
+        # compute total duration per dataset
+        coverage = df.groupby(
+            ["institution_id", "source_id", "experiment_id", "member_id", "variable_id", "grid_label"]
+        ).agg(
+            total_years=("years", "sum"),
+            start=("start_date", "min"),
+            end=("end_date", "max"),
+            nfiles=("path", "count")
+        ).reset_index()
+
+        # keep only groups with ≥ 60 years
+        valid_groups = coverage[coverage["total_years"] >= 60]
+
+        # filter original dataframe
+        df_filtered = df.merge(
+            valid_groups,
+            on=["institution_id", "source_id", "experiment_id", "member_id", "variable_id", "grid_label"],
+            how="inner"
+        )
+
+        return df_filtered
+
+    def _process_var_model(self, var, model, df_filtered, subset_experiments):
+        ds_list = []
+        for exp in subset_experiments:
+            df_filtered_sub = df_filtered[
+                (df_filtered['variable_id'] == var) &
+                (df_filtered['source_id'] == model) &
+                (df_filtered['experiment_id'] == exp)
+            ]
+            members = df_filtered_sub['member_id'].unique()
+            for i, member in enumerate(members[:3]):
+                df_filt = df_filtered_sub[
+                    (df_filtered_sub['experiment_id'] == exp) &
+                    (df_filtered_sub['member_id'] == member)
+                ]
+                if df_filt.empty:
+                    continue
+
+                paths = df_filt['path'].values
+                ds = xr.open_mfdataset(paths, combine="by_coords", chunks={"time": 365})
+                if var == "pr":
+                    ds[var] = units.convert_units_to(ds[var], "mm d-1")
+                elif var in ["tas", "tasmax", "tasmin"]:
+                    ds[var] = units.convert_units_to(ds[var], "degC")
+                ds = self._check_lat_lon(ds)
+                ds_europe = self._subset_by_bounds(
+                    ds,
+                    self.var_cfg.bounds[self.var_cfg.region]
+                )
+                ds_list.append(ds_europe.expand_dims({
+                    "experiment": [exp],
+                    "member": [i]
+                }))
+
+        if ds_list:
+            ds_list = xr.align(*ds_list, join="inner", exclude=["experiment", "member"])
+            combined_ds = xr.combine_by_coords(ds_list, combine_attrs="override")
+            return (var, model, combined_ds)
+        else:
+            return (var, model, None)
+
+    def load(self, df_filtered, vars_of_interest, subset_experiments=["historical", "hist-aer", "hist-GHG"]):
+        data_dict = defaultdict(dict)
+        var_model_pairs = list(
+            df_filtered[df_filtered['variable_id'].isin(vars_of_interest)]
+            [['variable_id', 'source_id']]
+            .drop_duplicates()
+            .itertuples(index=False, name=None)
+        )
+
+        with ProcessPoolExecutor(max_workers=4) as executor:
+            futures = [
+                executor.submit(self._process_var_model, var, model, df_filtered, subset_experiments)
+                for var, model in var_model_pairs
+            ]
+            for f in futures:
+                var, model, ds = f.result()
+                if ds is not None:
+                    data_dict[model][var] = ds.chunk({'lat': 10, 'lon': 10, 'time': -1})[var]
+        self.dataset = data_dict
+        return data_dict
+
+    def to_zarr(self, dataset):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` before `to_zarr()`.")
+        for var_name in self.dataset.keys():
+            for mod_name in self.dataset[var_name].keys():
+                ds_model = self.dataset[var_name][mod_name]
+
+                dataset_name = mod_name
+                region = self.var_cfg.region
+
+                if var_name == 'pr':
+                    self.dataset.attrs['units'] = 'kg m-2 s-1'
+                elif var_name in ['tas', 'tasmax', 'tasmin']:
+                    self.dataset.attrs['units'] = 'degC'
+
+                zarr_filename = self.var_cfg.output.filename.format(
+                    index=var_name,
+                    dataset=dataset_name,
+                    region=region,
+                    start=self.var_cfg.time_range.start_date,
+                    end=self.var_cfg.time_range.end_date,
+                    freq='1D',
+                )
+                zarr_path = os.path.join(f"data/{mod_name}/", zarr_filename)
+                os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
+
+                print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
+                self.dataset.to_zarr(zarr_path, mode="w")
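
CMIP.fetch() indexes a local CMIP6 tree by splitting each path into fixed components (parts[6] through parts[14]), so base_dir has to sit at the right depth for those indices to line up; load() then opens, unit-converts and regionally subsets the matching files in parallel. A hypothetical driver — the base directory, table id and variable list below are placeholder values, not documented defaults:

    from climdata import load_config
    from climdata.datasets.CMIP import CMIP

    cfg = load_config(overrides=["region=europe"])
    cmip = CMIP(cfg, experiments=["historical", "hist-aer", "hist-GHG"])
    index = cmip.fetch("/path/to/CMIP6", tbl_id="day")        # pandas index of candidate files
    data = cmip.load(index, vars_of_interest=["tas", "pr"])   # {model: {variable: DataArray}}
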
climdata/datasets/MSWX.py CHANGED
@@ -1,69 +1,106 @@
+import pandas as pd
+import numpy as np
+from wetterdienst import Settings
+from wetterdienst.provider.dwd.observation import DwdObservationRequest
+import geemap
+import ee
+import ipdb
+import geopandas as gpd
+from omegaconf import DictConfig
+import os
+import yaml
+import time
+from tqdm import tqdm
+import warnings
+from datetime import datetime, timedelta
+import xarray as xr
+import hydra
+from omegaconf import DictConfig
+import pint
+import pint_pandas
+
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload
-import datetime
+
+import io
+import requests
+from scipy.spatial import cKDTree
+import argparse
+import re
+
+import requests
+from bs4 import BeautifulSoup
+import concurrent.futures
+
+import gzip
+# from utils.utils import *
+# from datasets.datasets import *
+import rioxarray
+from shapely.geometry import mapping
+
+warnings.filterwarnings("ignore", category=Warning)
+
+import cf_xarray
+
 class MSWXmirror:
-    def __init__(self,cfg):
-        self.cfg = cfg
-        self.provider = cfg.dataset.lower()
-        self.parameter_key = cfg.weather.parameter
-        self.lat = cfg.location.lat
-        self.lon = cfg.location.lon
-        self.start_date = datetime.fromisoformat(cfg.time_range.start_date)
-        self.end_date = datetime.fromisoformat(cfg.time_range.end_date)
-        self.output_dir = cfg.data_dir
-
-        provider_cfg = cfg.mappings[self.provider]
-        self.param_info = provider_cfg['variables'][self.parameter_key]
-        self.folder_id = self.param_info['folder_id']
-        self.units = self.param_info.get("units", "")
-        self.service = self._build_drive_service(provider_cfg.params.google_service_account)
-
-    def _list_drive_files(folder_id, service):
+    def __init__(self, var_cfg: DictConfig):
+        self.var_cfg = var_cfg
+        self.files = []
+        self.dataset = None
+
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray) -> xr.Dataset | xr.DataArray:
         """
-        List all files in a Google Drive folder, handling pagination.
+        Ensure latitude is ascending and longitude is in the range [0, 360].
+
+        Parameters
+        ----------
+        ds : xr.Dataset or xr.DataArray
+            Input dataset or dataarray with latitude and longitude coordinates.
+
+        Returns
+        -------
+        xr.Dataset or xr.DataArray
+            Dataset with latitude ascending and longitude wrapped to [0, 360].
         """
-        files = []
-        page_token = None
+        # Flip latitude to ascending
+        ds = ds.cf.sortby("latitude")
 
-        while True:
-            results = service.files().list(
-                q=f"'{folder_id}' in parents and trashed = false",
-                fields="files(id, name), nextPageToken",
-                pageToken=page_token
-            ).execute()
+        # Wrap longitude into [0, 360]
+        lon_name = ds.cf["longitude"].name
+        ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
 
-            files.extend(results.get("files", []))
-            page_token = results.get("nextPageToken", None)
+        # Sort by longitude
+        ds = ds.sortby(lon_name)
 
-            if not page_token:
-                break
+        return ds
 
-        return files
-    def _download_drive_file(file_id, local_path, service):
-        """
-        Download a single file from Drive to a local path.
-        """
-        request = service.files().get_media(fileId=file_id)
-        os.makedirs(os.path.dirname(local_path), exist_ok=True)
 
-        with io.FileIO(local_path, 'wb') as fh:
-            downloader = MediaIoBaseDownload(fh, request)
+    def fetch(self):
+        param_mapping = self.var_cfg.mappings
+        provider = self.var_cfg.dataset.lower()
+        parameter_key = self.var_cfg.weather.parameter
+
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        folder_id = param_info["folder_id"]
+
+        start_date = self.var_cfg.time_range.start_date
+        end_date = self.var_cfg.time_range.end_date
+
+        start = datetime.fromisoformat(start_date)
+        end = datetime.fromisoformat(end_date)
 
-        done = False
-        while not done:
-            status, done = downloader.next_chunk()
-            print(f" → Download {int(status.progress() * 100)}% complete")
-    def fetch():
         expected_files = []
-        current = self.start_date
-        while current <= self.end_date:
+        current = start
+        while current <= end:
             doy = current.timetuple().tm_yday
             basename = f"{current.year}{doy:03d}.nc"
             expected_files.append(basename)
             current += timedelta(days=1)
 
-        output_dir = var_cfg.data_dir
+        output_dir = self.var_cfg.data_dir
+        provider = self.var_cfg.dataset.lower()
+        parameter_key = self.var_cfg.weather.parameter
         local_files = []
         missing_files = []
 
@@ -76,28 +113,26 @@ class MSWXmirror:
 
         if not missing_files:
             print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
+            self.files = local_files
             return local_files
 
        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
 
-        # === 2) Connect to Drive ===
         SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
         creds = service_account.Credentials.from_service_account_file(
             param_mapping[provider].params.google_service_account, scopes=SCOPES
         )
         service = build('drive', 'v3', credentials=creds)
 
-        # === 3) List all Drive files ===
         drive_files = list_drive_files(folder_id, service)
         valid_filenames = set(missing_files)
-
         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
 
         if not files_to_download:
            print(f"⚠️ None of the missing files found in Drive. Check folder & date range.")
+            self.files = local_files
            return local_files
 
-        # === 4) Download missing ===
        for file in files_to_download:
            filename = file['name']
            local_path = os.path.join(output_dir, provider, parameter_key, filename)
@@ -105,91 +140,161 @@ class MSWXmirror:
            download_drive_file(file['id'], local_path, service)
            local_files.append(filename)
 
+        self.files = local_files
        return local_files
 
-    def extract_ts_MSWX(cfg: DictConfig):
-        parameter = cfg.weather.parameter
-        param_mapping = cfg.mappings
-        provider = cfg.dataset.lower()
-        parameter_key = cfg.weather.parameter
-        # Validate provider and parameter
-
-        param_info = param_mapping[provider]['variables'][parameter_key]
-
-        base_dir = cfg.data_dir
-
-        target_lat = cfg.location.lat
-        target_lon = cfg.location.lon
-
-        start_date = pd.to_datetime(cfg.time_range.start_date)
-        end_date = pd.to_datetime(cfg.time_range.end_date)
-
-        # === 1) Rebuild exact basenames ===
-        current = start_date
-        basenames = []
-        while current <= end_date:
-            doy = current.timetuple().tm_yday
-            basename = f"{current.year}{doy:03d}.nc"
-            basenames.append(basename)
-            current += timedelta(days=1)
-
-        # === 2) Process only those files ===
-        ts_list = []
-        missing = []
+    def load(self):
+        param_mapping = self.var_cfg.mappings
+        provider = self.var_cfg.dataset.lower()
+        parameter_key = self.var_cfg.weather.parameter
+        region = self.var_cfg.region
+        bounds = self.var_cfg.bounds[region]
+
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        output_dir = self.var_cfg.data_dir
+        valid_dsets = []
+
+        for f in self.files:
+            local_path = os.path.join(output_dir, provider, parameter_key, f)
+            try:
+                ds = xr.open_dataset(local_path, chunks='auto', engine='netcdf4')[param_info.name]
+                valid_dsets.append(ds)
+            except Exception as e:
+                print(f"Skipping file due to error: {f}\n{e}")
+
+        dset = xr.concat(valid_dsets, dim='time')
+        dset = dset.transpose('time', 'lat', 'lon')
+        self.dataset = self._fix_coords(dset)
+        return dset
+
+    def to_zarr(self, zarr_filename):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` before `to_zarr()`.")
+
+        var_name = self.var_cfg.weather.parameter
+        dataset_name = self.var_cfg.dataset
+        region = self.var_cfg.region
+
+        # Add standard units metadata
+        if var_name == 'pr':
+            self.dataset.attrs['units'] = 'mm/day'
+        elif var_name in ['tas', 'tasmax', 'tasmin']:
+            self.dataset.attrs['units'] = 'degC'
+
+        zarr_path = os.path.join("data/MSWX/", zarr_filename)
+        os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
+
+        print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
+        self.dataset.to_zarr(zarr_path, mode="w")
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract a subset of the dataset by point, bounding box, or shapefile.
+
+        Parameters
+        ----------
+        point : tuple(float, float), optional
+            (lon, lat) coordinates for a single point.
+        box : tuple(float, float, float, float), optional
+            (min_lon, min_lat, max_lon, max_lat) bounding box.
+        shapefile : str or geopandas.GeoDataFrame, optional
+            Path to shapefile or a GeoDataFrame.
+        buffer_km : float, optional
+            Buffer distance in kilometers (for point or shapefile).
+
+        Returns
+        -------
+        xarray.Dataset or xarray.DataArray
+            Subset of the dataset.
+        """
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        ds = self.dataset.rio.write_crs("EPSG:4326", inplace=False)
+
+        if point is not None:
+            lon, lat = point
+            if buffer_km > 0:
+                # buffer around point
+                buffer_deg = buffer_km / 111 # rough conversion km→degrees
+                ds_subset = ds.sel(
+                    lon=slice(lon-buffer_deg, lon+buffer_deg),
+                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                )
+            else:
+                ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
+
+        elif box is not None:
+            min_lon, min_lat, max_lon, max_lat = box
+            ds_subset = ds.sel(
+                lon=slice(min_lon, max_lon),
+                lat=slice(min_lat, max_lat)
+            )
+
+        elif shapefile is not None:
+            if isinstance(shapefile, str):
+                gdf = gpd.read_file(shapefile)
+            else:
+                gdf = shapefile
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857) # project to meters
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)
+                gdf = gdf.to_crs(epsg=4326)
 
-        for basename in basenames:
-            file_path = os.path.join(base_dir, provider, parameter, basename)
+            geom = [mapping(g) for g in gdf.geometry]
+            ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
-            if not os.path.exists(file_path):
-                missing.append(basename)
-                continue
+        else:
+            raise ValueError("Must provide either point, box, or shapefile.")
 
-            print(f"📂 Opening: {file_path}")
-            ds = xr.open_dataset(file_path)
+        return ds_subset
+
+    def to_dataframe(self, ds=None):
+        """
+        Convert extracted xarray dataset to a tidy dataframe.
 
-            time_name = [x for x in ds.coords if "time" in x.lower()][0]
-            data_var = [v for v in ds.data_vars][0]
+        Parameters
+        ----------
+        ds : xr.DataArray or xr.Dataset, optional
+            Dataset to convert. If None, use self.dataset.
 
-            ts = ds[data_var].sel(
-                lat=target_lat,
-                lon=target_lon,
-                method='nearest'
-            )
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if ds is None:
+            if self.dataset is None:
+                raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+            ds = self.dataset
+
+        # If Dataset, pick first variable
+        if isinstance(ds, xr.Dataset):
+            if len(ds.data_vars) != 1:
+                raise ValueError("Dataset has multiple variables. Please select one.")
+            ds = ds[list(ds.data_vars)[0]]
+
+        df = ds.to_dataframe().reset_index()
+
+        # Keep only relevant cols
+        df = df[["time", "lat", "lon", ds.name]]
+
+        # Rename
+        df = df.rename(columns={
+            "lat": "latitude",
+            "lon": "longitude",
+            ds.name: "value"
+        })
+        return df
+
+    def format(self, df):
+        """
+        Format dataframe into standard schema.
+        """
+        df = df.copy()
+        df["variable"] = self.var_cfg.weather.parameter
+        df["source"] = self.var_cfg.dataset.upper()
+        df["units"] = self.dataset.attrs.get("units", "unknown")
 
-            df = ts.to_dataframe().reset_index()[[time_name, data_var]]
-            ts_list.append(df)
-
-        if missing:
-            print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
-            for m in missing:
-                print(f" - {m}")
-
-        if not ts_list:
-            raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
-
-        # === 3) Combine and slice (for safety) ===
-        ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
-
-        ts_all[time_name] = pd.to_datetime(ts_all[time_name])
-        ts_all = ts_all[
-            (ts_all[time_name] >= start_date) &
-            (ts_all[time_name] <= end_date)
-        ].reset_index(drop=True)
-
-        out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = os.path.join(out_dir, cfg.output.filename)
-
-        ts_all["variable"] = param_info['name']
-        ts_all["latitude"] = target_lat
-        ts_all["longitude"] = target_lon
-        ts_all['source'] = provider.upper()
-        ts_all['units'] = ts.attrs['units']
-        ts_all.rename(columns={param_info['name']: 'value'}, inplace=True)
-        ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value",'units']]
+        df = df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+        return df
 
-        ts_all.to_csv(out_path, index=False)
-        print(f"✅ Saved MSWX time series to: {out_path}")
-
-        return ts_all
-
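
The old module-level extract_ts_MSWX() flow is now split across fetch/load/extract/to_dataframe/format on the mirror object. A rough end-to-end sketch under the same assumptions as above (MSWX Drive credentials configured in the mappings, files reachable on Drive); the coordinates and buffer below are illustrative values:

    from climdata import MSWX, load_config

    cfg = load_config(overrides=["dataset=mswx", "weather.parameter=pr"])
    mirror = MSWX(cfg)
    mirror.fetch()                                       # ensure the daily files are on disk
    mirror.load()                                        # concatenate them into one DataArray
    subset = mirror.extract(point=(14.1372, 52.5070), buffer_km=25)
    table = mirror.format(mirror.to_dataframe(subset))   # tidy lat/lon/time/value rows
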
climdata/utils/config.py CHANGED
@@ -1,30 +1,30 @@
+import os
+import shutil
+from pathlib import Path
 from hydra import initialize, compose
 from omegaconf import OmegaConf
-from typing import Optional, List
-import os
+import importlib.resources as resources
 
-def load_config(
-    config_path: str = "../conf",
-    config_name: str = "config",
-    overrides: Optional[List[str]] = None,
-    verbose: bool = False
-):
+def _ensure_local_conf(package="climdata", local_dir="conf"):
     """
-    Load a Hydra config file.
-
-    Args:
-        config_path (str): Path to the config directory.
-        config_name (str): Name of the config YAML file (without `.yaml`).
-        overrides (List[str], optional): List of override strings.
-        verbose (bool): Whether to print the loaded config.
-
-    Returns:
-        OmegaConf.DictConfig: The loaded config object.
+    Copy package conf/ to cwd if not exists.
+    Returns the relative path "conf" for Hydra.
     """
-    # config_path = os.path.abspath(config_path)
+    local_dir_path = Path(os.getcwd()) / local_dir
+    if not local_dir_path.exists():
+        # Get conf inside the installed package
+        conf_src = resources.files(package).joinpath("conf")
+        shutil.copytree(conf_src, local_dir_path)
+    return local_dir_path.name # relative for Hydra
 
+def load_config(config_name="config", overrides=None, verbose=False):
+    """
+    Load Hydra config using ./conf in cwd.
+    """
+    config_path = _ensure_local_conf()
+    # import ipdb; ipdb.set_trace()
     with initialize(config_path=config_path, version_base=None):
         cfg = compose(config_name=config_name, overrides=overrides or [])
         if verbose:
             print(OmegaConf.to_yaml(cfg))
-    return cfg
+    return cfg
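
load_config() no longer takes a config_path: _ensure_local_conf() copies the packaged conf/ directory into the current working directory on first use and hands Hydra the relative path. A small sketch; the override strings are illustrative and must correspond to keys present in conf/config.yaml:

    from climdata.utils.config import load_config

    cfg = load_config(overrides=["region=europe", "weather.parameter=pr"], verbose=True)
    print(cfg.bounds[cfg.region])   # the europe lat/lon box added to config.yaml above
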
climdata/utils/utils_download.py CHANGED
@@ -15,7 +15,7 @@ import warnings
 from datetime import datetime, timedelta
 import xarray as xr
 import hydra
-from omegaconf import DictConfig
+
 import pint
 import pint_pandas
 
climdata-0.0.2.dist-info/METADATA → climdata-0.0.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.0.2
+Version: 0.0.3
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
climdata-0.0.3.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+climdata/__init__.py,sha256=ZsUBlrGiniOmJLZh9uvfwSgEn78jKB_saGDg3Kp2bG4,310
+climdata/__main__.py,sha256=Gn-CeD1_A0xSU8lvpuDJeniNtVgkwDpcRhxBOlrfV_w,82
+climdata/main.py,sha256=4_tm82v6tEa1fH1IeL2IslyjiB9NWOk46A-_QcsHg64,1586
+climdata/requirements.txt,sha256=olJJbNA402X16qOlUKY43ntycujwCE2D0jvSrHnznZw,349
+climdata/conf/config.yaml,sha256=e7v1rmSYocAww4THKd7Hzg9qichGzhhOUEAKVkkLM1U,873
+climdata/conf/mappings/parameters.yaml,sha256=HzTjJnEWEYI4PiW29cGp0UbRBDc1f_NhgJdlYT6MXNY,4100
+climdata/conf/mappings/variables.yaml,sha256=mO13rtF0XOzP4fjU5oyao44GmRM0Jrr1RjUBBwg04IU,2030
+climdata/datasets/CMIP.py,sha256=XQtHpFhZtVL0ewrbWhREAEY6XcWa4uu4u3yIHG5_lJE,8809
+climdata/datasets/DWD.py,sha256=DwqBBkRLE_FXyjZX38iVv3cduiO6uQc8MQaTvHQqxjA,2850
+climdata/datasets/MSWX.py,sha256=UpWDNSXAISGVPv9oF4VqWhLmboBN_dZMYt8CFk3PoUY,9912
+climdata/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+climdata/utils/config.py,sha256=UhbCH1PEVkEIdLVgmEEvb39Tq5XX4fsNe5gN4B5WL9Y,1054
+climdata/utils/utils_download.py,sha256=sXO4SOEfTdOCEOiGVNbxcZqGyt5LYUHVhMcSuBvRI5E,33368
+climdata-0.0.3.dist-info/licenses/LICENSE,sha256=f_3cGg8TC4V7GSbVaJo9b_hy-iY3q_ZpWq1MN2VQcnw,1076
+climdata-0.0.3.dist-info/METADATA,sha256=IIlfikeiCSb2u_ThaUsTzMkwbUAaF4D-Zph1UCOpAvk,8163
+climdata-0.0.3.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+climdata-0.0.3.dist-info/entry_points.txt,sha256=tiYPawJoZiBj7lU67vNTCb3esSvx_d-lBvOffA26ouU,47
+climdata-0.0.3.dist-info/top_level.txt,sha256=BPnAhRqg8vk580nSJDXTdLmfq6OZ_LR8eNTgrRabArw,9
+climdata-0.0.3.dist-info/RECORD,,
climdata-0.0.2.dist-info/RECORD REMOVED
@@ -1,17 +0,0 @@
-climdata/__init__.py,sha256=B1_5R81XGP5TfKeZI3HkC30QQyKPFVVNBU6baa8g2A0,225
-climdata/__main__.py,sha256=Gn-CeD1_A0xSU8lvpuDJeniNtVgkwDpcRhxBOlrfV_w,82
-climdata/main.py,sha256=4_tm82v6tEa1fH1IeL2IslyjiB9NWOk46A-_QcsHg64,1586
-climdata/requirements.txt,sha256=olJJbNA402X16qOlUKY43ntycujwCE2D0jvSrHnznZw,349
-climdata/conf/config.yaml,sha256=LL5f9CVqkltNjK3DpRcpmtCdKb9v0JdSslIvZ57KbLM,468
-climdata/conf/mappings/parameters.yaml,sha256=3E_rB7YuGyW_7wDJNOsHx4RSbOizTzotvJes5AwAzBE,4153
-climdata/datasets/DWD.py,sha256=DwqBBkRLE_FXyjZX38iVv3cduiO6uQc8MQaTvHQqxjA,2850
-climdata/datasets/MSWX.py,sha256=FA9t1AjcH7D88d-wfsU0g_ZHb-bui_wli8rhSezuTnU,6882
-climdata/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-climdata/utils/config.py,sha256=eARrf5fTCeEJbpCxxph4yL2m_ZzKXqcShu8T-1s3twA,927
-climdata/utils/utils_download.py,sha256=1yfxP-X6CL4oI0CwgdVcTBqOVd8UmGmgoHSENGpjZgI,33400
-climdata-0.0.2.dist-info/licenses/LICENSE,sha256=f_3cGg8TC4V7GSbVaJo9b_hy-iY3q_ZpWq1MN2VQcnw,1076
-climdata-0.0.2.dist-info/METADATA,sha256=en2YwpmDhoMyxyb_lTcQsp3VOvpnW0vs-QcrusEdYz4,8163
-climdata-0.0.2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
-climdata-0.0.2.dist-info/entry_points.txt,sha256=tiYPawJoZiBj7lU67vNTCb3esSvx_d-lBvOffA26ouU,47
-climdata-0.0.2.dist-info/top_level.txt,sha256=BPnAhRqg8vk580nSJDXTdLmfq6OZ_LR8eNTgrRabArw,9
-climdata-0.0.2.dist-info/RECORD,,