climdata 0.1.1__py2.py3-none-any.whl → 0.1.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of climdata might be problematic.

climdata/__init__.py CHANGED
@@ -2,7 +2,7 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from .utils.utils_download import * # etc.
 from .utils.config import load_config
@@ -11,4 +11,5 @@ from .datasets.MSWX import MSWXmirror as MSWX
 from .datasets.ERA5 import ERA5Mirror as ERA5
 from .datasets.CMIPlocal import CMIPmirror as CMIPlocal
 from .datasets.CMIPCloud import CMIPCloud as CMIP
+from .datasets.HYRAS import HYRASmirror as HYRAS
 
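With 0.1.3 the new HYRAS mirror is re-exported at package level next to the existing dataset classes, so (a minimal sketch, assuming the wheel is installed) it can be imported directly:

    import climdata
    from climdata import MSWX, ERA5, CMIPlocal, CMIP, HYRAS
    print(climdata.__version__)  # "0.1.3"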
climdata/conf/config.yaml CHANGED
@@ -1,31 +1,27 @@
-
 defaults:
   - _self_
   - mappings/parameters
   - mappings/variables
-dataset: dwd
+
+dataset: MSWX
+lat: null
+lon: null
+
+variables: ["tasmin","tasmax","pr"]
+
 data_dir: ./data
-weather:
-  parameter: tas # standardized variable name (e.g., tas, pr, rsds)
+region: None
 
-region: europe
+experiment_id: historical
+source_id: MIROC6
+table_id: day
 
 bounds:
-  global:
-    lat_min: -90.0
-    lat_max: 90.0
-    lon_min: -180.0
-    lon_max: 180.0
   europe:
-    lat_min: 34.0 # Southern Europe (e.g., southern Greece)
-    lat_max: 71.0 # Northern Europe (e.g., northern Norway)
-    lon_min: -25.0 # Western Europe (e.g., Azores)
-    lon_max: 45.0 # Eastern Europe (Ural Mountains, excludes most of Russia)
-
-  location:
-    lat: 52.5070
-    lon: 14.1372
-    buffer_km: 25
+    lat_min: 34.0
+    lat_max: 71.0
+    lon_min: -25.0
+    lon_max: 45.0
 
 time_range:
   start_date: "1989-01-01"
@@ -35,4 +31,5 @@ output:
   out_dir: "./climdata/data/"
   filename_csv: "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
   filename_zarr: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.zarr"
-  fmt: 'standard' # 'standard', 'ICASA', 'simplace', 'monica'
+  filename_nc: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.nc"
+  fmt: "standard"
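The configuration is flattened: dataset, lat, lon, variables and the CMIP selectors (experiment_id, source_id, table_id) now sit at the top level instead of under weather:/location:, and a NetCDF filename template is added. A minimal sketch of composing this config with Hydra's Python API; the config_path and the overrides are illustrative assumptions, only the key names come from the file above:

    from hydra import compose, initialize

    with initialize(config_path="climdata/conf", version_base=None):
        cfg = compose(
            config_name="config",
            overrides=["dataset=hyras", "lat=52.52", "lon=13.41"],  # hypothetical values
        )
    print(cfg.dataset, list(cfg.variables), cfg.time_range.start_date)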
@@ -21,7 +21,7 @@ dwd:
       resolution: daily
       dataset: climate_summary
       name: precipitation_height
-      unit: mm
+      unit: mm d-1
     rsds:
       resolution: daily
       dataset: solar
@@ -124,7 +124,7 @@ mswx:
     rsds:
       name: downward_shortwave_radiation
       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
-dwd_hyras:
+hyras:
   variables:
     tasmin:
       name: tasmin
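The provider key dwd_hyras is renamed to hyras (and the DWD precipitation unit becomes mm d-1), which is the key the new HYRAS mirror looks up when resolving variable metadata (see HYRASmirror.fetch further down). A hedged illustration of that lookup; the prefix and version fields are assumptions inferred from how fetch() uses them and are not shown in this hunk:

    provider = cfg.dataset.lower()                 # "hyras" after the rename
    param_info = cfg.mappings[provider]["variables"]["tasmin"]
    prefix, version = param_info["prefix"], param_info["version"]  # assumed fields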
@@ -1,20 +1,32 @@
 import intake
 import xarray as xr
 import pandas as pd
+from omegaconf import DictConfig
+import intake
+import xarray as xr
+import pandas as pd
+from omegaconf import DictConfig
+
 
 class CMIPCloud:
-    def __init__(self, experiment_id, source_id, table_id, variables, region_bounds=None):
-        self.experiment_id = experiment_id
-        self.source_id = source_id
-        self.table_id = table_id
-        self.variables = variables
-        self.region_bounds = region_bounds
+    def __init__(self, cfg: DictConfig):
+        # Directly read from flat config
+        self.experiment_id = cfg.experiment_id
+        self.source_id = cfg.source_id
+        self.table_id = cfg.table_id
+        self.variables = cfg.variables
+        self.start_date = cfg.time_range.start_date
+        self.end_date = cfg.time_range.end_date
+
         self.col_subsets = []
         self.ds = None
+        self.col = None
 
     def fetch(self):
         """Collect intake catalog subsets for each variable."""
-        col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
+        col = intake.open_esm_datastore(
+            "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
+        )
         self.col_subsets = []
         for var in self.variables:
             query = dict(
@@ -27,13 +39,16 @@ class CMIPCloud:
             if len(col_subset.df) == 0:
                 continue
             self.col_subsets.append(col_subset)
+        self.col = col
         return self.col_subsets
 
     def load(self):
         """Load and merge datasets from collected col_subsets."""
         datasets = []
         for col_subset in self.col_subsets:
-            zstore_path = col_subset.df.zstore.values[0].replace('gs:/', "https://storage.googleapis.com")
+            zstore_path = col_subset.df.zstore.values[0].replace(
+                "gs:/", "https://storage.googleapis.com"
+            )
             ds_var = xr.open_zarr(zstore_path)
             datasets.append(ds_var)
         if datasets:
@@ -51,25 +66,25 @@
 
         if self.ds is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
-
+
+        self._subset_time(self.start_date, self.end_date)
+
         ds = self.ds
-
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
                 buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
-                    lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lon=slice(lon - buffer_deg, lon + buffer_deg),
+                    lat=slice(lat - buffer_deg, lat + buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box['lon_min'], box['lon_max']),
-                lat=slice(box['lat_min'], box['lat_max'])
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )
 
         elif shapefile is not None:
@@ -83,6 +98,7 @@ class CMIPCloud:
             gdf = gdf.to_crs(epsg=4326)
             geom = [mapping(g) for g in gdf.geometry]
             import rioxarray
+
             ds = ds.rio.write_crs("EPSG:4326", inplace=False)
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
@@ -90,11 +106,9 @@
             raise ValueError("Must provide either point, box, or shapefile.")
         self.ds = ds_subset
         return ds_subset
+
     def _subset_time(self, start_date, end_date):
-        """
-        Subset the dataset by time range.
-        Dates should be strings in 'YYYY-MM-DD' format.
-        """
+        """Subset the dataset by time range."""
         if self.ds is None:
             return None
         ds_time = self.ds.sel(time=slice(start_date, end_date))
@@ -114,29 +128,38 @@
         print(f"Saved Zarr to {store_path}")
 
     def _format(self, df):
-        """
-        Format the dataframe for standardized output:
-        - Adds source_id, experiment_id, table_id, variable, value, units columns.
-        - Stacks variables into long format.
-        """
-        # Melt the dataframe to long format: variable, value
+        """Format dataframe for standardized output."""
         value_vars = [v for v in self.variables if v in df.columns]
         id_vars = [c for c in df.columns if c not in value_vars]
-        df_long = df.melt(id_vars=id_vars, value_vars=value_vars,
-                          var_name="variable", value_name="value")
 
-        # Add units column (from attrs)
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
         df_long["units"] = df_long["variable"].map(
-            lambda v: self.ds[v].attrs.get("units", "unknown") if v in self.ds.data_vars else "unknown"
+            lambda v: self.ds[v].attrs.get("units", "unknown")
+            if v in self.ds.data_vars
+            else "unknown"
         )
 
-        # Add metadata columns if missing
         df_long["source"] = self.source_id
         df_long["experiment"] = self.experiment_id
         df_long["table"] = self.table_id
 
-        # Reorder columns
-        cols = ["source", "experiment", "table", "time", "lat", "lon", "variable", "value", "units"]
+        cols = [
+            "source",
+            "experiment",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
         df_long = df_long[[c for c in cols if c in df_long.columns]]
 
         return df_long
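CMIPCloud is now constructed from the flat Hydra config rather than from individual arguments, caches the intake catalog on self.col, and applies the configured time range via _subset_time() before spatial extraction. A minimal usage sketch restricted to the methods visible above (cfg is assumed to be the composed config.yaml):

    cmip = CMIPCloud(cfg)   # reads experiment_id, source_id, table_id, variables, time_range
    subsets = cmip.fetch()  # query the Pangeo CMIP6 catalog once per variable
    ds = cmip.load()        # open the matching zarr stores and merge them

Point/box/shapefile subsetting and the long-format export via _format() then proceed as in the hunks above.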
climdata/datasets/DWD.py CHANGED
@@ -3,23 +3,16 @@ import pandas as pd
 import hydra
 from wetterdienst import Settings
 from wetterdienst.provider.dwd.observation import DwdObservationRequest
-from climdata.utils.utils_download import build_output_filename
 
 class DWDmirror:
     def __init__(self, cfg):
         self.cfg = cfg
         self.param_mapping = cfg.mappings
-        self.provider = cfg.dataset.lower()
-        self.parameter_key = cfg.weather.parameter
-        self.lat = cfg.location.lat
-        self.lon = cfg.location.lon
-        self.distance = cfg.location.buffer_km
         self.start_date = cfg.time_range.start_date
         self.end_date = cfg.time_range.end_date
-        self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
         self.df = None
-    def fetch(self):
-        param_info = self.param_mapping[self.provider]['variables'][self.parameter_key]
+    def load(self, variable, lat_loc, lon_loc, buffer_km = 50):
+        param_info = self.param_mapping.dwd.variables[variable]
         resolution = param_info["resolution"]
         dataset = param_info["dataset"]
         variable_name = param_info["name"]
@@ -31,8 +24,8 @@
             end_date=self.end_date,
             settings=settings
         ).filter_by_distance(
-            latlon=(self.lat, self.lon),
-            distance=self.distance,
+            latlon=(lat_loc, lon_loc),
+            distance=buffer_km,
             unit="km"
         )
 
@@ -40,7 +33,7 @@
         self.df = df
         return self.df
 
-    def format(self):
+    def format(self, variable, lat_loc, lon_loc):
         self.df['date'] = pd.to_datetime(self.df['date'])
         self.df = self.df.groupby(['date']).agg({
             'value': 'mean',
@@ -56,18 +49,17 @@
             "value": "value",
             "station_id": "frequent_station",
         })
-        self.df["variable"] = self.parameter_key
-        self.df["latitude"] = self.lat
-        self.df["longitude"] = self.lon
+        self.df["variable"] = variable
+        self.df["lat"] = lat_loc
+        self.df["lon"] = lon_loc
         self.df['source'] = 'DWD'
-        self.df['units'] = self.units
-        self.df = self.df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+        self.df['units'] = self.param_mapping.dwd.variables[variable].unit
+        self.df = self.df[["lat", "lon", "time", "source", "variable", "value", "units"]]
         # self.df = df
         return self.df
 
-    def save(self):
-        filename = build_output_filename(self.cfg)
-        self.df.to_csv(self.cfg.output.out_dir+filename, index=False)
+    def save_csv(self,filename):
+        self.df.to_csv(filename, index=False)
         print(f"✅ Saved time series to: {filename}")
         return filename
 
@@ -0,0 +1,133 @@
+import os
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from omegaconf import DictConfig
+from climdata.utils.utils_download import find_nearest_xy, fetch_dwd
+import geopandas as gpd
+
+class HYRASmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def fetch(self, variable: str):
+        """
+        Download HYRAS NetCDF files for a given variable and time range.
+        """
+        fetch_dwd(self.cfg,variable)
+        # Build file list for the variable and time range
+        param_mapping = self.cfg.mappings
+        provider = self.cfg.dataset.lower()
+        parameter_key = variable
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        prefix = param_info["prefix"]
+        version = param_info["version"]
+        start_year = datetime.fromisoformat(self.cfg.time_range.start_date).year
+        end_year = datetime.fromisoformat(self.cfg.time_range.end_date).year
+        files = []
+        for year in range(start_year, end_year + 1):
+            file_name = f"{prefix}_{year}_{version}_de.nc"
+            files.append(os.path.join(self.cfg.data_dir, provider, parameter_key.upper(), file_name))
+        self.files = files
+        return files
+
+    def load(self, variable: str):
+        """
+        Load HYRAS NetCDFs for a given variable into a single xarray Dataset.
+        """
+        files = self.fetch(variable)
+        datasets = []
+        for f in files:
+            if not os.path.exists(f):
+                print(f"File not found: {f}")
+                continue
+            try:
+                ds = xr.open_dataset(f)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+        dset = xr.concat(datasets, dim="time")
+        dset[variable] = dset[variable].transpose("time", "y", "x")
+        self.dataset = dset
+        return self.dataset
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract data from the loaded HYRAS dataset.
+
+        Parameters
+        ----------
+        point : tuple (lon, lat), optional
+            Extracts a time series at the nearest grid point.
+        box : dict with lat/lon bounds, optional
+            Example: {"lat_min": 47, "lat_max": 49, "lon_min": 10, "lon_max": 12}
+        shapefile : str, optional
+            Path to a shapefile to clip the dataset spatially.
+        buffer_km : float, optional
+            Buffer distance (in kilometers) applied to the shapefile before clipping.
+        """
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+        ds = self.dataset
+
+        # Point extraction
+        if point is not None:
+            lat, lon = point[1], point[0]
+            iy, ix = find_nearest_xy(ds, lat, lon)
+            print(f"📌 Nearest grid point at (y,x)=({iy},{ix})")
+            ts = ds.isel(x=ix, y=iy)
+            self.dataset = ts
+            return ts
+
+        # Box extraction
+        elif box is not None:
+            if not all(k in box for k in ["lat_min", "lat_max", "lon_min", "lon_max"]):
+                raise ValueError("Box must contain lat_min, lat_max, lon_min, lon_max.")
+            dset_box = ds.sel(
+                y=slice(box["lat_max"], box["lat_min"]),  # y usually decreasing (north -> south)
+                x=slice(box["lon_min"], box["lon_max"])
+            )
+            print(f"📦 Extracted box with shape: {dset_box.dims}")
+            self.dataset = dset_box
+            return dset_box
+
+        # Shapefile extraction
+        elif shapefile is not None:
+            gdf = gpd.read_file(shapefile)
+
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)  # project to meters
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)  # buffer in meters
+                gdf = gdf.to_crs(epsg=4326)  # back to lat/lon
+
+            # Ensure dataset has CRS info for clipping
+            if not ds.rio.crs:
+                ds = ds.rio.write_crs("EPSG:4326")
+
+            dset_clipped = ds.rio.clip(gdf.geometry, gdf.crs, drop=True)
+            print(f"🗺️ Extracted shapefile area with dims: {dset_clipped.dims}")
+            self.dataset = dset_clipped
+            return dset_clipped
+
+        else:
+            raise NotImplementedError("Must provide either point, box, or shapefile.")
+
+    def save_csv(self, filename, df=None):
+        """
+        Save the extracted time series to CSV.
+        """
+        if df is None:
+            if self.dataset is None:
+                raise ValueError("No dataset loaded or extracted.")
+            # If dataset is a DataArray, convert to DataFrame
+            if isinstance(self.dataset, xr.Dataset):
+                df = self.dataset.to_dataframe().reset_index()
+            else:
+                raise ValueError("Please provide a DataFrame or extract a point first.")
+        df.to_csv(filename, index=False)
+        print(f"Saved CSV to {filename}")