climdata 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of climdata might be problematic.

Files changed (65)
  1. {climdata-0.1.1 → climdata-0.1.3}/PKG-INFO +1 -1
  2. {climdata-0.1.1 → climdata-0.1.3}/climdata/__init__.py +2 -1
  3. climdata-0.1.3/climdata/conf/config.yaml +35 -0
  4. {climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/parameters.yaml +2 -2
  5. {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPCloud.py +55 -32
  6. {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/DWD.py +12 -20
  7. climdata-0.1.3/climdata/datasets/HYRAS.py +133 -0
  8. climdata-0.1.3/climdata/datasets/MSWX.py +228 -0
  9. climdata-0.1.3/climdata/utils/utils_download.py +241 -0
  10. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/PKG-INFO +1 -1
  11. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/SOURCES.txt +7 -4
  12. climdata-0.1.3/examples/climdata_cli.py +98 -0
  13. climdata-0.1.3/examples/climdata_loader.ipynb +1230 -0
  14. climdata-0.1.3/examples/get_CMIP_box.py +57 -0
  15. climdata-0.1.3/examples/get_CMIP_loc.py +61 -0
  16. climdata-0.1.3/examples/get_MSWX_box.py +60 -0
  17. climdata-0.1.3/examples/get_MSWX_loc.py +71 -0
  18. {climdata-0.1.1 → climdata-0.1.3}/pyproject.toml +2 -2
  19. climdata-0.1.1/climdata/__main__.py +0 -5
  20. climdata-0.1.1/climdata/conf/config.yaml +0 -38
  21. climdata-0.1.1/climdata/datasets/MSWX.py +0 -302
  22. climdata-0.1.1/climdata/main.py +0 -56
  23. climdata-0.1.1/climdata/utils/utils_download.py +0 -975
  24. climdata-0.1.1/examples/extract_dwd_loc.ipynb +0 -2429
  25. climdata-0.1.1/examples/zarr_tas_data/metadata.json +0 -1
  26. {climdata-0.1.1 → climdata-0.1.3}/.editorconfig +0 -0
  27. {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  28. {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  29. {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  30. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/docs-build.yml +0 -0
  31. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/docs.yml +0 -0
  32. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/installation.yml +0 -0
  33. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/macos.yml +0 -0
  34. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/pypi.yml +0 -0
  35. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/ubuntu.yml +0 -0
  36. {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/windows.yml +0 -0
  37. {climdata-0.1.1 → climdata-0.1.3}/.gitignore +0 -0
  38. {climdata-0.1.1 → climdata-0.1.3}/LICENSE +0 -0
  39. {climdata-0.1.1 → climdata-0.1.3}/MANIFEST.in +0 -0
  40. {climdata-0.1.1 → climdata-0.1.3}/README.md +0 -0
  41. {climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/variables.yaml +0 -0
  42. {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPlocal.py +0 -0
  43. {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/ERA5.py +0 -0
  44. {climdata-0.1.1 → climdata-0.1.3}/climdata/utils/__init__.py +0 -0
  45. {climdata-0.1.1 → climdata-0.1.3}/climdata/utils/config.py +0 -0
  46. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/dependency_links.txt +0 -0
  47. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/entry_points.txt +0 -0
  48. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/requires.txt +0 -0
  49. {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/top_level.txt +0 -0
  50. {climdata-0.1.1 → climdata-0.1.3}/docs/changelog.md +0 -0
  51. {climdata-0.1.1 → climdata-0.1.3}/docs/climdata.md +0 -0
  52. {climdata-0.1.1 → climdata-0.1.3}/docs/common.md +0 -0
  53. {climdata-0.1.1 → climdata-0.1.3}/docs/contributing.md +0 -0
  54. {climdata-0.1.1 → climdata-0.1.3}/docs/faq.md +0 -0
  55. {climdata-0.1.1 → climdata-0.1.3}/docs/index.md +0 -0
  56. {climdata-0.1.1 → climdata-0.1.3}/docs/installation.md +0 -0
  57. {climdata-0.1.1 → climdata-0.1.3}/docs/overrides/main.html +0 -0
  58. {climdata-0.1.1 → climdata-0.1.3}/docs/usage.md +0 -0
  59. {climdata-0.1.1 → climdata-0.1.3}/dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv +0 -0
  60. {climdata-0.1.1 → climdata-0.1.3}/mkdocs.yml +0 -0
  61. {climdata-0.1.1 → climdata-0.1.3}/requirements.txt +0 -0
  62. {climdata-0.1.1 → climdata-0.1.3}/requirements_dev.txt +0 -0
  63. {climdata-0.1.1 → climdata-0.1.3}/setup.cfg +0 -0
  64. {climdata-0.1.1 → climdata-0.1.3}/tests/__init__.py +0 -0
  65. {climdata-0.1.1 → climdata-0.1.3}/tests/test_climdata.py +0 -0
{climdata-0.1.1 → climdata-0.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.1.1
+Version: 0.1.3
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
{climdata-0.1.1 → climdata-0.1.3}/climdata/__init__.py

@@ -2,7 +2,7 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from .utils.utils_download import * # etc.
 from .utils.config import load_config
@@ -11,4 +11,5 @@ from .datasets.MSWX import MSWXmirror as MSWX
 from .datasets.ERA5 import ERA5Mirror as ERA5
 from .datasets.CMIPlocal import CMIPmirror as CMIPlocal
 from .datasets.CMIPCloud import CMIPCloud as CMIP
+from .datasets.HYRAS import HYRASmirror as HYRAS
 
climdata-0.1.3/climdata/conf/config.yaml

@@ -0,0 +1,35 @@
+defaults:
+  - _self_
+  - mappings/parameters
+  - mappings/variables
+
+dataset: MSWX
+lat: null
+lon: null
+
+variables: ["tasmin","tasmax","pr"]
+
+data_dir: ./data
+region: None
+
+experiment_id: historical
+source_id: MIROC6
+table_id: day
+
+bounds:
+  europe:
+    lat_min: 34.0
+    lat_max: 71.0
+    lon_min: -25.0
+    lon_max: 45.0
+
+time_range:
+  start_date: "1989-01-01"
+  end_date: "2020-12-31"
+
+output:
+  out_dir: "./climdata/data/"
+  filename_csv: "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
+  filename_zarr: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.zarr"
+  filename_nc: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.nc"
+  fmt: "standard"
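
The new conf/config.yaml is a Hydra config; the defaults list pulls in the mappings groups referenced by the dataset classes below. As a rough sketch of how it might be composed and inspected (an assumption based on the DictConfig-based constructors in this release, not code from the package), using Hydra's compose API:

# Sketch only: compose conf/config.yaml plus the mappings groups and
# override a few fields; the override values here are illustrative.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="climdata/conf"):
    cfg = compose(
        config_name="config",
        overrides=["dataset=HYRAS", "lat=52.507", "lon=14.1372"],
    )

print(OmegaConf.to_yaml(cfg))                      # resolved config, including mappings/*
print(cfg.time_range.start_date, list(cfg.variables))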
{climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/parameters.yaml

@@ -21,7 +21,7 @@ dwd:
       resolution: daily
       dataset: climate_summary
       name: precipitation_height
-      unit: mm
+      unit: mm d-1
     rsds:
       resolution: daily
       dataset: solar
@@ -124,7 +124,7 @@ mswx:
     rsds:
       name: downward_shortwave_radiation
       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
-dwd_hyras:
+hyras:
   variables:
     tasmin:
       name: tasmin
{climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPCloud.py

@@ -1,20 +1,32 @@
 import intake
 import xarray as xr
 import pandas as pd
+from omegaconf import DictConfig
+import intake
+import xarray as xr
+import pandas as pd
+from omegaconf import DictConfig
+
 
 class CMIPCloud:
-    def __init__(self, experiment_id, source_id, table_id, variables, region_bounds=None):
-        self.experiment_id = experiment_id
-        self.source_id = source_id
-        self.table_id = table_id
-        self.variables = variables
-        self.region_bounds = region_bounds
+    def __init__(self, cfg: DictConfig):
+        # Directly read from flat config
+        self.experiment_id = cfg.experiment_id
+        self.source_id = cfg.source_id
+        self.table_id = cfg.table_id
+        self.variables = cfg.variables
+        self.start_date = cfg.time_range.start_date
+        self.end_date = cfg.time_range.end_date
+
         self.col_subsets = []
         self.ds = None
+        self.col = None
 
     def fetch(self):
         """Collect intake catalog subsets for each variable."""
-        col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
+        col = intake.open_esm_datastore(
+            "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
+        )
         self.col_subsets = []
         for var in self.variables:
             query = dict(
@@ -27,13 +39,16 @@ class CMIPCloud:
             if len(col_subset.df) == 0:
                 continue
             self.col_subsets.append(col_subset)
+        self.col = col
         return self.col_subsets
 
     def load(self):
         """Load and merge datasets from collected col_subsets."""
         datasets = []
         for col_subset in self.col_subsets:
-            zstore_path = col_subset.df.zstore.values[0].replace('gs:/', "https://storage.googleapis.com")
+            zstore_path = col_subset.df.zstore.values[0].replace(
+                "gs:/", "https://storage.googleapis.com"
+            )
             ds_var = xr.open_zarr(zstore_path)
             datasets.append(ds_var)
         if datasets:
@@ -51,25 +66,25 @@ class CMIPCloud:
 
         if self.ds is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
-
+
+        self._subset_time(self.start_date, self.end_date)
+
         ds = self.ds
-
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
                 buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
-                    lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lon=slice(lon - buffer_deg, lon + buffer_deg),
+                    lat=slice(lat - buffer_deg, lat + buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box['lon_min'], box['lon_max']),
-                lat=slice(box['lat_min'], box['lat_max'])
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )
 
         elif shapefile is not None:
@@ -83,6 +98,7 @@ class CMIPCloud:
                 gdf = gdf.to_crs(epsg=4326)
             geom = [mapping(g) for g in gdf.geometry]
             import rioxarray
+
             ds = ds.rio.write_crs("EPSG:4326", inplace=False)
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
@@ -90,11 +106,9 @@ class CMIPCloud:
             raise ValueError("Must provide either point, box, or shapefile.")
         self.ds = ds_subset
         return ds_subset
+
     def _subset_time(self, start_date, end_date):
-        """
-        Subset the dataset by time range.
-        Dates should be strings in 'YYYY-MM-DD' format.
-        """
+        """Subset the dataset by time range."""
        if self.ds is None:
            return None
        ds_time = self.ds.sel(time=slice(start_date, end_date))
@@ -114,29 +128,38 @@ class CMIPCloud:
         print(f"Saved Zarr to {store_path}")
 
     def _format(self, df):
-        """
-        Format the dataframe for standardized output:
-        - Adds source_id, experiment_id, table_id, variable, value, units columns.
-        - Stacks variables into long format.
-        """
-        # Melt the dataframe to long format: variable, value
+        """Format dataframe for standardized output."""
         value_vars = [v for v in self.variables if v in df.columns]
         id_vars = [c for c in df.columns if c not in value_vars]
-        df_long = df.melt(id_vars=id_vars, value_vars=value_vars,
-                          var_name="variable", value_name="value")
 
-        # Add units column (from attrs)
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
         df_long["units"] = df_long["variable"].map(
-            lambda v: self.ds[v].attrs.get("units", "unknown") if v in self.ds.data_vars else "unknown"
+            lambda v: self.ds[v].attrs.get("units", "unknown")
+            if v in self.ds.data_vars
+            else "unknown"
         )
 
-        # Add metadata columns if missing
         df_long["source"] = self.source_id
         df_long["experiment"] = self.experiment_id
         df_long["table"] = self.table_id
 
-        # Reorder columns
-        cols = ["source", "experiment", "table", "time", "lat", "lon", "variable", "value", "units"]
+        cols = [
+            "source",
+            "experiment",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
         df_long = df_long[[c for c in cols if c in df_long.columns]]
 
         return df_long
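
Taken together, the CMIPCloud changes replace the positional constructor with a single Hydra/OmegaConf config and fold the time subsetting into extract(). A minimal usage sketch, inferring the call order (fetch → load → extract) from the hunks above; the cfg fields and the extract keyword arguments are assumptions based on conf/config.yaml and the method bodies shown here, not documented API:

# Sketch only: cfg is assumed to carry experiment_id, source_id, table_id,
# variables and time_range.* exactly as in conf/config.yaml above.
from omegaconf import OmegaConf
from climdata import CMIP  # CMIPCloud is re-exported in climdata/__init__.py

cfg = OmegaConf.load("climdata/conf/config.yaml")
cmip = CMIP(cfg)
cmip.fetch()                            # collect intake-esm catalog subsets per variable
cmip.load()                             # open the Pangeo CMIP6 Zarr stores and merge them
ds = cmip.extract(point=(13.4, 52.5))   # (lon, lat); time range applied via _subset_time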
{climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/DWD.py

@@ -3,23 +3,16 @@ import pandas as pd
 import hydra
 from wetterdienst import Settings
 from wetterdienst.provider.dwd.observation import DwdObservationRequest
-from climdata.utils.utils_download import build_output_filename
 
 class DWDmirror:
     def __init__(self, cfg):
         self.cfg = cfg
         self.param_mapping = cfg.mappings
-        self.provider = cfg.dataset.lower()
-        self.parameter_key = cfg.weather.parameter
-        self.lat = cfg.location.lat
-        self.lon = cfg.location.lon
-        self.distance = cfg.location.buffer_km
         self.start_date = cfg.time_range.start_date
         self.end_date = cfg.time_range.end_date
-        self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
         self.df = None
-    def fetch(self):
-        param_info = self.param_mapping[self.provider]['variables'][self.parameter_key]
+    def load(self, variable, lat_loc, lon_loc, buffer_km = 50):
+        param_info = self.param_mapping.dwd.variables[variable]
         resolution = param_info["resolution"]
         dataset = param_info["dataset"]
         variable_name = param_info["name"]
@@ -31,8 +24,8 @@ class DWDmirror:
             end_date=self.end_date,
             settings=settings
         ).filter_by_distance(
-            latlon=(self.lat, self.lon),
-            distance=self.distance,
+            latlon=(lat_loc, lon_loc),
+            distance=buffer_km,
             unit="km"
         )
 
@@ -40,7 +33,7 @@
         self.df = df
         return self.df
 
-    def format(self):
+    def format(self, variable, lat_loc, lon_loc):
         self.df['date'] = pd.to_datetime(self.df['date'])
         self.df = self.df.groupby(['date']).agg({
             'value': 'mean',
@@ -56,18 +49,17 @@
             "value": "value",
             "station_id": "frequent_station",
         })
-        self.df["variable"] = self.parameter_key
-        self.df["latitude"] = self.lat
-        self.df["longitude"] = self.lon
+        self.df["variable"] = variable
+        self.df["lat"] = lat_loc
+        self.df["lon"] = lon_loc
         self.df['source'] = 'DWD'
-        self.df['units'] = self.units
-        self.df = self.df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+        self.df['units'] = self.param_mapping.dwd.variables[variable].unit
+        self.df = self.df[["lat", "lon", "time", "source", "variable", "value", "units"]]
         # self.df = df
         return self.df
 
-    def save(self):
-        filename = build_output_filename(self.cfg)
-        self.df.to_csv(self.cfg.output.out_dir+filename, index=False)
+    def save_csv(self,filename):
+        self.df.to_csv(filename, index=False)
         print(f"✅ Saved time series to: {filename}")
         return filename
 
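DWDmirror drops its per-run fields (provider, parameter_key, lat/lon, buffer) in favour of per-call arguments. A sketch of the new call pattern, assuming cfg is the composed Hydra config with cfg.mappings.dwd.variables.* and cfg.time_range.* available; the output filename below is purely illustrative:

# Sketch only: variable and location are now passed per call instead of via cfg.
from climdata.datasets.DWD import DWDmirror

dwd = DWDmirror(cfg)                 # cfg: composed Hydra DictConfig (see config.yaml sketch above)
dwd.load("pr", lat_loc=52.507, lon_loc=14.1372, buffer_km=50)
df = dwd.format("pr", lat_loc=52.507, lon_loc=14.1372)
dwd.save_csv("dwd_pr_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv")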
climdata-0.1.3/climdata/datasets/HYRAS.py

@@ -0,0 +1,133 @@
+import os
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from omegaconf import DictConfig
+from climdata.utils.utils_download import find_nearest_xy, fetch_dwd
+import geopandas as gpd
+
+class HYRASmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def fetch(self, variable: str):
+        """
+        Download HYRAS NetCDF files for a given variable and time range.
+        """
+        fetch_dwd(self.cfg,variable)
+        # Build file list for the variable and time range
+        param_mapping = self.cfg.mappings
+        provider = self.cfg.dataset.lower()
+        parameter_key = variable
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        prefix = param_info["prefix"]
+        version = param_info["version"]
+        start_year = datetime.fromisoformat(self.cfg.time_range.start_date).year
+        end_year = datetime.fromisoformat(self.cfg.time_range.end_date).year
+        files = []
+        for year in range(start_year, end_year + 1):
+            file_name = f"{prefix}_{year}_{version}_de.nc"
+            files.append(os.path.join(self.cfg.data_dir, provider, parameter_key.upper(), file_name))
+        self.files = files
+        return files
+
+    def load(self, variable: str):
+        """
+        Load HYRAS NetCDFs for a given variable into a single xarray Dataset.
+        """
+        files = self.fetch(variable)
+        datasets = []
+        for f in files:
+            if not os.path.exists(f):
+                print(f"File not found: {f}")
+                continue
+            try:
+                ds = xr.open_dataset(f)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+        dset = xr.concat(datasets, dim="time")
+        dset[variable] = dset[variable].transpose("time", "y", "x")
+        self.dataset = dset
+        return self.dataset
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract data from the loaded HYRAS dataset.
+
+        Parameters
+        ----------
+        point : tuple (lon, lat), optional
+            Extracts a time series at the nearest grid point.
+        box : dict with lat/lon bounds, optional
+            Example: {"lat_min": 47, "lat_max": 49, "lon_min": 10, "lon_max": 12}
+        shapefile : str, optional
+            Path to a shapefile to clip the dataset spatially.
+        buffer_km : float, optional
+            Buffer distance (in kilometers) applied to the shapefile before clipping.
+        """
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+        ds = self.dataset
+
+        # Point extraction
+        if point is not None:
+            lat, lon = point[1], point[0]
+            iy, ix = find_nearest_xy(ds, lat, lon)
+            print(f"📌 Nearest grid point at (y,x)=({iy},{ix})")
+            ts = ds.isel(x=ix, y=iy)
+            self.dataset = ts
+            return ts
+
+        # Box extraction
+        elif box is not None:
+            if not all(k in box for k in ["lat_min", "lat_max", "lon_min", "lon_max"]):
+                raise ValueError("Box must contain lat_min, lat_max, lon_min, lon_max.")
+            dset_box = ds.sel(
+                y=slice(box["lat_max"], box["lat_min"]),  # y usually decreasing (north -> south)
+                x=slice(box["lon_min"], box["lon_max"])
+            )
+            print(f"📦 Extracted box with shape: {dset_box.dims}")
+            self.dataset = dset_box
+            return dset_box
+
+        # Shapefile extraction
+        elif shapefile is not None:
+            gdf = gpd.read_file(shapefile)
+
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)  # project to meters
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)  # buffer in meters
+                gdf = gdf.to_crs(epsg=4326)  # back to lat/lon
+
+            # Ensure dataset has CRS info for clipping
+            if not ds.rio.crs:
+                ds = ds.rio.write_crs("EPSG:4326")
+
+            dset_clipped = ds.rio.clip(gdf.geometry, gdf.crs, drop=True)
+            print(f"🗺️ Extracted shapefile area with dims: {dset_clipped.dims}")
+            self.dataset = dset_clipped
+            return dset_clipped
+
+        else:
+            raise NotImplementedError("Must provide either point, box, or shapefile.")
+
+    def save_csv(self, filename, df=None):
+        """
+        Save the extracted time series to CSV.
+        """
+        if df is None:
+            if self.dataset is None:
+                raise ValueError("No dataset loaded or extracted.")
+            # If dataset is a DataArray, convert to DataFrame
+            if isinstance(self.dataset, xr.Dataset):
+                df = self.dataset.to_dataframe().reset_index()
+            else:
+                raise ValueError("Please provide a DataFrame or extract a point first.")
+        df.to_csv(filename, index=False)
+        print(f"Saved CSV to {filename}")
climdata-0.1.3/climdata/datasets/MSWX.py

@@ -0,0 +1,228 @@
+import pandas as pd
+import geopandas as gpd
+import os
+from tqdm import tqdm
+import warnings
+from datetime import datetime, timedelta
+import xarray as xr
+from omegaconf import DictConfig
+
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+
+from climdata.utils.utils_download import list_drive_files, download_drive_file
+from shapely.geometry import mapping
+import cf_xarray
+
+warnings.filterwarnings("ignore", category=Warning)
+
+
+class MSWXmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+        """Ensure latitude is ascending and longitude is in the range [0, 360]."""
+        ds = ds.cf.sortby("latitude")
+        lon_name = ds.cf["longitude"].name
+        ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+        return ds.sortby(lon_name)
+
+    def fetch(self, folder_id: str, variable: str):
+        """
+        Fetch MSWX files from Google Drive for a given variable.
+        """
+        start = datetime.fromisoformat(self.cfg.time_range.start_date)
+        end = datetime.fromisoformat(self.cfg.time_range.end_date)
+
+        expected_files = []
+        current = start
+        while current <= end:
+            doy = current.timetuple().tm_yday
+            basename = f"{current.year}{doy:03d}.nc"
+            expected_files.append(basename)
+            current += timedelta(days=1)
+
+        output_dir = self.cfg.data_dir
+        local_files, missing_files = [], []
+
+        for basename in expected_files:
+            local_path = os.path.join(output_dir,self.cfg.dataset, variable, basename)
+            if os.path.exists(local_path):
+                local_files.append(basename)
+            else:
+                missing_files.append(basename)
+
+        if not missing_files:
+            print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
+            return local_files
+
+        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")
+
+        SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+        creds = service_account.Credentials.from_service_account_file(
+            self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
+        )
+        service = build('drive', 'v3', credentials=creds)
+
+        drive_files = list_drive_files(folder_id, service)
+        valid_filenames = set(missing_files)
+        files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
+
+        if not files_to_download:
+            print(f"⚠️ No {variable} files found in Drive for requested dates.")
+            return local_files
+
+        for file in files_to_download:
+            filename = file['name']
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            print(f"⬇️ Downloading {filename} ...")
+            download_drive_file(file['id'], local_path, service)
+            local_files.append(filename)
+
+        return local_files
+
+    def load(self, variable: str):
+        """
+        Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+        """
+        folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+        files = self.fetch(folder_id, variable)
+        datasets = []
+
+        for f in files:
+            local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
+            try:
+                ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                ds = ds.rename(variable)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+
+        dset = xr.concat(datasets, dim="time")
+        dset = dset.transpose("time", "lat", "lon")
+        dset = self._fix_coords(dset)
+
+        self.dataset = dset
+        return self.dataset
+
+    def to_zarr(self, zarr_filename: str):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        var_name = self.dataset.name
+        if var_name == 'pr':
+            self.dataset.attrs['units'] = 'mm/day'
+        elif var_name in ['tas', 'tasmax', 'tasmin']:
+            self.dataset.attrs['units'] = 'degC'
+
+        zarr_path = os.path.join("data/MSWX", zarr_filename)
+        os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
+
+        print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
+        self.dataset.to_zarr(zarr_path, mode="w")
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        ds = self.dataset.rio.write_crs("EPSG:4326", inplace=False)
+
+        if point is not None:
+            lon, lat = point
+            if buffer_km > 0:
+                buffer_deg = buffer_km / 111
+                ds_subset = ds.sel(
+                    lon=slice(lon-buffer_deg, lon+buffer_deg),
+                    lat=slice(lat-buffer_deg, lat+buffer_deg),
+                )
+            else:
+                ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
+
+        elif box is not None:
+            ds_subset = ds.sel(
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
+            )
+
+        elif shapefile is not None:
+            if isinstance(shapefile, str):
+                gdf = gpd.read_file(shapefile)
+            else:
+                gdf = shapefile
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)
+                gdf = gdf.to_crs(epsg=4326)
+            geom = [mapping(g) for g in gdf.geometry]
+            ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
+
+        else:
+            raise ValueError("Must provide either point, box, or shapefile.")
+
+        self.dataset = ds_subset.to_dataset()
+        return ds_subset
+
+    # def to_dataframe(self, ds=None):
+    #     if ds is None:
+    #         if self.dataset is None:
+    #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+    #         ds = self.dataset
+
+    #     if isinstance(ds, xr.Dataset):
+    #         if len(ds.data_vars) != 1:
+    #             raise ValueError("Dataset has multiple variables. Please select one.")
+    #         ds = ds[list(ds.data_vars)[0]]
+
+    #     df = ds.to_dataframe().reset_index()
+    #     df = df[["time", "lat", "lon", ds.name]]
+    #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+    #     return df
+
+    def _format(self, df):
+        """Format dataframe for standardized output."""
+        value_vars = [v for v in self.variables if v in df.columns]
+        id_vars = [c for c in df.columns if c not in value_vars]
+
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
+        df_long["units"] = df_long["variable"].map(
+            lambda v: self.dataset[v].attrs.get("units", "unknown")
+            if v in self.dataset.data_vars
+            else "unknown"
+        )
+
+        df_long["source"] = self.cfg.dataset
+
+        cols = [
+            "source",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
+        df_long = df_long[[c for c in cols if c in df_long.columns]]
+
+        return df_long
+
+    def save_csv(self, filename):
+        if self.dataset is not None:
+            df = self.dataset.to_dataframe().reset_index()
+            df = self._format(df)
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")