climdata 0.1.1__py2.py3-none-any.whl → 0.1.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of climdata might be problematic.

climdata/__init__.py CHANGED
@@ -2,7 +2,7 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from .utils.utils_download import * # etc.
 from .utils.config import load_config
@@ -11,4 +11,5 @@ from .datasets.MSWX import MSWXmirror as MSWX
 from .datasets.ERA5 import ERA5Mirror as ERA5
 from .datasets.CMIPlocal import CMIPmirror as CMIPlocal
 from .datasets.CMIPCloud import CMIPCloud as CMIP
+from .datasets.HYRAS import HYRASmirror as HYRAS
 
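With 0.1.3 the new HYRAS mirror is re-exported at package level next to the existing dataset classes, so (a minimal sketch, assuming the wheel is installed) it can be imported directly:

    import climdata
    from climdata import MSWX, ERA5, CMIPlocal, CMIP, HYRAS
    print(climdata.__version__)  # "0.1.3"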
climdata/conf/config.yaml CHANGED
@@ -1,31 +1,27 @@
-
 defaults:
   - _self_
   - mappings/parameters
   - mappings/variables
-dataset: dwd
+
+dataset: MSWX
+lat: null
+lon: null
+
+variables: ["tasmin","tasmax","pr"]
+
 data_dir: ./data
-weather:
-  parameter: tas # standardized variable name (e.g., tas, pr, rsds)
+region: None
 
-region: europe
+experiment_id: historical
+source_id: MIROC6
+table_id: day
 
 bounds:
-  global:
-    lat_min: -90.0
-    lat_max: 90.0
-    lon_min: -180.0
-    lon_max: 180.0
   europe:
-    lat_min: 34.0 # Southern Europe (e.g., southern Greece)
-    lat_max: 71.0 # Northern Europe (e.g., northern Norway)
-    lon_min: -25.0 # Western Europe (e.g., Azores)
-    lon_max: 45.0 # Eastern Europe (Ural Mountains, excludes most of Russia)
-
-  location:
-    lat: 52.5070
-    lon: 14.1372
-    buffer_km: 25
+    lat_min: 34.0
+    lat_max: 71.0
+    lon_min: -25.0
+    lon_max: 45.0
 
 time_range:
   start_date: "1989-01-01"
@@ -35,4 +31,5 @@ output:
   out_dir: "./climdata/data/"
   filename_csv: "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
   filename_zarr: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.zarr"
-  fmt: 'standard' # 'standard', 'ICASA', 'simplace', 'monica'
+  filename_nc: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.nc"
+  fmt: "standard"
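The configuration is flattened: dataset, lat, lon, variables and the CMIP selectors (experiment_id, source_id, table_id) now sit at the top level instead of under weather:/location:, and a NetCDF filename template is added. A minimal sketch of composing this config with Hydra's Python API; the config_path and the overrides are illustrative assumptions, only the key names come from the file above:

    from hydra import compose, initialize

    with initialize(config_path="climdata/conf", version_base=None):
        cfg = compose(
            config_name="config",
            overrides=["dataset=hyras", "lat=52.52", "lon=13.41"],  # hypothetical values
        )
    print(cfg.dataset, list(cfg.variables), cfg.time_range.start_date)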
@@ -21,7 +21,7 @@ dwd:
       resolution: daily
       dataset: climate_summary
       name: precipitation_height
-      unit: mm
+      unit: mm d-1
     rsds:
       resolution: daily
       dataset: solar
@@ -124,7 +124,7 @@ mswx:
     rsds:
       name: downward_shortwave_radiation
       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
-dwd_hyras:
+hyras:
   variables:
     tasmin:
       name: tasmin
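The provider key dwd_hyras is renamed to hyras (and the DWD precipitation unit becomes mm d-1), which is the key the new HYRAS mirror looks up when resolving variable metadata (see HYRASmirror.fetch further down). A hedged illustration of that lookup; the prefix and version fields are assumptions inferred from how fetch() uses them and are not shown in this hunk:

    provider = cfg.dataset.lower()                 # "hyras" after the rename
    param_info = cfg.mappings[provider]["variables"]["tasmin"]
    prefix, version = param_info["prefix"], param_info["version"]  # assumed fields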
@@ -1,20 +1,32 @@
 import intake
 import xarray as xr
 import pandas as pd
+from omegaconf import DictConfig
+import intake
+import xarray as xr
+import pandas as pd
+from omegaconf import DictConfig
+
 
 class CMIPCloud:
-    def __init__(self, experiment_id, source_id, table_id, variables, region_bounds=None):
-        self.experiment_id = experiment_id
-        self.source_id = source_id
-        self.table_id = table_id
-        self.variables = variables
-        self.region_bounds = region_bounds
+    def __init__(self, cfg: DictConfig):
+        # Directly read from flat config
+        self.experiment_id = cfg.experiment_id
+        self.source_id = cfg.source_id
+        self.table_id = cfg.table_id
+        self.variables = cfg.variables
+        self.start_date = cfg.time_range.start_date
+        self.end_date = cfg.time_range.end_date
+
         self.col_subsets = []
         self.ds = None
+        self.col = None
 
     def fetch(self):
         """Collect intake catalog subsets for each variable."""
-        col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
+        col = intake.open_esm_datastore(
+            "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
+        )
         self.col_subsets = []
         for var in self.variables:
             query = dict(
@@ -27,13 +39,16 @@ class CMIPCloud:
             if len(col_subset.df) == 0:
                 continue
             self.col_subsets.append(col_subset)
+        self.col = col
         return self.col_subsets
 
     def load(self):
         """Load and merge datasets from collected col_subsets."""
         datasets = []
         for col_subset in self.col_subsets:
-            zstore_path = col_subset.df.zstore.values[0].replace('gs:/', "https://storage.googleapis.com")
+            zstore_path = col_subset.df.zstore.values[0].replace(
+                "gs:/", "https://storage.googleapis.com"
+            )
             ds_var = xr.open_zarr(zstore_path)
             datasets.append(ds_var)
         if datasets:
@@ -51,25 +66,25 @@
 
         if self.ds is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
-
+
+        self._subset_time(self.start_date, self.end_date)
+
         ds = self.ds
-
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
                 buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
-                    lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lon=slice(lon - buffer_deg, lon + buffer_deg),
+                    lat=slice(lat - buffer_deg, lat + buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box['lon_min'], box['lon_max']),
-                lat=slice(box['lat_min'], box['lat_max'])
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )
 
         elif shapefile is not None:
@@ -83,6 +98,7 @@ class CMIPCloud:
             gdf = gdf.to_crs(epsg=4326)
             geom = [mapping(g) for g in gdf.geometry]
             import rioxarray
+
             ds = ds.rio.write_crs("EPSG:4326", inplace=False)
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
@@ -90,11 +106,9 @@
             raise ValueError("Must provide either point, box, or shapefile.")
         self.ds = ds_subset
         return ds_subset
+
     def _subset_time(self, start_date, end_date):
-        """
-        Subset the dataset by time range.
-        Dates should be strings in 'YYYY-MM-DD' format.
-        """
+        """Subset the dataset by time range."""
         if self.ds is None:
             return None
         ds_time = self.ds.sel(time=slice(start_date, end_date))
@@ -114,29 +128,38 @@
         print(f"Saved Zarr to {store_path}")
 
     def _format(self, df):
-        """
-        Format the dataframe for standardized output:
-        - Adds source_id, experiment_id, table_id, variable, value, units columns.
-        - Stacks variables into long format.
-        """
-        # Melt the dataframe to long format: variable, value
+        """Format dataframe for standardized output."""
         value_vars = [v for v in self.variables if v in df.columns]
         id_vars = [c for c in df.columns if c not in value_vars]
-        df_long = df.melt(id_vars=id_vars, value_vars=value_vars,
-                          var_name="variable", value_name="value")
 
-        # Add units column (from attrs)
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
         df_long["units"] = df_long["variable"].map(
-            lambda v: self.ds[v].attrs.get("units", "unknown") if v in self.ds.data_vars else "unknown"
+            lambda v: self.ds[v].attrs.get("units", "unknown")
+            if v in self.ds.data_vars
+            else "unknown"
         )
 
-        # Add metadata columns if missing
         df_long["source"] = self.source_id
         df_long["experiment"] = self.experiment_id
         df_long["table"] = self.table_id
 
-        # Reorder columns
-        cols = ["source", "experiment", "table", "time", "lat", "lon", "variable", "value", "units"]
+        cols = [
+            "source",
+            "experiment",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
         df_long = df_long[[c for c in cols if c in df_long.columns]]
 
         return df_long
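CMIPCloud is now constructed from the flat Hydra config rather than from individual arguments, caches the intake catalog on self.col, and applies the configured time range via _subset_time() before spatial extraction. A minimal usage sketch restricted to the methods visible above (cfg is assumed to be the composed config.yaml):

    cmip = CMIPCloud(cfg)   # reads experiment_id, source_id, table_id, variables, time_range
    subsets = cmip.fetch()  # query the Pangeo CMIP6 catalog once per variable
    ds = cmip.load()        # open the matching zarr stores and merge them

Point/box/shapefile subsetting and the long-format export via _format() then proceed as in the hunks above.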
climdata/datasets/DWD.py CHANGED
@@ -3,23 +3,16 @@ import pandas as pd
 import hydra
 from wetterdienst import Settings
 from wetterdienst.provider.dwd.observation import DwdObservationRequest
-from climdata.utils.utils_download import build_output_filename
 
 class DWDmirror:
     def __init__(self, cfg):
         self.cfg = cfg
         self.param_mapping = cfg.mappings
-        self.provider = cfg.dataset.lower()
-        self.parameter_key = cfg.weather.parameter
-        self.lat = cfg.location.lat
-        self.lon = cfg.location.lon
-        self.distance = cfg.location.buffer_km
         self.start_date = cfg.time_range.start_date
         self.end_date = cfg.time_range.end_date
-        self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
         self.df = None
-    def fetch(self):
-        param_info = self.param_mapping[self.provider]['variables'][self.parameter_key]
+    def load(self, variable, lat_loc, lon_loc, buffer_km = 50):
+        param_info = self.param_mapping.dwd.variables[variable]
         resolution = param_info["resolution"]
         dataset = param_info["dataset"]
         variable_name = param_info["name"]
@@ -31,8 +24,8 @@
             end_date=self.end_date,
             settings=settings
         ).filter_by_distance(
-            latlon=(self.lat, self.lon),
-            distance=self.distance,
+            latlon=(lat_loc, lon_loc),
+            distance=buffer_km,
             unit="km"
         )
 
@@ -40,7 +33,7 @@
         self.df = df
         return self.df
 
-    def format(self):
+    def format(self, variable, lat_loc, lon_loc):
         self.df['date'] = pd.to_datetime(self.df['date'])
         self.df = self.df.groupby(['date']).agg({
             'value': 'mean',
@@ -56,18 +49,17 @@
             "value": "value",
             "station_id": "frequent_station",
         })
-        self.df["variable"] = self.parameter_key
-        self.df["latitude"] = self.lat
-        self.df["longitude"] = self.lon
+        self.df["variable"] = variable
+        self.df["lat"] = lat_loc
+        self.df["lon"] = lon_loc
         self.df['source'] = 'DWD'
-        self.df['units'] = self.units
-        self.df = self.df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+        self.df['units'] = self.param_mapping.dwd.variables[variable].unit
+        self.df = self.df[["lat", "lon", "time", "source", "variable", "value", "units"]]
         # self.df = df
         return self.df
 
-    def save(self):
-        filename = build_output_filename(self.cfg)
-        self.df.to_csv(self.cfg.output.out_dir+filename, index=False)
+    def save_csv(self,filename):
+        self.df.to_csv(filename, index=False)
         print(f"✅ Saved time series to: {filename}")
         return filename
 
@@ -0,0 +1,133 @@
+import os
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from omegaconf import DictConfig
+from climdata.utils.utils_download import find_nearest_xy, fetch_dwd
+import geopandas as gpd
+
+class HYRASmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def fetch(self, variable: str):
+        """
+        Download HYRAS NetCDF files for a given variable and time range.
+        """
+        fetch_dwd(self.cfg,variable)
+        # Build file list for the variable and time range
+        param_mapping = self.cfg.mappings
+        provider = self.cfg.dataset.lower()
+        parameter_key = variable
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        prefix = param_info["prefix"]
+        version = param_info["version"]
+        start_year = datetime.fromisoformat(self.cfg.time_range.start_date).year
+        end_year = datetime.fromisoformat(self.cfg.time_range.end_date).year
+        files = []
+        for year in range(start_year, end_year + 1):
+            file_name = f"{prefix}_{year}_{version}_de.nc"
+            files.append(os.path.join(self.cfg.data_dir, provider, parameter_key.upper(), file_name))
+        self.files = files
+        return files
+
+    def load(self, variable: str):
+        """
+        Load HYRAS NetCDFs for a given variable into a single xarray Dataset.
+        """
+        files = self.fetch(variable)
+        datasets = []
+        for f in files:
+            if not os.path.exists(f):
+                print(f"File not found: {f}")
+                continue
+            try:
+                ds = xr.open_dataset(f)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+        dset = xr.concat(datasets, dim="time")
+        dset[variable] = dset[variable].transpose("time", "y", "x")
+        self.dataset = dset
+        return self.dataset
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract data from the loaded HYRAS dataset.
+
+        Parameters
+        ----------
+        point : tuple (lon, lat), optional
+            Extracts a time series at the nearest grid point.
+        box : dict with lat/lon bounds, optional
+            Example: {"lat_min": 47, "lat_max": 49, "lon_min": 10, "lon_max": 12}
+        shapefile : str, optional
+            Path to a shapefile to clip the dataset spatially.
+        buffer_km : float, optional
+            Buffer distance (in kilometers) applied to the shapefile before clipping.
+        """
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+        ds = self.dataset
+
+        # Point extraction
+        if point is not None:
+            lat, lon = point[1], point[0]
+            iy, ix = find_nearest_xy(ds, lat, lon)
+            print(f"📌 Nearest grid point at (y,x)=({iy},{ix})")
+            ts = ds.isel(x=ix, y=iy)
+            self.dataset = ts
+            return ts
+
+        # Box extraction
+        elif box is not None:
+            if not all(k in box for k in ["lat_min", "lat_max", "lon_min", "lon_max"]):
+                raise ValueError("Box must contain lat_min, lat_max, lon_min, lon_max.")
+            dset_box = ds.sel(
+                y=slice(box["lat_max"], box["lat_min"]),  # y usually decreasing (north -> south)
+                x=slice(box["lon_min"], box["lon_max"])
+            )
+            print(f"📦 Extracted box with shape: {dset_box.dims}")
+            self.dataset = dset_box
+            return dset_box
+
+        # Shapefile extraction
+        elif shapefile is not None:
+            gdf = gpd.read_file(shapefile)
+
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)  # project to meters
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)  # buffer in meters
+                gdf = gdf.to_crs(epsg=4326)  # back to lat/lon
+
+            # Ensure dataset has CRS info for clipping
+            if not ds.rio.crs:
+                ds = ds.rio.write_crs("EPSG:4326")
+
+            dset_clipped = ds.rio.clip(gdf.geometry, gdf.crs, drop=True)
+            print(f"🗺️ Extracted shapefile area with dims: {dset_clipped.dims}")
+            self.dataset = dset_clipped
+            return dset_clipped
+
+        else:
+            raise NotImplementedError("Must provide either point, box, or shapefile.")
+
+    def save_csv(self, filename, df=None):
+        """
+        Save the extracted time series to CSV.
+        """
+        if df is None:
+            if self.dataset is None:
+                raise ValueError("No dataset loaded or extracted.")
+            # If dataset is a DataArray, convert to DataFrame
+            if isinstance(self.dataset, xr.Dataset):
+                df = self.dataset.to_dataframe().reset_index()
+            else:
+                raise ValueError("Please provide a DataFrame or extract a point first.")
+        df.to_csv(filename, index=False)
+        print(f"Saved CSV to {filename}")