climdata 0.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



climdata/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Top-level package for climdata."""
+
+ __author__ = """Kaushik Muduchuru"""
+ __email__ = "kaushik.reddy.m@gmail.com"
+ __version__ = "0.0.2"
+
+ from .utils.utils_download import *
+ from .datasets.DWD import DWDmirror as DWD
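With these re-exports the top-level API is deliberately small; a sketch (here `cfg` stands for a composed Hydra config as defined in conf/config.yaml below):

    import climdata

    print(climdata.__version__)  # 0.0.2
    mirror = climdata.DWD(cfg)   # DWDmirror, re-exported at package level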
climdata/__main__.py ADDED
@@ -0,0 +1,5 @@
+ # climdata/__main__.py
+ from .main import run
+
+ if __name__ == "__main__":
+     run()
climdata/conf/config.yaml ADDED
@@ -0,0 +1,23 @@
+
+ defaults:
+   - _self_
+   - mappings/parameters
+
+ dataset: dwd
+ data_dir: /beegfs/muduchuru/data
+ weather:
+   parameter: tas  # standardized variable name (e.g., tas, pr, rsds)
+
+ location:
+   lat: 52.5070
+   lon: 14.1372
+   buffer_km: 25
+
+ time_range:
+   start_date: "1989-01-01"
+   end_date: "2020-12-31"
+
+ output:
+   out_dir: "./climdata/data/"
+   filename: "{provider}_{parameter}_LAT{lat}_LON{lon}_{start}_{end}.csv"
+   fmt: 'standard'  # 'standard', 'ICASA', 'simplace', 'monica'
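For reference, this file can be inspected without the Hydra CLI; a minimal sketch (direct load of the single file — running through Hydra would additionally merge conf/mappings/parameters.yaml per the defaults list):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("climdata/conf/config.yaml")
    print(cfg.weather.parameter)               # tas
    print(cfg.location.lat, cfg.location.lon)  # 52.507 14.1372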
climdata/conf/mappings/parameters.yaml ADDED
@@ -0,0 +1,172 @@
+ dwd:
+   type: in-situ
+   subsetting: None
+   variables:
+     tas:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_mean_2m
+       unit: degC
+     tasmax:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_max_2m
+       unit: degC
+     tasmin:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_min_2m
+       unit: degC
+     pr:
+       resolution: daily
+       dataset: climate_summary
+       name: precipitation_height
+       unit: mm
+     rsds:
+       resolution: daily
+       dataset: solar
+       name: radiation_sky_short_wave_diffuse
+       unit: J/cm2
+     sfcWind:
+       resolution: daily
+       dataset: climate_summary
+       name: wind_speed
+       unit: m/s
+ era5-land:
+   type: image
+   subsetting: point
+   params:
+     collection: ECMWF/ERA5_LAND/DAILY_AGGR
+     scale: 11132
+   variables:
+     tas:
+       name: temperature_2m
+       unit: K
+     tasmax:
+       name: temperature_2m_max
+       unit: K
+     tasmin:
+       name: temperature_2m_min
+       unit: K
+     pr:
+       name: total_precipitation_sum
+       unit: m
+     rsds:
+       name: surface_solar_radiation_downwards_sum
+       unit: J/m2
+ era5:
+   type: image
+   subsetting: point
+   params:
+     collection: ECMWF/ERA5/DAILY
+     scale: 27830
+   variables:
+     tas:
+       name: mean_2m_air_temperature
+       unit: K
+     tasmax:
+       name: maximum_2m_air_temperature
+       unit: K
+     tasmin:
+       name: minimum_2m_air_temperature
+       unit: K
+     pr:
+       name: total_precipitation
+       unit: m
+ gddp:
+   type: image
+   subsetting: point
+   params:
+     collection: NASA/GDDP-CMIP6
+     model: ACCESS-CM2
+     scenario: historical
+     scale: 27830
+   variables:
+     tas:
+       name: tas
+       unit: K
+     tasmax:
+       name: tasmax
+       unit: K
+     tasmin:
+       name: tasmin
+       unit: K
+     pr:
+       name: pr
+       unit: kg m-2 s-1
+     rsds:
+       name: rsds
+       unit: W/m2
+     sfcWind:
+       name: sfcWind
+       unit: m/s
+     hurs:
+       name: hurs
+       unit: "%"
+ mswx:
+   type: image
+   subsetting: None
+   params:
+     google_service_account: /beegfs/muduchuru/codes/python/download/conf/service.json
+   variables:
+     tasmin:
+       name: air_temperature
+       folder_id: 1_h0hgJThJLXhVOk905IJ7EFpkFV5KDSC
+     tasmax:
+       name: air_temperature
+       folder_id: 1BKvHhO1JsBKKzMPCbzl5UpvqpRXHTj9c
+     tas:
+       name: air_temperature
+       folder_id: 10l3ThTEkabK0yfk7DvpEcNgXkhbF9NNt
+     pr:
+       name: precipitation
+       folder_id: 1gWoZ2bK2u5osJ8Iw-dvguZ56Kmz2QWrL
+     rsds:
+       name: downward_shortwave_radiation
+       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
+ dwd_hyras:
+   variables:
+     tasmin:
+       name: tasmin
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_min/"
+       prefix: "tasmin_hyras_1"
+       version: "v6-0"
+     tasmax:
+       name: tasmax
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_max/"
+       prefix: "tasmax_hyras_1"
+       version: "v6-0"
+     tas:
+       name: tas
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_mean/"
+       prefix: "tas_hyras_1"
+       version: "v6-0"
+     pr:
+       name: pr
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/precipitation/"
+       prefix: "pr_hyras_1"
+       version: "v6-0"
+     rsds:
+       name: rsds
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/radiation_global/"
+       prefix: "rsds_hyras_5"
+       version: "v3-1"
+     hurs:
+       name: hurs
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/humidity/"
+       prefix: "hurs_hyras_1"
+       version: "v6-0"
+ era5_cds:
+   params:
+     dataset: derived-era5-single-levels-daily-statistics
+   variables:
+     tas:
+       name: 2m_temperature
+     tasmax:
+       name: maximum_2m_temperature_since_previous_post_processing
+     tasmin:
+       name: minimum_2m_temperature_since_previous_post_processing
+     pr:
+       name: total_precipitation
+     rsds:
+       name: surface_solar_radiation_downwards
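Downstream code resolves this mapping by provider key and standardized variable name; DWDmirror, for instance, reads cfg.mappings['dwd']['variables']['tas'] and hands the (resolution, dataset, name) triple to wetterdienst. A minimal sketch of that lookup, with plain dicts standing in for the composed config:

    mappings = {
        "dwd": {
            "variables": {
                "tas": {"resolution": "daily", "dataset": "climate_summary",
                        "name": "temperature_air_mean_2m", "unit": "degC"},
            },
        },
    }
    info = mappings["dwd"]["variables"]["tas"]
    request_args = (info["resolution"], info["dataset"], info["name"])  # what DWDmirror.fetch() passes as `parameters`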
climdata/datasets/DWD.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import pandas as pd
+ from wetterdienst import Settings
+ from wetterdienst.provider.dwd.observation import DwdObservationRequest
+ from climdata.utils.utils_download import build_output_filename
+
+ class DWDmirror:
+     """Fetch DWD station observations near a point and reshape them into the standard long format."""
+
+     def __init__(self, cfg):
+         self.cfg = cfg
+         self.param_mapping = cfg.mappings
+         self.provider = cfg.dataset.lower()
+         self.parameter_key = cfg.weather.parameter
+         self.lat = cfg.location.lat
+         self.lon = cfg.location.lon
+         self.distance = cfg.location.buffer_km
+         self.start_date = cfg.time_range.start_date
+         self.end_date = cfg.time_range.end_date
+         self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
+         self.df = None
+
+     def fetch(self):
+         """Request values from every station within buffer_km of the target point."""
+         param_info = self.param_mapping[self.provider]['variables'][self.parameter_key]
+         resolution = param_info["resolution"]
+         dataset = param_info["dataset"]
+         variable_name = param_info["name"]
+
+         settings = Settings(ts_shape="long", ts_humanize=True)
+         request = DwdObservationRequest(
+             parameters=(resolution, dataset, variable_name),
+             start_date=self.start_date,
+             end_date=self.end_date,
+             settings=settings
+         ).filter_by_distance(
+             latlon=(self.lat, self.lon),
+             distance=self.distance,
+             unit="km"
+         )
+
+         self.df = request.values.all().df.to_pandas()
+         return self.df
+
+     def format(self):
+         """Aggregate to one value per day (mean across stations) and attach metadata columns."""
+         self.df['date'] = pd.to_datetime(self.df['date'])
+         self.df = self.df.groupby(['date']).agg({
+             'value': 'mean',
+             'station_id': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'resolution': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'dataset': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'parameter': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'quality': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+         }).reset_index()
+
+         self.df = self.df.rename(columns={
+             "date": "time",
+             "station_id": "frequent_station",
+         })
+         self.df["variable"] = self.parameter_key
+         self.df["latitude"] = self.lat
+         self.df["longitude"] = self.lon
+         self.df['source'] = 'DWD'
+         self.df['units'] = self.units
+         self.df = self.df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+         return self.df
+
+     def save(self):
+         filename = build_output_filename(self.cfg)
+         out_path = os.path.join(self.cfg.output.out_dir, filename)
+         self.df.to_csv(out_path, index=False)
+         print(f"✅ Saved time series to: {out_path}")
+         return filename
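A usage sketch for DWDmirror, assuming `cfg` is a composed Hydra config with the keys shown in conf/config.yaml above:

    dwd = DWDmirror(cfg)
    dwd.fetch()        # long-format values from every station within buffer_km of (lat, lon)
    df = dwd.format()  # daily mean across stations; columns: latitude, longitude, time, source, variable, value, units
    dwd.save()         # writes <out_dir>/<provider>_<parameter>_LAT..._LON..._<start>_<end>.csv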
@@ -0,0 +1,195 @@
+ import io
+ import os
+ from datetime import datetime, timedelta
+
+ import hydra
+ import pandas as pd
+ import xarray as xr
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from googleapiclient.http import MediaIoBaseDownload
+ from omegaconf import DictConfig
+
+ class MSWXmirror:
+     """Mirror MSWX daily NetCDF files from a shared Google Drive folder to local storage."""
+
+     def __init__(self, cfg):
+         self.cfg = cfg
+         self.provider = cfg.dataset.lower()
+         self.parameter_key = cfg.weather.parameter
+         self.lat = cfg.location.lat
+         self.lon = cfg.location.lon
+         self.start_date = datetime.fromisoformat(cfg.time_range.start_date)
+         self.end_date = datetime.fromisoformat(cfg.time_range.end_date)
+         self.output_dir = cfg.data_dir
+
+         provider_cfg = cfg.mappings[self.provider]
+         self.param_info = provider_cfg['variables'][self.parameter_key]
+         self.folder_id = self.param_info['folder_id']
+         self.units = self.param_info.get("unit", "")
+         self.service = self._build_drive_service(provider_cfg.params.google_service_account)
+
+     def _build_drive_service(self, service_account_file):
+         """Authenticate with a read-only service account and return a Drive v3 client."""
+         scopes = ['https://www.googleapis.com/auth/drive.readonly']
+         creds = service_account.Credentials.from_service_account_file(
+             service_account_file, scopes=scopes
+         )
+         return build('drive', 'v3', credentials=creds)
+
+     def _list_drive_files(self, folder_id):
+         """List all files in a Google Drive folder, handling pagination."""
+         files = []
+         page_token = None
+         while True:
+             results = self.service.files().list(
+                 q=f"'{folder_id}' in parents and trashed = false",
+                 fields="nextPageToken, files(id, name)",
+                 pageToken=page_token
+             ).execute()
+             files.extend(results.get("files", []))
+             page_token = results.get("nextPageToken", None)
+             if not page_token:
+                 break
+         return files
+
+     def _download_drive_file(self, file_id, local_path):
+         """Download a single file from Drive to a local path."""
+         request = self.service.files().get_media(fileId=file_id)
+         os.makedirs(os.path.dirname(local_path), exist_ok=True)
+         with io.FileIO(local_path, 'wb') as fh:
+             downloader = MediaIoBaseDownload(fh, request)
+             done = False
+             while not done:
+                 status, done = downloader.next_chunk()
+                 print(f" → Download {int(status.progress() * 100)}% complete")
+
+     def fetch(self):
+         """Ensure every daily file in the requested range exists locally; download any that are missing."""
+         # === 1) Rebuild the expected YYYYDDD.nc basenames ===
+         expected_files = []
+         current = self.start_date
+         while current <= self.end_date:
+             doy = current.timetuple().tm_yday
+             expected_files.append(f"{current.year}{doy:03d}.nc")
+             current += timedelta(days=1)
+
+         local_files = []
+         missing_files = []
+         for basename in expected_files:
+             local_path = os.path.join(self.output_dir, self.provider, self.parameter_key, basename)
+             if os.path.exists(local_path):
+                 local_files.append(basename)
+             else:
+                 missing_files.append(basename)
+
+         if not missing_files:
+             print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
+             return local_files
+
+         print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+
+         # === 2) List the Drive folder and keep only the missing basenames ===
+         drive_files = self._list_drive_files(self.folder_id)
+         valid_filenames = set(missing_files)
+         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
+
+         if not files_to_download:
+             print("⚠️ None of the missing files found in Drive. Check folder & date range.")
+             return local_files
+
+         # === 3) Download the missing files ===
+         for file in files_to_download:
+             filename = file['name']
+             local_path = os.path.join(self.output_dir, self.provider, self.parameter_key, filename)
+             print(f"⬇️ Downloading {filename} ...")
+             self._download_drive_file(file['id'], local_path)
+             local_files.append(filename)
+
+         return local_files
+
+ def extract_ts_MSWX(cfg: DictConfig):
+     """Extract a nearest-gridpoint daily time series from the downloaded MSWX NetCDF files."""
+     param_mapping = cfg.mappings
+     provider = cfg.dataset.lower()
+     parameter = cfg.weather.parameter
+     param_info = param_mapping[provider]['variables'][parameter]
+
+     base_dir = cfg.data_dir
+     target_lat = cfg.location.lat
+     target_lon = cfg.location.lon
+     start_date = pd.to_datetime(cfg.time_range.start_date)
+     end_date = pd.to_datetime(cfg.time_range.end_date)
+
+     # === 1) Rebuild exact basenames ===
+     current = start_date
+     basenames = []
+     while current <= end_date:
+         doy = current.timetuple().tm_yday
+         basenames.append(f"{current.year}{doy:03d}.nc")
+         current += timedelta(days=1)
+
+     # === 2) Process only those files ===
+     ts_list = []
+     missing = []
+     for basename in basenames:
+         file_path = os.path.join(base_dir, provider, parameter, basename)
+         if not os.path.exists(file_path):
+             missing.append(basename)
+             continue
+
+         print(f"📂 Opening: {file_path}")
+         ds = xr.open_dataset(file_path)
+         time_name = [x for x in ds.coords if "time" in x.lower()][0]
+         data_var = list(ds.data_vars)[0]
+         ts = ds[data_var].sel(lat=target_lat, lon=target_lon, method='nearest')
+         df = ts.to_dataframe().reset_index()[[time_name, data_var]]
+         ts_list.append(df)
+
+     if missing:
+         print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
+         for m in missing:
+             print(f" - {m}")
+
+     if not ts_list:
+         raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
+
+     # === 3) Combine and slice (for safety) ===
+     ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
+     ts_all[time_name] = pd.to_datetime(ts_all[time_name])
+     ts_all = ts_all[
+         (ts_all[time_name] >= start_date) & (ts_all[time_name] <= end_date)
+     ].reset_index(drop=True)
+
+     out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
+     os.makedirs(out_dir, exist_ok=True)
+     out_path = os.path.join(out_dir, cfg.output.filename)
+
+     ts_all["variable"] = param_info['name']
+     ts_all["latitude"] = target_lat
+     ts_all["longitude"] = target_lon
+     ts_all['source'] = provider.upper()
+     ts_all['units'] = ts.attrs.get('units', '')
+     ts_all = ts_all.rename(columns={data_var: 'value', time_name: 'time'})
+     ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+
+     ts_all.to_csv(out_path, index=False)
+     print(f"✅ Saved MSWX time series to: {out_path}")
+     return ts_all
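Both fetch() and extract_ts_MSWX key everything on YYYYDDD.nc basenames (year plus zero-padded day of year). A quick standard-library check of that convention:

    from datetime import datetime

    d = datetime.fromisoformat("1989-01-01")
    print(f"{d.year}{d.timetuple().tm_yday:03d}.nc")  # 1989001.nc — looked up under <data_dir>/mswx/<parameter>/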
climdata/main.py ADDED
@@ -0,0 +1,56 @@
+ import hydra
+ from omegaconf import DictConfig
+
+ from .utils.utils_download import *
+ from climdata.datasets.DWD import DWDmirror as DWD
+
+ @hydra.main(config_path="conf", config_name="config", version_base="1.3")
+ def run(cfg: DictConfig):
+     provider = cfg.dataset.lower()
+
+     filename = build_output_filename(cfg)
+     cfg.output.filename = filename
+
+     print(f"📡 Fetching data for dataset: {provider.upper()}")
+     print(f"📁 Output will be saved as: {filename}")
+
+     # Dispatch to the provider-specific fetch/extract pipeline.
+     if provider == "mswx":
+         fetch_MSWX(cfg)
+         extract_ts_MSWX(cfg)
+     elif provider == "dwd_hyras":
+         fetch_dwd(cfg)
+         extract_ts_dwd(cfg)
+     elif provider == "dwd":
+         dwd = DWD(cfg)
+         dwd.fetch()
+         dwd.format()
+         dwd.save()
+     elif provider == "gddp":
+         fetch_ee_loc(cfg)
+     elif provider == "era5-land":
+         fetch_ee_loc_mod(cfg)
+     else:
+         raise NotImplementedError(f"Provider '{provider}' is not yet supported in this script.")
+
+ if __name__ == '__main__':
+     run()
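Because run() is wrapped in @hydra.main, the module is a Hydra CLI and any key from conf/config.yaml can be overridden at invocation; the override values here are illustrative:

    python -m climdata dataset=dwd weather.parameter=pr time_range.start_date=2000-01-01 time_range.end_date=2010-12-31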
@@ -0,0 +1,20 @@
+ beautifulsoup4==4.13.5
+ earthengine_api==1.5.19
+ geemap==0.35.3
+ geopandas==1.0.1
+ google_api_python_client==2.172.0
+ hydra-core==1.3.2
+ ipdb==0.13.13
+ numpy==2.3.2
+ omegaconf==2.3.0
+ pandas==2.3.2
+ Pint==0.24.4
+ Pint_Pandas==0.7.1
+ protobuf==6.32.0
+ PyYAML==6.0.2
+ Requests==2.32.5
+ scipy==1.16.1
+ tqdm==4.67.1
+ wetterdienst==0.111.0
+ xarray==2025.4.0
@@ -0,0 +1,30 @@
+ from hydra import initialize, compose
+ from omegaconf import OmegaConf
+ from typing import Optional, List
+
+ def load_config(
+     config_path: str = "../conf",
+     config_name: str = "config",
+     overrides: Optional[List[str]] = None,
+     verbose: bool = False
+ ):
+     """
+     Load a Hydra config file.
+
+     Args:
+         config_path (str): Path to the config directory, relative to this file.
+         config_name (str): Name of the config YAML file (without `.yaml`).
+         overrides (List[str], optional): List of Hydra override strings.
+         verbose (bool): Whether to print the loaded config.
+
+     Returns:
+         DictConfig: The loaded config object.
+     """
+     with initialize(config_path=config_path, version_base=None):
+         cfg = compose(config_name=config_name, overrides=overrides or [])
+     if verbose:
+         print(OmegaConf.to_yaml(cfg))
+     return cfg
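A usage sketch; the module that houses load_config is not named in this diff, so the import path below is an assumption, and the overrides are illustrative:

    from climdata.utils.utils import load_config  # assumed location

    cfg = load_config(overrides=["dataset=era5-land", "weather.parameter=pr"], verbose=True)
    print(cfg.mappings["era5-land"]["variables"]["pr"]["name"])  # total_precipitation_sum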