climdata 0.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



climdata/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Top-level package for climdata."""
+
+ __author__ = """Kaushik Muduchuru"""
+ __email__ = "kaushik.reddy.m@gmail.com"
+ __version__ = "0.0.2"
+
+ from .utils.utils_download import *
+ from .datasets.DWD import DWDmirror as DWD
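With these re-exports the top-level API is deliberately small; a sketch (here `cfg` stands for a composed Hydra config as defined in conf/config.yaml below):

    import climdata

    print(climdata.__version__)  # 0.0.2
    mirror = climdata.DWD(cfg)   # DWDmirror, re-exported at package level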
climdata/__main__.py ADDED
@@ -0,0 +1,5 @@
+ # climdata/__main__.py
+ from .main import run
+
+ if __name__ == "__main__":
+     run()
climdata/conf/config.yaml ADDED
@@ -0,0 +1,23 @@
+
+ defaults:
+   - _self_
+   - mappings/parameters
+
+ dataset: dwd
+ data_dir: /beegfs/muduchuru/data
+ weather:
+   parameter: tas  # standardized variable name (e.g., tas, pr, rsds)
+
+ location:
+   lat: 52.5070
+   lon: 14.1372
+   buffer_km: 25
+
+ time_range:
+   start_date: "1989-01-01"
+   end_date: "2020-12-31"
+
+ output:
+   out_dir: "./climdata/data/"
+   filename: "{provider}_{parameter}_LAT{lat}_LON{lon}_{start}_{end}.csv"
+   fmt: 'standard'  # 'standard', 'ICASA', 'simplace', 'monica'
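For reference, this file can be inspected without the Hydra CLI; a minimal sketch (direct load of the single file — running through Hydra would additionally merge conf/mappings/parameters.yaml per the defaults list):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("climdata/conf/config.yaml")
    print(cfg.weather.parameter)               # tas
    print(cfg.location.lat, cfg.location.lon)  # 52.507 14.1372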
climdata/conf/mappings/parameters.yaml ADDED
@@ -0,0 +1,172 @@
+ dwd:
+   type: in-situ
+   subsetting: None
+   variables:
+     tas:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_mean_2m
+       unit: degC
+     tasmax:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_max_2m
+       unit: degC
+     tasmin:
+       resolution: daily
+       dataset: climate_summary
+       name: temperature_air_min_2m
+       unit: degC
+     pr:
+       resolution: daily
+       dataset: climate_summary
+       name: precipitation_height
+       unit: mm
+     rsds:
+       resolution: daily
+       dataset: solar
+       name: radiation_sky_short_wave_diffuse
+       unit: J/cm2
+     sfcWind:
+       resolution: daily
+       dataset: climate_summary
+       name: wind_speed
+       unit: m/s
+ era5-land:
+   type: image
+   subsetting: point
+   params:
+     collection: ECMWF/ERA5_LAND/DAILY_AGGR
+     scale: 11132
+   variables:
+     tas:
+       name: temperature_2m
+       unit: K
+     tasmax:
+       name: temperature_2m_max
+       unit: K
+     tasmin:
+       name: temperature_2m_min
+       unit: K
+     pr:
+       name: total_precipitation_sum
+       unit: m
+     rsds:
+       name: surface_solar_radiation_downwards_sum
+       unit: J/m2
+ era5:
+   type: image
+   subsetting: point
+   params:
+     collection: ECMWF/ERA5/DAILY
+     scale: 27830
+   variables:
+     tas:
+       name: mean_2m_air_temperature
+       unit: K
+     tasmax:
+       name: maximum_2m_air_temperature
+       unit: K
+     tasmin:
+       name: minimum_2m_air_temperature
+       unit: K
+     pr:
+       name: total_precipitation
+       unit: m
+ gddp:
+   type: image
+   subsetting: point
+   params:
+     collection: NASA/GDDP-CMIP6
+     model: ACCESS-CM2
+     scenario: historical
+     scale: 27830
+   variables:
+     tas:
+       name: tas
+       unit: K
+     tasmax:
+       name: tasmax
+       unit: K
+     tasmin:
+       name: tasmin
+       unit: K
+     pr:
+       name: pr
+       unit: kg m-2 s-1
+     rsds:
+       name: rsds
+       unit: W/m2
+     sfcWind:
+       name: sfcWind
+       unit: m/s
+     hurs:
+       name: hurs
+       unit: "%"
+ mswx:
+   type: image
+   subsetting: None
+   params:
+     google_service_account: /beegfs/muduchuru/codes/python/download/conf/service.json
+   variables:
+     tasmin:
+       name: air_temperature
+       folder_id: 1_h0hgJThJLXhVOk905IJ7EFpkFV5KDSC
+     tasmax:
+       name: air_temperature
+       folder_id: 1BKvHhO1JsBKKzMPCbzl5UpvqpRXHTj9c
+     tas:
+       name: air_temperature
+       folder_id: 10l3ThTEkabK0yfk7DvpEcNgXkhbF9NNt
+     pr:
+       name: precipitation
+       folder_id: 1gWoZ2bK2u5osJ8Iw-dvguZ56Kmz2QWrL
+     rsds:
+       name: downward_shortwave_radiation
+       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
+ dwd_hyras:
+   variables:
+     tasmin:
+       name: tasmin
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_min/"
+       prefix: "tasmin_hyras_1"
+       version: "v6-0"
+     tasmax:
+       name: tasmax
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_max/"
+       prefix: "tasmax_hyras_1"
+       version: "v6-0"
+     tas:
+       name: tas
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/air_temperature_mean/"
+       prefix: "tas_hyras_1"
+       version: "v6-0"
+     pr:
+       name: pr
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/precipitation/"
+       prefix: "pr_hyras_1"
+       version: "v6-0"
+     rsds:
+       name: rsds
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/radiation_global/"
+       prefix: "rsds_hyras_5"
+       version: "v3-1"
+     hurs:
+       name: hurs
+       base_url: "https://opendata.dwd.de/climate_environment/CDC/grids_germany/daily/hyras_de/humidity/"
+       prefix: "hurs_hyras_1"
+       version: "v6-0"
+ era5_cds:
+   params:
+     dataset: derived-era5-single-levels-daily-statistics
+   variables:
+     tas:
+       name: 2m_temperature
+     tasmax:
+       name: maximum_2m_temperature_since_previous_post_processing
+     tasmin:
+       name: minimum_2m_temperature_since_previous_post_processing
+     pr:
+       name: total_precipitation
+     rsds:
+       name: surface_solar_radiation_downwards
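Downstream code resolves this mapping by provider key and standardized variable name; DWDmirror, for instance, reads cfg.mappings['dwd']['variables']['tas'] and hands the (resolution, dataset, name) triple to wetterdienst. A minimal sketch of that lookup, with plain dicts standing in for the composed config:

    mappings = {
        "dwd": {
            "variables": {
                "tas": {"resolution": "daily", "dataset": "climate_summary",
                        "name": "temperature_air_mean_2m", "unit": "degC"},
            },
        },
    }
    info = mappings["dwd"]["variables"]["tas"]
    request_args = (info["resolution"], info["dataset"], info["name"])  # what DWDmirror.fetch() passes as `parameters`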
climdata/datasets/DWD.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import pandas as pd
+ from wetterdienst import Settings
+ from wetterdienst.provider.dwd.observation import DwdObservationRequest
+ from climdata.utils.utils_download import build_output_filename
+
+ class DWDmirror:
+     """Fetch DWD station observations near a point and reshape them into the standard long format."""
+
+     def __init__(self, cfg):
+         self.cfg = cfg
+         self.param_mapping = cfg.mappings
+         self.provider = cfg.dataset.lower()
+         self.parameter_key = cfg.weather.parameter
+         self.lat = cfg.location.lat
+         self.lon = cfg.location.lon
+         self.distance = cfg.location.buffer_km
+         self.start_date = cfg.time_range.start_date
+         self.end_date = cfg.time_range.end_date
+         self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
+         self.df = None
+
+     def fetch(self):
+         """Request values from every station within buffer_km of the target point."""
+         param_info = self.param_mapping[self.provider]['variables'][self.parameter_key]
+         resolution = param_info["resolution"]
+         dataset = param_info["dataset"]
+         variable_name = param_info["name"]
+
+         settings = Settings(ts_shape="long", ts_humanize=True)
+         request = DwdObservationRequest(
+             parameters=(resolution, dataset, variable_name),
+             start_date=self.start_date,
+             end_date=self.end_date,
+             settings=settings
+         ).filter_by_distance(
+             latlon=(self.lat, self.lon),
+             distance=self.distance,
+             unit="km"
+         )
+
+         self.df = request.values.all().df.to_pandas()
+         return self.df
+
+     def format(self):
+         """Aggregate to one value per day (mean across stations) and attach metadata columns."""
+         self.df['date'] = pd.to_datetime(self.df['date'])
+         self.df = self.df.groupby(['date']).agg({
+             'value': 'mean',
+             'station_id': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'resolution': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'dataset': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'parameter': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+             'quality': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
+         }).reset_index()
+
+         self.df = self.df.rename(columns={
+             "date": "time",
+             "station_id": "frequent_station",
+         })
+         self.df["variable"] = self.parameter_key
+         self.df["latitude"] = self.lat
+         self.df["longitude"] = self.lon
+         self.df['source'] = 'DWD'
+         self.df['units'] = self.units
+         self.df = self.df[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+         return self.df
+
+     def save(self):
+         filename = build_output_filename(self.cfg)
+         out_path = os.path.join(self.cfg.output.out_dir, filename)
+         self.df.to_csv(out_path, index=False)
+         print(f"✅ Saved time series to: {out_path}")
+         return filename
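A usage sketch for DWDmirror, assuming `cfg` is a composed Hydra config with the keys shown in conf/config.yaml above:

    dwd = DWDmirror(cfg)
    dwd.fetch()        # long-format values from every station within buffer_km of (lat, lon)
    df = dwd.format()  # daily mean across stations; columns: latitude, longitude, time, source, variable, value, units
    dwd.save()         # writes <out_dir>/<provider>_<parameter>_LAT..._LON..._<start>_<end>.csv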
@@ -0,0 +1,195 @@
+ import io
+ import os
+ from datetime import datetime, timedelta
+
+ import hydra
+ import pandas as pd
+ import xarray as xr
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from googleapiclient.http import MediaIoBaseDownload
+ from omegaconf import DictConfig
+
+ class MSWXmirror:
+     """Mirror MSWX daily NetCDF files from a shared Google Drive folder to local storage."""
+
+     def __init__(self, cfg):
+         self.cfg = cfg
+         self.provider = cfg.dataset.lower()
+         self.parameter_key = cfg.weather.parameter
+         self.lat = cfg.location.lat
+         self.lon = cfg.location.lon
+         self.start_date = datetime.fromisoformat(cfg.time_range.start_date)
+         self.end_date = datetime.fromisoformat(cfg.time_range.end_date)
+         self.output_dir = cfg.data_dir
+
+         provider_cfg = cfg.mappings[self.provider]
+         self.param_info = provider_cfg['variables'][self.parameter_key]
+         self.folder_id = self.param_info['folder_id']
+         self.units = self.param_info.get("unit", "")
+         self.service = self._build_drive_service(provider_cfg.params.google_service_account)
+
+     def _build_drive_service(self, service_account_file):
+         """Authenticate with a read-only service account and return a Drive v3 client."""
+         scopes = ['https://www.googleapis.com/auth/drive.readonly']
+         creds = service_account.Credentials.from_service_account_file(
+             service_account_file, scopes=scopes
+         )
+         return build('drive', 'v3', credentials=creds)
+
+     def _list_drive_files(self, folder_id):
+         """List all files in a Google Drive folder, handling pagination."""
+         files = []
+         page_token = None
+         while True:
+             results = self.service.files().list(
+                 q=f"'{folder_id}' in parents and trashed = false",
+                 fields="nextPageToken, files(id, name)",
+                 pageToken=page_token
+             ).execute()
+             files.extend(results.get("files", []))
+             page_token = results.get("nextPageToken", None)
+             if not page_token:
+                 break
+         return files
+
+     def _download_drive_file(self, file_id, local_path):
+         """Download a single file from Drive to a local path."""
+         request = self.service.files().get_media(fileId=file_id)
+         os.makedirs(os.path.dirname(local_path), exist_ok=True)
+         with io.FileIO(local_path, 'wb') as fh:
+             downloader = MediaIoBaseDownload(fh, request)
+             done = False
+             while not done:
+                 status, done = downloader.next_chunk()
+                 print(f" → Download {int(status.progress() * 100)}% complete")
+
+     def fetch(self):
+         """Ensure every daily file in the requested range exists locally; download any that are missing."""
+         # === 1) Rebuild the expected YYYYDDD.nc basenames ===
+         expected_files = []
+         current = self.start_date
+         while current <= self.end_date:
+             doy = current.timetuple().tm_yday
+             expected_files.append(f"{current.year}{doy:03d}.nc")
+             current += timedelta(days=1)
+
+         local_files = []
+         missing_files = []
+         for basename in expected_files:
+             local_path = os.path.join(self.output_dir, self.provider, self.parameter_key, basename)
+             if os.path.exists(local_path):
+                 local_files.append(basename)
+             else:
+                 missing_files.append(basename)
+
+         if not missing_files:
+             print(f"✅ All {len(expected_files)} files already exist locally. No download needed.")
+             return local_files
+
+         print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching from Drive...")
+
+         # === 2) List the Drive folder and keep only the missing basenames ===
+         drive_files = self._list_drive_files(self.folder_id)
+         valid_filenames = set(missing_files)
+         files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
+
+         if not files_to_download:
+             print("⚠️ None of the missing files found in Drive. Check folder & date range.")
+             return local_files
+
+         # === 3) Download the missing files ===
+         for file in files_to_download:
+             filename = file['name']
+             local_path = os.path.join(self.output_dir, self.provider, self.parameter_key, filename)
+             print(f"⬇️ Downloading {filename} ...")
+             self._download_drive_file(file['id'], local_path)
+             local_files.append(filename)
+
+         return local_files
+
+ def extract_ts_MSWX(cfg: DictConfig):
+     """Extract a nearest-gridpoint daily time series from the downloaded MSWX NetCDF files."""
+     param_mapping = cfg.mappings
+     provider = cfg.dataset.lower()
+     parameter = cfg.weather.parameter
+     param_info = param_mapping[provider]['variables'][parameter]
+
+     base_dir = cfg.data_dir
+     target_lat = cfg.location.lat
+     target_lon = cfg.location.lon
+     start_date = pd.to_datetime(cfg.time_range.start_date)
+     end_date = pd.to_datetime(cfg.time_range.end_date)
+
+     # === 1) Rebuild exact basenames ===
+     current = start_date
+     basenames = []
+     while current <= end_date:
+         doy = current.timetuple().tm_yday
+         basenames.append(f"{current.year}{doy:03d}.nc")
+         current += timedelta(days=1)
+
+     # === 2) Process only those files ===
+     ts_list = []
+     missing = []
+     for basename in basenames:
+         file_path = os.path.join(base_dir, provider, parameter, basename)
+         if not os.path.exists(file_path):
+             missing.append(basename)
+             continue
+
+         print(f"📂 Opening: {file_path}")
+         ds = xr.open_dataset(file_path)
+         time_name = [x for x in ds.coords if "time" in x.lower()][0]
+         data_var = list(ds.data_vars)[0]
+         ts = ds[data_var].sel(lat=target_lat, lon=target_lon, method='nearest')
+         df = ts.to_dataframe().reset_index()[[time_name, data_var]]
+         ts_list.append(df)
+
+     if missing:
+         print(f"⚠️ Warning: {len(missing)} files were missing and skipped:")
+         for m in missing:
+             print(f" - {m}")
+
+     if not ts_list:
+         raise RuntimeError("❌ No valid files were found. Cannot extract time series.")
+
+     # === 3) Combine and slice (for safety) ===
+     ts_all = pd.concat(ts_list).sort_values(by=time_name).reset_index(drop=True)
+     ts_all[time_name] = pd.to_datetime(ts_all[time_name])
+     ts_all = ts_all[
+         (ts_all[time_name] >= start_date) & (ts_all[time_name] <= end_date)
+     ].reset_index(drop=True)
+
+     out_dir = hydra.utils.to_absolute_path(cfg.output.out_dir)
+     os.makedirs(out_dir, exist_ok=True)
+     out_path = os.path.join(out_dir, cfg.output.filename)
+
+     ts_all["variable"] = param_info['name']
+     ts_all["latitude"] = target_lat
+     ts_all["longitude"] = target_lon
+     ts_all['source'] = provider.upper()
+     ts_all['units'] = ts.attrs.get('units', '')
+     ts_all = ts_all.rename(columns={data_var: 'value', time_name: 'time'})
+     ts_all = ts_all[["latitude", "longitude", "time", "source", "variable", "value", "units"]]
+
+     ts_all.to_csv(out_path, index=False)
+     print(f"✅ Saved MSWX time series to: {out_path}")
+     return ts_all
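Both fetch() and extract_ts_MSWX key everything on YYYYDDD.nc basenames (year plus zero-padded day of year). A quick standard-library check of that convention:

    from datetime import datetime

    d = datetime.fromisoformat("1989-01-01")
    print(f"{d.year}{d.timetuple().tm_yday:03d}.nc")  # 1989001.nc — looked up under <data_dir>/mswx/<parameter>/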
climdata/main.py ADDED
@@ -0,0 +1,56 @@
+ import hydra
+ from omegaconf import DictConfig
+
+ from .utils.utils_download import *
+ from climdata.datasets.DWD import DWDmirror as DWD
+
+ @hydra.main(config_path="conf", config_name="config", version_base="1.3")
+ def run(cfg: DictConfig):
+     provider = cfg.dataset.lower()
+
+     filename = build_output_filename(cfg)
+     cfg.output.filename = filename
+
+     print(f"📡 Fetching data for dataset: {provider.upper()}")
+     print(f"📁 Output will be saved as: {filename}")
+
+     # Dispatch to the provider-specific fetch/extract pipeline.
+     if provider == "mswx":
+         fetch_MSWX(cfg)
+         extract_ts_MSWX(cfg)
+     elif provider == "dwd_hyras":
+         fetch_dwd(cfg)
+         extract_ts_dwd(cfg)
+     elif provider == "dwd":
+         dwd = DWD(cfg)
+         dwd.fetch()
+         dwd.format()
+         dwd.save()
+     elif provider == "gddp":
+         fetch_ee_loc(cfg)
+     elif provider == "era5-land":
+         fetch_ee_loc_mod(cfg)
+     else:
+         raise NotImplementedError(f"Provider '{provider}' is not yet supported in this script.")
+
+ if __name__ == '__main__':
+     run()
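Because run() is wrapped in @hydra.main, the module is a Hydra CLI and any key from conf/config.yaml can be overridden at invocation; the override values here are illustrative:

    python -m climdata dataset=dwd weather.parameter=pr time_range.start_date=2000-01-01 time_range.end_date=2010-12-31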
@@ -0,0 +1,20 @@
+ beautifulsoup4==4.13.5
+ earthengine_api==1.5.19
+ geemap==0.35.3
+ geopandas==1.0.1
+ google_api_python_client==2.172.0
+ hydra-core==1.3.2
+ ipdb==0.13.13
+ numpy==2.3.2
+ omegaconf==2.3.0
+ pandas==2.3.2
+ Pint==0.24.4
+ Pint_Pandas==0.7.1
+ protobuf==6.32.0
+ PyYAML==6.0.2
+ Requests==2.32.5
+ scipy==1.16.1
+ tqdm==4.67.1
+ wetterdienst==0.111.0
+ xarray==2025.4.0
@@ -0,0 +1,30 @@
+ from hydra import initialize, compose
+ from omegaconf import OmegaConf
+ from typing import Optional, List
+
+ def load_config(
+     config_path: str = "../conf",
+     config_name: str = "config",
+     overrides: Optional[List[str]] = None,
+     verbose: bool = False
+ ):
+     """
+     Load a Hydra config file.
+
+     Args:
+         config_path (str): Path to the config directory, relative to this file.
+         config_name (str): Name of the config YAML file (without `.yaml`).
+         overrides (List[str], optional): List of Hydra override strings.
+         verbose (bool): Whether to print the loaded config.
+
+     Returns:
+         DictConfig: The loaded config object.
+     """
+     with initialize(config_path=config_path, version_base=None):
+         cfg = compose(config_name=config_name, overrides=overrides or [])
+     if verbose:
+         print(OmegaConf.to_yaml(cfg))
+     return cfg
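A usage sketch; the module that houses load_config is not named in this diff, so the import path below is an assumption, and the overrides are illustrative:

    from climdata.utils.utils import load_config  # assumed location

    cfg = load_config(overrides=["dataset=era5-land", "weather.parameter=pr"], verbose=True)
    print(cfg.mappings["era5-land"]["variables"]["pr"]["name"])  # total_precipitation_sum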