climdata 0.0.2.tar.gz → 0.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (62):
  1. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/docs.yml +1 -1
  2. climdata-0.0.5/.github/workflows/pypi.yml +43 -0
  3. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/ubuntu.yml +0 -1
  4. {climdata-0.0.2 → climdata-0.0.5}/PKG-INFO +7 -1
  5. {climdata-0.0.2 → climdata-0.0.5}/climdata/__init__.py +4 -1
  6. climdata-0.0.5/climdata/conf/config.yaml +37 -0
  7. {climdata-0.0.2 → climdata-0.0.5}/climdata/conf/mappings/parameters.yaml +1 -1
  8. climdata-0.0.5/climdata/conf/mappings/variables.yaml +77 -0
  9. climdata-0.0.5/climdata/datasets/CMIP.py +224 -0
  10. climdata-0.0.5/climdata/datasets/MSWX.py +300 -0
  11. climdata-0.0.5/climdata/utils/config.py +30 -0
  12. {climdata-0.0.2 → climdata-0.0.5}/climdata/utils/utils_download.py +1 -2
  13. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/PKG-INFO +7 -1
  14. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/SOURCES.txt +3 -2
  15. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/requires.txt +6 -0
  16. climdata-0.0.5/docs/climdata.md +4 -0
  17. climdata-0.0.5/docs/common.md +41 -0
  18. climdata-0.0.5/docs/index.md +193 -0
  19. climdata-0.0.5/examples/extract_dwd_loc.ipynb +1097 -0
  20. {climdata-0.0.2 → climdata-0.0.5}/mkdocs.yml +17 -21
  21. {climdata-0.0.2 → climdata-0.0.5}/pyproject.toml +2 -2
  22. {climdata-0.0.2 → climdata-0.0.5}/requirements.txt +6 -0
  23. climdata-0.0.5/tests/test_climdata.py +9 -0
  24. climdata-0.0.2/.github/workflows/pypi.yml +0 -30
  25. climdata-0.0.2/climdata/conf/config.yaml +0 -23
  26. climdata-0.0.2/climdata/datasets/MSWX.py +0 -195
  27. climdata-0.0.2/climdata/requirements.txt +0 -20
  28. climdata-0.0.2/climdata/utils/config.py +0 -30
  29. climdata-0.0.2/docs/climdata.md +0 -4
  30. climdata-0.0.2/docs/common.md +0 -3
  31. climdata-0.0.2/docs/examples/run_downloader.ipynb +0 -1244
  32. climdata-0.0.2/docs/index.md +0 -16
  33. climdata-0.0.2/tests/test_climdata.py +0 -21
  34. {climdata-0.0.2 → climdata-0.0.5}/.editorconfig +0 -0
  35. {climdata-0.0.2 → climdata-0.0.5}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  36. {climdata-0.0.2 → climdata-0.0.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  37. {climdata-0.0.2 → climdata-0.0.5}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  38. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/docs-build.yml +0 -0
  39. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/installation.yml +0 -0
  40. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/macos.yml +0 -0
  41. {climdata-0.0.2 → climdata-0.0.5}/.github/workflows/windows.yml +0 -0
  42. {climdata-0.0.2 → climdata-0.0.5}/.gitignore +0 -0
  43. {climdata-0.0.2 → climdata-0.0.5}/LICENSE +0 -0
  44. {climdata-0.0.2 → climdata-0.0.5}/MANIFEST.in +0 -0
  45. {climdata-0.0.2 → climdata-0.0.5}/README.md +0 -0
  46. {climdata-0.0.2 → climdata-0.0.5}/climdata/__main__.py +0 -0
  47. {climdata-0.0.2 → climdata-0.0.5}/climdata/datasets/DWD.py +0 -0
  48. {climdata-0.0.2 → climdata-0.0.5}/climdata/main.py +0 -0
  49. {climdata-0.0.2 → climdata-0.0.5}/climdata/utils/__init__.py +0 -0
  50. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/dependency_links.txt +0 -0
  51. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/entry_points.txt +0 -0
  52. {climdata-0.0.2 → climdata-0.0.5}/climdata.egg-info/top_level.txt +0 -0
  53. {climdata-0.0.2 → climdata-0.0.5}/docs/changelog.md +0 -0
  54. {climdata-0.0.2 → climdata-0.0.5}/docs/contributing.md +0 -0
  55. {climdata-0.0.2 → climdata-0.0.5}/docs/faq.md +0 -0
  56. {climdata-0.0.2 → climdata-0.0.5}/docs/installation.md +0 -0
  57. {climdata-0.0.2 → climdata-0.0.5}/docs/overrides/main.html +0 -0
  58. {climdata-0.0.2 → climdata-0.0.5}/docs/usage.md +0 -0
  59. {climdata-0.0.2 → climdata-0.0.5}/dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv +0 -0
  60. {climdata-0.0.2 → climdata-0.0.5}/requirements_dev.txt +0 -0
  61. {climdata-0.0.2 → climdata-0.0.5}/setup.cfg +0 -0
  62. {climdata-0.0.2 → climdata-0.0.5}/tests/__init__.py +0 -0
{climdata-0.0.2 → climdata-0.0.5}/.github/workflows/docs.yml
@@ -23,7 +23,7 @@ jobs:
           pip install .
       - name: Discover typos with codespell
         run: |
-          codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,./.git" --ignore-words-list="aci,hist"
+          codespell --skip="*.csv,*.geojson,*.json,*.js,*.html,*cff,./.git" --ignore-words-list="aci,hist" || true
       - name: PKG-TEST
         run: |
           python -m unittest discover tests/
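Note: appending `|| true` forces the codespell step to exit 0, so typos are still printed in the job log but no longer fail the docs workflow; the spell check becomes advisory.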
climdata-0.0.5/.github/workflows/pypi.yml
@@ -0,0 +1,43 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - "v*"   # Trigger when pushing tags like v0.1.0
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+
+    steps:
+      # 1️⃣ Checkout code
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      # 2️⃣ Set up Python
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      # 3️⃣ Upgrade pip and install build/test dependencies
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install build twine wheel setuptools
+
+      # 4️⃣ Run unit tests
+      - name: Run tests
+        run: |
+          python -m unittest discover tests/
+
+      # 5️⃣ Build the package
+      - name: Build package
+        run: python -m build
+
+      # 6️⃣ Publish to PyPI
+      - name: Publish to PyPI
+        run: |
+          python -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} --verbose
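Releases are now tag-driven: pushing a tag matching v* (e.g. git tag v0.0.5 && git push origin v0.0.5) runs the unit tests, builds the sdist and wheel with python -m build, and uploads them via twine authenticated with the PYPI_API_TOKEN repository secret.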
{climdata-0.0.2 → climdata-0.0.5}/.github/workflows/ubuntu.yml
@@ -17,7 +17,6 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, py: "3.9" }
           - { os: ubuntu-latest, py: "3.10" }
           - { os: ubuntu-latest, py: "3.11" }
           - { os: ubuntu-latest, py: "3.12" }
{climdata-0.0.2 → climdata-0.0.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.0.2
+Version: 0.0.5
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
@@ -35,6 +35,8 @@ Requires-Dist: pymannkendall
 Requires-Dist: tqdm
 Requires-Dist: zarr
 Requires-Dist: ipyleaflet
+Requires-Dist: wetterdienst
+Requires-Dist: pint-pandas
 Requires-Dist: hydra-core
 Requires-Dist: intake
 Requires-Dist: intake-esm
@@ -54,6 +56,10 @@ Requires-Dist: scikit-learn
 Requires-Dist: xgboost
 Requires-Dist: optuna
 Requires-Dist: gitpython
+Requires-Dist: beautifulsoup4
+Requires-Dist: google-auth
+Requires-Dist: google-api-python-client
+Requires-Dist: ipdb
 Provides-Extra: all
 Requires-Dist: pandas; extra == "all"
 Provides-Extra: extra
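The added requirements line up with the new functionality in this release: wetterdienst provides DWD open-data access, pint-pandas adds unit-aware pandas columns, and beautifulsoup4 plus the google-auth/google-api-python-client pair support the Google Drive download path used by the MSWX mirror. Note that ipdb, an interactive debugger, also becomes an unconditional runtime dependency here.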
{climdata-0.0.2 → climdata-0.0.5}/climdata/__init__.py
@@ -2,7 +2,10 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.0.2"
+__version__ = "0.0.5"
 
 from .utils.utils_download import *  # etc.
+from .utils.config import load_config
 from .datasets.DWD import DWDmirror as DWD
+from .datasets.MSWX import MSWXmirror as MSWX
+
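The package root now re-exports the MSWX mirror and the config loader alongside DWD. A quick interactive check (a sketch; assumes climdata 0.0.5 is installed):

    import climdata

    print(climdata.__version__)  # 0.0.5
    climdata.DWD, climdata.MSWX  # dataset mirror classes re-exported at the top level
    climdata.load_config         # config helper from climdata.utils.config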
climdata-0.0.5/climdata/conf/config.yaml
@@ -0,0 +1,37 @@
+
+defaults:
+  - _self_
+  - mappings/parameters
+  - mappings/variables
+dataset: dwd
+data_dir: /beegfs/muduchuru/data
+weather:
+  parameter: tas   # standardized variable name (e.g., tas, pr, rsds)
+
+region: europe
+
+bounds:
+  global:
+    lat_min: -90.0
+    lat_max: 90.0
+    lon_min: -180.0
+    lon_max: 180.0
+  europe:
+    lat_min: 34.0    # Southern Europe (e.g., southern Greece)
+    lat_max: 71.0    # Northern Europe (e.g., northern Norway)
+    lon_min: -25.0   # Western Europe (e.g., Azores)
+    lon_max: 45.0    # Eastern Europe (Ural Mountains, excludes most of Russia)
+
+location:
+  lat: 52.5070
+  lon: 14.1372
+  buffer_km: 25
+
+time_range:
+  start_date: "1989-01-01"
+  end_date: "2020-12-31"
+
+output:
+  out_dir: "./climdata/data/"
+  filename: "{provider}_{parameter}_LAT{lat}_LON{lon}_{start}_{end}.csv"
+  fmt: 'standard'   # 'standard', 'ICASA', 'simplace', 'monica'
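The defaults list is Hydra-style composition pulling in the two mapping files below. A minimal sketch of reading this config directly with OmegaConf (hydra-core/omegaconf are already dependencies; the path is the file's location inside the sdist):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("climdata/conf/config.yaml")
    print(cfg.weather.parameter)   # tas
    print(cfg.bounds[cfg.region])  # the europe bounding box
    # Render the output filename template from the configured defaults:
    print(cfg.output.filename.format(
        provider=cfg.dataset, parameter=cfg.weather.parameter,
        lat=cfg.location.lat, lon=cfg.location.lon,
        start=cfg.time_range.start_date, end=cfg.time_range.end_date,
    ))
    # dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv (the sample CSV shipped in this sdist)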
{climdata-0.0.2 → climdata-0.0.5}/climdata/conf/mappings/parameters.yaml
@@ -107,7 +107,7 @@ mswx:
   type: image
   subsetting: None
   params:
-    google_service_account: /beegfs/muduchuru/codes/python/download/conf/service.json
+    google_service_account: None
   variables:
     tasmin:
       name: air_temperature
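Setting google_service_account to None strips a machine-specific absolute path from the shipped defaults; since MSWX data is distributed via Google Drive, users now have to point this key at their own Google service-account JSON before the MSWX mirror can authenticate.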
climdata-0.0.5/climdata/conf/mappings/variables.yaml
@@ -0,0 +1,77 @@
+info:
+  tas:
+    cf_name: air_temperature
+    long_name: Near-surface air temperature
+    units: degC
+  tasmax:
+    cf_name: air_temperature
+    long_name: Daily maximum near-surface air temperature
+    units: degC
+  tasmin:
+    cf_name: air_temperature
+    long_name: Daily minimum near-surface air temperature
+    units: degC
+  pr:
+    cf_name: precipitation_flux
+    long_name: Precipitation
+    units: mm/day
+  pracc:
+    cf_name: precipitation_amount
+    long_name: Accumulated precipitation
+    units: mm
+  ps:
+    cf_name: surface_air_pressure
+    long_name: Surface air pressure
+    units: Pa
+  hurs:
+    cf_name: relative_humidity
+    long_name: Near-surface relative humidity
+    units: '%'
+  huss:
+    cf_name: specific_humidity
+    long_name: Near-surface specific humidity
+    units: 1   # kg/kg
+  uas:
+    cf_name: eastward_wind
+    long_name: Eastward near-surface wind
+    units: m s-1
+  vas:
+    cf_name: northward_wind
+    long_name: Northward near-surface wind
+    units: m s-1
+  sfcWind:
+    cf_name: wind_speed
+    long_name: Near-surface wind speed
+    units: m s-1
+  rsds:
+    cf_name: surface_downwelling_shortwave_flux_in_air
+    long_name: Surface downwelling shortwave radiation
+    units: W m-2
+  rlds:
+    cf_name: surface_downwelling_longwave_flux_in_air
+    long_name: Surface downwelling longwave radiation
+    units: W m-2
+  rlus:
+    cf_name: surface_upwelling_longwave_flux_in_air
+    long_name: Surface upwelling longwave radiation
+    units: W m-2
+  rlut:
+    cf_name: toa_outgoing_longwave_flux
+    long_name: Top-of-atmosphere outgoing longwave radiation
+    units: W m-2
+  psml:
+    cf_name: mean_sea_level_pressure
+    long_name: Mean sea level pressure
+    units: Pa
+  evspsbl:
+    cf_name: water_evapotranspiration_flux
+    long_name: Evaporation including sublimation and transpiration
+    units: mm/day
+  snd:
+    cf_name: surface_snow_thickness
+    long_name: Snow depth
+    units: m
+  snw:
+    cf_name: surface_snow_amount
+    long_name: Snow water equivalent
+    units: mm
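The keys follow CMIP-style short names (psml here presumably means mean sea-level pressure, conventionally abbreviated psl), giving every backend a common vocabulary for renaming and unit checks. A sketch of a metadata lookup:

    from omegaconf import OmegaConf

    vmap = OmegaConf.load("climdata/conf/mappings/variables.yaml")
    meta = vmap.info["pr"]
    print(meta.cf_name, meta.long_name, meta.units)
    # precipitation_flux Precipitation mm/day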
climdata-0.0.5/climdata/datasets/CMIP.py
@@ -0,0 +1,224 @@
+import os
+import glob
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from typing import Optional, Dict, Union
+from omegaconf import DictConfig
+import warnings
+from pathlib import Path
+from tqdm.notebook import tqdm
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
+from xclim.core import units
+warnings.filterwarnings("ignore", category=Warning)
+
+
+class CMIP:
+    def __init__(self, var_cfg: DictConfig, experiments):
+        self.var_cfg = var_cfg
+        self.files = []
+        self.dataset = None
+        self.experiments = experiments
+
+    def _subset_by_bounds(self, ds, bounds, lat_name='lat', lon_name='lon'):
+        return ds.sel(
+            **{
+                lat_name: slice(bounds['lat_min'], bounds['lat_max']),
+                lon_name: slice(bounds['lon_min'], bounds['lon_max'])
+            }
+        )
+
+    def _check_lat_lon(self, ds: xr.Dataset) -> xr.Dataset:
+        # Fix latitude ascending order
+        if "lat" in ds.coords:
+            lat = ds["lat"]
+            if lat.values[0] > lat.values[-1]:  # descending
+                ds = ds.sortby("lat")
+
+        # Fix longitude range to -180..180
+        if "lon" in ds.coords:
+            lon = ds["lon"]
+            lon_vals = lon.values
+            if lon_vals.max() > 180:
+                lon_fixed = ((lon_vals + 180) % 360) - 180
+                ds = ds.assign_coords(lon=lon_fixed)
+                ds = ds.sortby("lon")
+        return ds
+
+    def fetch(self, base_dir, tbl_id):
+        nc_files = [
+            f
+            for exp in self.experiments
+            for f in glob.glob(
+                os.path.join(base_dir, "*/*/*", exp, f"*/{tbl_id}/*/*/*/*.nc"),
+                recursive=True
+            )
+        ]
+        rows = []
+        for file_path in tqdm(nc_files, desc="Indexing CMIP6 files"):
+            parts = file_path.split(os.sep)
+            try:
+                # NOTE: fixed indices assume a specific absolute depth of base_dir
+                activity_id = parts[6]
+                institution_id = parts[7]
+                source_id = parts[8]
+                experiment_id = parts[9]
+                member_id = parts[10]
+                table_id = parts[11]
+                variable_id = parts[12]
+                grid_label = parts[13]
+                version = parts[14]
+            except IndexError:
+                continue
+
+            # Extract start and end date from filename
+            fname = os.path.basename(file_path)
+            # Example: pr_day_MIROC6_ssp245-nat_r8i1p1f1_gn_20210101-20301231.nc
+            date_part = fname.split("_")[-1].replace(".nc", "")
+            start_str, end_str = date_part.split("-")
+
+            if tbl_id == 'Amon':
+                start_date = pd.to_datetime(start_str, format="%Y%m")
+                end_date = pd.to_datetime(end_str, format="%Y%m")
+            elif tbl_id == 'day':
+                start_date = pd.to_datetime(start_str, format="%Y%m%d")
+                end_date = pd.to_datetime(end_str, format="%Y%m%d")
+            else:
+                continue  # unknown table frequency: dates would be undefined
+            rows.append({
+                "path": file_path,
+                "activity_id": activity_id,
+                "institution_id": institution_id,
+                "source_id": source_id,
+                "experiment_id": experiment_id,
+                "member_id": member_id,
+                "table_id": table_id,
+                "variable_id": variable_id,
+                "grid_label": grid_label,
+                "version": version,
+                "start_date": start_date,
+                "end_date": end_date
+            })
+
+        df = pd.DataFrame(rows)
+        # keep only (institution, model) pairs that provide all requested experiments
+        grouped = df.groupby(["institution_id", "source_id"])["experiment_id"].unique()
+        valid_pairs = grouped[grouped.apply(lambda exps: set(self.experiments).issubset(set(exps)))].index
+        df = df[df.set_index(["institution_id", "source_id"]).index.isin(valid_pairs)]
+
+        # keep only versions with "v"
+        df = df[df['version'].str.contains('v')]
+
+        # compute file-level duration
+        df["years"] = (df["end_date"] - df["start_date"]).dt.days / 365.25
+
+        # compute total duration per dataset
+        coverage = df.groupby(
+            ["institution_id", "source_id", "experiment_id", "member_id", "variable_id", "grid_label"]
+        ).agg(
+            total_years=("years", "sum"),
+            start=("start_date", "min"),
+            end=("end_date", "max"),
+            nfiles=("path", "count")
+        ).reset_index()
+
+        # keep only groups with >= 60 years
+        valid_groups = coverage[coverage["total_years"] >= 60]
+
+        # filter original dataframe
+        df_filtered = df.merge(
+            valid_groups,
+            on=["institution_id", "source_id", "experiment_id", "member_id", "variable_id", "grid_label"],
+            how="inner"
+        )
+
+        return df_filtered
+
+    def _process_var_model(self, var, model, df_filtered, subset_experiments):
+        ds_list = []
+        for exp in subset_experiments:
+            df_filtered_sub = df_filtered[
+                (df_filtered['variable_id'] == var) &
+                (df_filtered['source_id'] == model) &
+                (df_filtered['experiment_id'] == exp)
+            ]
+            members = df_filtered_sub['member_id'].unique()
+            for i, member in enumerate(members[:3]):
+                df_filt = df_filtered_sub[
+                    (df_filtered_sub['experiment_id'] == exp) &
+                    (df_filtered_sub['member_id'] == member)
+                ]
+                if df_filt.empty:
+                    continue
+
+                paths = df_filt['path'].values
+                ds = xr.open_mfdataset(paths, combine="by_coords", chunks={"time": 365})
+                if var == "pr":
+                    ds[var] = units.convert_units_to(ds[var], "mm d-1")
+                elif var in ["tas", "tasmax", "tasmin"]:
+                    ds[var] = units.convert_units_to(ds[var], "degC")
+                ds = self._check_lat_lon(ds)
+                ds_europe = self._subset_by_bounds(
+                    ds,
+                    self.var_cfg.bounds[self.var_cfg.region]
+                )
+                ds_list.append(ds_europe.expand_dims({
+                    "experiment": [exp],
+                    "member": [i]
+                }))
+
+        if ds_list:
+            ds_list = xr.align(*ds_list, join="inner", exclude=["experiment", "member"])
+            combined_ds = xr.combine_by_coords(ds_list, combine_attrs="override")
+            return (var, model, combined_ds)
+        else:
+            return (var, model, None)
+
+    def load(self, df_filtered, vars_of_interest, subset_experiments=["historical", "hist-aer", "hist-GHG"]):
+        data_dict = defaultdict(dict)
+        var_model_pairs = list(
+            df_filtered[df_filtered['variable_id'].isin(vars_of_interest)]
+            [['variable_id', 'source_id']]
+            .drop_duplicates()
+            .itertuples(index=False, name=None)
+        )
+
+        with ProcessPoolExecutor(max_workers=4) as executor:
+            futures = [
+                executor.submit(self._process_var_model, var, model, df_filtered, subset_experiments)
+                for var, model in var_model_pairs
+            ]
+            for f in futures:
+                var, model, ds = f.result()
+                if ds is not None:
+                    data_dict[model][var] = ds.chunk({'lat': 10, 'lon': 10, 'time': -1})[var]
+        self.dataset = data_dict
+        return data_dict
+
+    def to_zarr(self, dataset=None):  # `dataset` is unused; kept for call compatibility
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` before `to_zarr()`.")
+        # load() builds model -> variable -> DataArray
+        for mod_name in self.dataset.keys():
+            for var_name in self.dataset[mod_name].keys():
+                ds_model = self.dataset[mod_name][var_name]
+
+                dataset_name = mod_name
+                region = self.var_cfg.region
+
+                if var_name == 'pr':
+                    ds_model.attrs['units'] = 'mm d-1'   # match the conversion applied in load()
+                elif var_name in ['tas', 'tasmax', 'tasmin']:
+                    ds_model.attrs['units'] = 'degC'
+
+                zarr_filename = self.var_cfg.output.filename.format(
+                    index=var_name,
+                    dataset=dataset_name,
+                    region=region,
+                    start=self.var_cfg.time_range.start_date,
+                    end=self.var_cfg.time_range.end_date,
+                    freq='1D',
+                )
+                zarr_path = os.path.join(f"data/{mod_name}/", zarr_filename)
+                os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
+
+                print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
+                ds_model.to_zarr(zarr_path, mode="w")
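A sketch of the intended call sequence for the new CMIP helper. The base_dir below is hypothetical and must sit at the directory depth fetch() assumes when splitting paths into activity/institution/source/experiment/member/table/variable/grid/version; the filename template is overridden because to_zarr() formats with different keys than the template shipped in config.yaml:

    from omegaconf import OmegaConf
    from climdata.datasets.CMIP import CMIP

    if __name__ == "__main__":  # load() spawns worker processes
        cfg = OmegaConf.load("climdata/conf/config.yaml")
        cfg.output.filename = "{index}_{dataset}_{region}_{start}_{end}_{freq}.zarr"
        cmip = CMIP(var_cfg=cfg, experiments=["historical", "hist-aer", "hist-GHG"])
        index_df = cmip.fetch(base_dir="/data/CMIP6", tbl_id="day")  # DataFrame of candidate files
        data = cmip.load(index_df, vars_of_interest=["tas", "pr"])   # model -> variable -> DataArray
        cmip.to_zarr()                                               # one Zarr store per model/variable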