climdata 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climdata-0.1.1 → climdata-0.1.3}/PKG-INFO +1 -1
- {climdata-0.1.1 → climdata-0.1.3}/climdata/__init__.py +2 -1
- climdata-0.1.3/climdata/conf/config.yaml +35 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/parameters.yaml +2 -2
- {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPCloud.py +55 -32
- {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/DWD.py +12 -20
- climdata-0.1.3/climdata/datasets/HYRAS.py +133 -0
- climdata-0.1.3/climdata/datasets/MSWX.py +228 -0
- climdata-0.1.3/climdata/utils/utils_download.py +241 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/PKG-INFO +1 -1
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/SOURCES.txt +7 -4
- climdata-0.1.3/examples/climdata_cli.py +98 -0
- climdata-0.1.3/examples/climdata_loader.ipynb +1230 -0
- climdata-0.1.3/examples/get_CMIP_box.py +57 -0
- climdata-0.1.3/examples/get_CMIP_loc.py +61 -0
- climdata-0.1.3/examples/get_MSWX_box.py +60 -0
- climdata-0.1.3/examples/get_MSWX_loc.py +71 -0
- {climdata-0.1.1 → climdata-0.1.3}/pyproject.toml +2 -2
- climdata-0.1.1/climdata/__main__.py +0 -5
- climdata-0.1.1/climdata/conf/config.yaml +0 -38
- climdata-0.1.1/climdata/datasets/MSWX.py +0 -302
- climdata-0.1.1/climdata/main.py +0 -56
- climdata-0.1.1/climdata/utils/utils_download.py +0 -975
- climdata-0.1.1/examples/extract_dwd_loc.ipynb +0 -2429
- climdata-0.1.1/examples/zarr_tas_data/metadata.json +0 -1
- {climdata-0.1.1 → climdata-0.1.3}/.editorconfig +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/docs-build.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/docs.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/installation.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/macos.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/pypi.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/ubuntu.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.github/workflows/windows.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/.gitignore +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/LICENSE +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/MANIFEST.in +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/README.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/variables.yaml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPlocal.py +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/ERA5.py +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/utils/__init__.py +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata/utils/config.py +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/dependency_links.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/entry_points.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/requires.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/climdata.egg-info/top_level.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/changelog.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/climdata.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/common.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/contributing.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/faq.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/index.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/installation.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/overrides/main.html +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/docs/usage.md +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/dwd_tas_LAT52.507_LON14.1372_1989-01-01_2020-12-31.csv +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/mkdocs.yml +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/requirements.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/requirements_dev.txt +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/setup.cfg +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/tests/__init__.py +0 -0
- {climdata-0.1.1 → climdata-0.1.3}/tests/test_climdata.py +0 -0
{climdata-0.1.1 → climdata-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climdata
-Version: 0.1.1
+Version: 0.1.3
 Summary: This project automates the fetching and extraction of weather data from multiple sources — such as MSWX, DWD HYRAS, ERA5-Land, NASA-NEX-GDDP, and more — for a given location and time range.
 Author-email: Kaushik Muduchuru <kaushik.reddy.m@gmail.com>
 License: MIT License
{climdata-0.1.1 → climdata-0.1.3}/climdata/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = """Kaushik Muduchuru"""
 __email__ = "kaushik.reddy.m@gmail.com"
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from .utils.utils_download import *
 from .utils.config import load_config
@@ -11,4 +11,5 @@ from .datasets.MSWX import MSWXmirror as MSWX
 from .datasets.ERA5 import ERA5Mirror as ERA5
 from .datasets.CMIPlocal import CMIPmirror as CMIPlocal
 from .datasets.CMIPCloud import CMIPCloud as CMIP
+from .datasets.HYRAS import HYRASmirror as HYRAS
 
climdata-0.1.3/climdata/conf/config.yaml
@@ -0,0 +1,35 @@
+defaults:
+  - _self_
+  - mappings/parameters
+  - mappings/variables
+
+dataset: MSWX
+lat: null
+lon: null
+
+variables: ["tasmin","tasmax","pr"]
+
+data_dir: ./data
+region: None
+
+experiment_id: historical
+source_id: MIROC6
+table_id: day
+
+bounds:
+  europe:
+    lat_min: 34.0
+    lat_max: 71.0
+    lon_min: -25.0
+    lon_max: 45.0
+
+time_range:
+  start_date: "1989-01-01"
+  end_date: "2020-12-31"
+
+output:
+  out_dir: "./climdata/data/"
+  filename_csv: "{provider}_{parameter}_LAT_{lat}_LON_{lon}_{start}_{end}.csv"
+  filename_zarr: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.zarr"
+  filename_nc: "{provider}_{parameter}_LAT{lat_range}_LON{lon_range}_{start}_{end}.nc"
+  fmt: "standard"
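The new config.yaml is a Hydra-style entry point: the defaults list pulls in the parameter and variable mappings, and the remaining keys select a dataset, variables, bounds and output filename templates. A minimal sketch of how it might be composed from user code follows; the relative config_path is an assumption about where the packaged climdata/conf directory sits relative to the calling script.

from hydra import compose, initialize
from omegaconf import OmegaConf

# Sketch only: compose the packaged config and override a few top-level keys.
with initialize(version_base=None, config_path="climdata/conf"):
    cfg = compose(
        config_name="config",
        overrides=["dataset=MSWX", "lat=52.5", "lon=13.4"],
    )

print(cfg.dataset)                        # MSWX
print(list(cfg.variables))                # ['tasmin', 'tasmax', 'pr']
print(OmegaConf.to_yaml(cfg.time_range))  # start_date / end_date block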
{climdata-0.1.1 → climdata-0.1.3}/climdata/conf/mappings/parameters.yaml
@@ -21,7 +21,7 @@ dwd:
       resolution: daily
       dataset: climate_summary
       name: precipitation_height
-      unit: mm
+      unit: mm d-1
     rsds:
       resolution: daily
       dataset: solar
@@ -124,7 +124,7 @@ mswx:
     rsds:
       name: downward_shortwave_radiation
       folder_id: 1usXbIOi4_jBUdDaZbzPKXznx9PTYzHRv
-
+hyras:
   variables:
     tasmin:
       name: tasmin
{climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/CMIPCloud.py
@@ -1,20 +1,32 @@
 import intake
 import xarray as xr
 import pandas as pd
+from omegaconf import DictConfig
+import intake
+import xarray as xr
+import pandas as pd
+from omegaconf import DictConfig
+
 
 class CMIPCloud:
-    def __init__(self,
-
-        self.
-        self.
-        self.
-        self.
+    def __init__(self, cfg: DictConfig):
+        # Directly read from flat config
+        self.experiment_id = cfg.experiment_id
+        self.source_id = cfg.source_id
+        self.table_id = cfg.table_id
+        self.variables = cfg.variables
+        self.start_date = cfg.time_range.start_date
+        self.end_date = cfg.time_range.end_date
+
         self.col_subsets = []
         self.ds = None
+        self.col = None
 
     def fetch(self):
         """Collect intake catalog subsets for each variable."""
-        col = intake.open_esm_datastore(
+        col = intake.open_esm_datastore(
+            "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
+        )
         self.col_subsets = []
         for var in self.variables:
             query = dict(
@@ -27,13 +39,16 @@ class CMIPCloud:
             if len(col_subset.df) == 0:
                 continue
             self.col_subsets.append(col_subset)
+        self.col = col
         return self.col_subsets
 
     def load(self):
         """Load and merge datasets from collected col_subsets."""
         datasets = []
         for col_subset in self.col_subsets:
-            zstore_path = col_subset.df.zstore.values[0].replace(
+            zstore_path = col_subset.df.zstore.values[0].replace(
+                "gs:/", "https://storage.googleapis.com"
+            )
             ds_var = xr.open_zarr(zstore_path)
             datasets.append(ds_var)
         if datasets:
@@ -51,25 +66,25 @@ class CMIPCloud:
 
         if self.ds is None:
             raise ValueError("No dataset loaded. Call `load()` first.")
-
+
+        self._subset_time(self.start_date, self.end_date)
+
         ds = self.ds
-
         if point is not None:
             lon, lat = point
             if buffer_km > 0:
                 buffer_deg = buffer_km / 111
                 ds_subset = ds.sel(
-                    lon=slice(lon-buffer_deg, lon+buffer_deg),
-                    lat=slice(lat-buffer_deg, lat+buffer_deg)
+                    lon=slice(lon - buffer_deg, lon + buffer_deg),
+                    lat=slice(lat - buffer_deg, lat + buffer_deg),
                 )
             else:
                 ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
 
         elif box is not None:
-            # Accept dict: {'lat_min': ..., 'lat_max': ..., 'lon_min': ..., 'lon_max': ...}
             ds_subset = ds.sel(
-                lon=slice(box[
-                lat=slice(box[
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
             )
 
         elif shapefile is not None:
@@ -83,6 +98,7 @@ class CMIPCloud:
             gdf = gdf.to_crs(epsg=4326)
             geom = [mapping(g) for g in gdf.geometry]
             import rioxarray
+
             ds = ds.rio.write_crs("EPSG:4326", inplace=False)
             ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
 
@@ -90,11 +106,9 @@ class CMIPCloud:
             raise ValueError("Must provide either point, box, or shapefile.")
         self.ds = ds_subset
         return ds_subset
+
     def _subset_time(self, start_date, end_date):
-        """
-        Subset the dataset by time range.
-        Dates should be strings in 'YYYY-MM-DD' format.
-        """
+        """Subset the dataset by time range."""
         if self.ds is None:
             return None
         ds_time = self.ds.sel(time=slice(start_date, end_date))
@@ -114,29 +128,38 @@ class CMIPCloud:
         print(f"Saved Zarr to {store_path}")
 
     def _format(self, df):
-        """
-        Format the dataframe for standardized output:
-        - Adds source_id, experiment_id, table_id, variable, value, units columns.
-        - Stacks variables into long format.
-        """
-        # Melt the dataframe to long format: variable, value
+        """Format dataframe for standardized output."""
         value_vars = [v for v in self.variables if v in df.columns]
         id_vars = [c for c in df.columns if c not in value_vars]
-        df_long = df.melt(id_vars=id_vars, value_vars=value_vars,
-                          var_name="variable", value_name="value")
 
-
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
         df_long["units"] = df_long["variable"].map(
-            lambda v: self.ds[v].attrs.get("units", "unknown")
+            lambda v: self.ds[v].attrs.get("units", "unknown")
+            if v in self.ds.data_vars
+            else "unknown"
         )
 
-        # Add metadata columns if missing
         df_long["source"] = self.source_id
         df_long["experiment"] = self.experiment_id
         df_long["table"] = self.table_id
 
-
-
+        cols = [
+            "source",
+            "experiment",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
         df_long = df_long[[c for c in cols if c in df_long.columns]]
 
         return df_long
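Taken together, the reworked CMIPCloud reads its query straight from a flat config and exposes a fetch → load → extract flow. A hedged usage sketch, assuming the class is driven with a hand-built OmegaConf object and that extract accepts the point/box/shapefile keywords seen in its body above; field names follow config.yaml and the attributes read in __init__:

from omegaconf import OmegaConf
from climdata import CMIP  # CMIPCloud, re-exported in climdata/__init__.py

# Minimal hand-built config; in practice this comes from the Hydra config above.
cfg = OmegaConf.create({
    "experiment_id": "historical",
    "source_id": "MIROC6",
    "table_id": "day",
    "variables": ["tasmax", "pr"],
    "time_range": {"start_date": "1989-01-01", "end_date": "1990-12-31"},
})

cmip = CMIP(cfg)
cmip.fetch()                           # query the Pangeo CMIP6 catalog per variable
cmip.load()                            # open the Zarr stores and merge them
ds = cmip.extract(point=(13.4, 52.5))  # (lon, lat); also applies the configured time range
print(ds)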
{climdata-0.1.1 → climdata-0.1.3}/climdata/datasets/DWD.py
@@ -3,23 +3,16 @@ import pandas as pd
 import hydra
 from wetterdienst import Settings
 from wetterdienst.provider.dwd.observation import DwdObservationRequest
-from climdata.utils.utils_download import build_output_filename
 
 class DWDmirror:
     def __init__(self, cfg):
         self.cfg = cfg
         self.param_mapping = cfg.mappings
-        self.provider = cfg.dataset.lower()
-        self.parameter_key = cfg.weather.parameter
-        self.lat = cfg.location.lat
-        self.lon = cfg.location.lon
-        self.distance = cfg.location.buffer_km
         self.start_date = cfg.time_range.start_date
         self.end_date = cfg.time_range.end_date
-        self.units = self.param_mapping[self.provider]['variables'][self.parameter_key].get("unit", None)
         self.df = None
-    def
-        param_info = self.param_mapping
+    def load(self, variable, lat_loc, lon_loc, buffer_km = 50):
+        param_info = self.param_mapping.dwd.variables[variable]
         resolution = param_info["resolution"]
         dataset = param_info["dataset"]
         variable_name = param_info["name"]
@@ -31,8 +24,8 @@ class DWDmirror:
             end_date=self.end_date,
             settings=settings
         ).filter_by_distance(
-            latlon=(
-            distance=
+            latlon=(lat_loc, lon_loc),
+            distance=buffer_km,
             unit="km"
         )
 
@@ -40,7 +33,7 @@ class DWDmirror:
         self.df = df
         return self.df
 
-    def format(self):
+    def format(self, variable, lat_loc, lon_loc):
         self.df['date'] = pd.to_datetime(self.df['date'])
         self.df = self.df.groupby(['date']).agg({
             'value': 'mean',
@@ -56,18 +49,17 @@ class DWDmirror:
             "value": "value",
             "station_id": "frequent_station",
         })
-        self.df["variable"] =
-        self.df["
-        self.df["
+        self.df["variable"] = variable
+        self.df["lat"] = lat_loc
+        self.df["lon"] = lon_loc
         self.df['source'] = 'DWD'
-        self.df['units'] = self.
-        self.df = self.df[["
+        self.df['units'] = self.param_mapping.dwd.variables[variable].unit
+        self.df = self.df[["lat", "lon", "time", "source", "variable", "value", "units"]]
         # self.df = df
         return self.df
 
-    def
-        filename =
-        self.df.to_csv(self.cfg.output.out_dir+filename, index=False)
+    def save_csv(self,filename):
+        self.df.to_csv(filename, index=False)
        print(f"✅ Saved time series to: {filename}")
         return filename
 
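The DWDmirror refactor moves the variable and location out of the constructor and into load()/format(), so one instance can serve several variables and points, and save_csv() now takes the target filename directly. A sketch under that reading, with a hand-built config providing only the pieces the class actually reads (mappings.dwd.variables and time_range); the station coordinates and output filename are illustrative:

from omegaconf import OmegaConf
from climdata.datasets.DWD import DWDmirror

cfg = OmegaConf.create({
    "time_range": {"start_date": "1989-01-01", "end_date": "2020-12-31"},
    "mappings": {
        "dwd": {
            "variables": {
                "pr": {
                    "resolution": "daily",
                    "dataset": "climate_summary",
                    "name": "precipitation_height",
                    "unit": "mm d-1",
                },
            },
        },
    },
})

dwd = DWDmirror(cfg)
dwd.load("pr", lat_loc=52.507, lon_loc=14.137, buffer_km=50)  # stations within 50 km
df = dwd.format("pr", lat_loc=52.507, lon_loc=14.137)         # long-format table with units
dwd.save_csv("dwd_pr_LAT52.507_LON14.137_1989-01-01_2020-12-31.csv")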
climdata-0.1.3/climdata/datasets/HYRAS.py
@@ -0,0 +1,133 @@
+import os
+import pandas as pd
+import xarray as xr
+from datetime import datetime
+from omegaconf import DictConfig
+from climdata.utils.utils_download import find_nearest_xy, fetch_dwd
+import geopandas as gpd
+
+class HYRASmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def fetch(self, variable: str):
+        """
+        Download HYRAS NetCDF files for a given variable and time range.
+        """
+        fetch_dwd(self.cfg,variable)
+        # Build file list for the variable and time range
+        param_mapping = self.cfg.mappings
+        provider = self.cfg.dataset.lower()
+        parameter_key = variable
+        param_info = param_mapping[provider]['variables'][parameter_key]
+        prefix = param_info["prefix"]
+        version = param_info["version"]
+        start_year = datetime.fromisoformat(self.cfg.time_range.start_date).year
+        end_year = datetime.fromisoformat(self.cfg.time_range.end_date).year
+        files = []
+        for year in range(start_year, end_year + 1):
+            file_name = f"{prefix}_{year}_{version}_de.nc"
+            files.append(os.path.join(self.cfg.data_dir, provider, parameter_key.upper(), file_name))
+        self.files = files
+        return files
+
+    def load(self, variable: str):
+        """
+        Load HYRAS NetCDFs for a given variable into a single xarray Dataset.
+        """
+        files = self.fetch(variable)
+        datasets = []
+        for f in files:
+            if not os.path.exists(f):
+                print(f"File not found: {f}")
+                continue
+            try:
+                ds = xr.open_dataset(f)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+        dset = xr.concat(datasets, dim="time")
+        dset[variable] = dset[variable].transpose("time", "y", "x")
+        self.dataset = dset
+        return self.dataset
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        """
+        Extract data from the loaded HYRAS dataset.
+
+        Parameters
+        ----------
+        point : tuple (lon, lat), optional
+            Extracts a time series at the nearest grid point.
+        box : dict with lat/lon bounds, optional
+            Example: {"lat_min": 47, "lat_max": 49, "lon_min": 10, "lon_max": 12}
+        shapefile : str, optional
+            Path to a shapefile to clip the dataset spatially.
+        buffer_km : float, optional
+            Buffer distance (in kilometers) applied to the shapefile before clipping.
+        """
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+        ds = self.dataset
+
+        # Point extraction
+        if point is not None:
+            lat, lon = point[1], point[0]
+            iy, ix = find_nearest_xy(ds, lat, lon)
+            print(f"📌 Nearest grid point at (y,x)=({iy},{ix})")
+            ts = ds.isel(x=ix, y=iy)
+            self.dataset = ts
+            return ts
+
+        # Box extraction
+        elif box is not None:
+            if not all(k in box for k in ["lat_min", "lat_max", "lon_min", "lon_max"]):
+                raise ValueError("Box must contain lat_min, lat_max, lon_min, lon_max.")
+            dset_box = ds.sel(
+                y=slice(box["lat_max"], box["lat_min"]),  # y usually decreasing (north -> south)
+                x=slice(box["lon_min"], box["lon_max"])
+            )
+            print(f"📦 Extracted box with shape: {dset_box.dims}")
+            self.dataset = dset_box
+            return dset_box
+
+        # Shapefile extraction
+        elif shapefile is not None:
+            gdf = gpd.read_file(shapefile)
+
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)  # project to meters
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)  # buffer in meters
+                gdf = gdf.to_crs(epsg=4326)  # back to lat/lon
+
+            # Ensure dataset has CRS info for clipping
+            if not ds.rio.crs:
+                ds = ds.rio.write_crs("EPSG:4326")
+
+            dset_clipped = ds.rio.clip(gdf.geometry, gdf.crs, drop=True)
+            print(f"🗺️ Extracted shapefile area with dims: {dset_clipped.dims}")
+            self.dataset = dset_clipped
+            return dset_clipped
+
+        else:
+            raise NotImplementedError("Must provide either point, box, or shapefile.")
+
+    def save_csv(self, filename, df=None):
+        """
+        Save the extracted time series to CSV.
+        """
+        if df is None:
+            if self.dataset is None:
+                raise ValueError("No dataset loaded or extracted.")
+            # If dataset is a DataArray, convert to DataFrame
+            if isinstance(self.dataset, xr.Dataset):
+                df = self.dataset.to_dataframe().reset_index()
+            else:
+                raise ValueError("Please provide a DataFrame or extract a point first.")
+        df.to_csv(filename, index=False)
+        print(f"Saved CSV to {filename}")
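HYRASmirror mirrors the yearly DWD HYRAS NetCDFs into data_dir and then works on the concatenated grid. A sketch of the intended flow, assuming the config is composed as in the config.yaml example above with dataset=HYRAS and that mappings/parameters.yaml carries the prefix/version entries the fetch step reads; the coordinates and filename are illustrative:

from hydra import compose, initialize
from climdata import HYRAS  # HYRASmirror, re-exported in climdata/__init__.py

with initialize(version_base=None, config_path="climdata/conf"):
    cfg = compose(config_name="config", overrides=["dataset=HYRAS"])

hyras = HYRAS(cfg)
hyras.load("tasmin")                        # download missing yearly files, concatenate on time
ts = hyras.extract(point=(14.137, 52.507))  # (lon, lat) -> nearest y/x cell via find_nearest_xy
hyras.save_csv("hyras_tasmin_LAT52.507_LON14.137.csv")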
climdata-0.1.3/climdata/datasets/MSWX.py
@@ -0,0 +1,228 @@
+import pandas as pd
+import geopandas as gpd
+import os
+from tqdm import tqdm
+import warnings
+from datetime import datetime, timedelta
+import xarray as xr
+from omegaconf import DictConfig
+
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+
+from climdata.utils.utils_download import list_drive_files, download_drive_file
+from shapely.geometry import mapping
+import cf_xarray
+
+warnings.filterwarnings("ignore", category=Warning)
+
+
+class MSWXmirror:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.dataset = None
+        self.variables = cfg.variables
+        self.files = []
+
+    def _fix_coords(self, ds: xr.Dataset | xr.DataArray):
+        """Ensure latitude is ascending and longitude is in the range [0, 360]."""
+        ds = ds.cf.sortby("latitude")
+        lon_name = ds.cf["longitude"].name
+        ds = ds.assign_coords({lon_name: ds.cf["longitude"] % 360})
+        return ds.sortby(lon_name)
+
+    def fetch(self, folder_id: str, variable: str):
+        """
+        Fetch MSWX files from Google Drive for a given variable.
+        """
+        start = datetime.fromisoformat(self.cfg.time_range.start_date)
+        end = datetime.fromisoformat(self.cfg.time_range.end_date)
+
+        expected_files = []
+        current = start
+        while current <= end:
+            doy = current.timetuple().tm_yday
+            basename = f"{current.year}{doy:03d}.nc"
+            expected_files.append(basename)
+            current += timedelta(days=1)
+
+        output_dir = self.cfg.data_dir
+        local_files, missing_files = [], []
+
+        for basename in expected_files:
+            local_path = os.path.join(output_dir,self.cfg.dataset, variable, basename)
+            if os.path.exists(local_path):
+                local_files.append(basename)
+            else:
+                missing_files.append(basename)
+
+        if not missing_files:
+            print(f"✅ All {len(expected_files)} {variable} files already exist locally.")
+            return local_files
+
+        print(f"📂 {len(local_files)} exist, {len(missing_files)} missing — fetching {variable} from Drive...")
+
+        SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+        creds = service_account.Credentials.from_service_account_file(
+            self.cfg.mappings.mswx.params.google_service_account, scopes=SCOPES
+        )
+        service = build('drive', 'v3', credentials=creds)
+
+        drive_files = list_drive_files(folder_id, service)
+        valid_filenames = set(missing_files)
+        files_to_download = [f for f in drive_files if f['name'] in valid_filenames]
+
+        if not files_to_download:
+            print(f"⚠️ No {variable} files found in Drive for requested dates.")
+            return local_files
+
+        for file in files_to_download:
+            filename = file['name']
+            local_path = os.path.join(output_dir, self.cfg.dataset, variable, filename)
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            print(f"⬇️ Downloading {filename} ...")
+            download_drive_file(file['id'], local_path, service)
+            local_files.append(filename)
+
+        return local_files
+
+    def load(self, variable: str):
+        """
+        Load MSWX NetCDFs for a given variable into a single xarray Dataset.
+        """
+        folder_id = self.cfg.mappings["mswx"]["variables"][variable]["folder_id"]
+        files = self.fetch(folder_id, variable)
+        datasets = []
+
+        for f in files:
+            local_path = os.path.join(self.cfg.data_dir, self.cfg.dataset.lower(), variable, f)
+            try:
+                ds = xr.open_dataset(local_path, chunks="auto", engine="netcdf4")[self.cfg.mappings[self.cfg.dataset].variables[variable].name]
+                ds = ds.rename(variable)
+                datasets.append(ds)
+            except Exception as e:
+                print(f"Skipping file {f} due to error: {e}")
+
+        if not datasets:
+            raise RuntimeError(f"No datasets could be loaded for {variable}.")
+
+        dset = xr.concat(datasets, dim="time")
+        dset = dset.transpose("time", "lat", "lon")
+        dset = self._fix_coords(dset)
+
+        self.dataset = dset
+        return self.dataset
+
+    def to_zarr(self, zarr_filename: str):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        var_name = self.dataset.name
+        if var_name == 'pr':
+            self.dataset.attrs['units'] = 'mm/day'
+        elif var_name in ['tas', 'tasmax', 'tasmin']:
+            self.dataset.attrs['units'] = 'degC'
+
+        zarr_path = os.path.join("data/MSWX", zarr_filename)
+        os.makedirs(os.path.dirname(zarr_path), exist_ok=True)
+
+        print(f"💾 Saving {var_name} to Zarr: {zarr_path}")
+        self.dataset.to_zarr(zarr_path, mode="w")
+
+    def extract(self, *, point=None, box=None, shapefile=None, buffer_km=0.0):
+        if self.dataset is None:
+            raise ValueError("No dataset loaded. Call `load()` first.")
+
+        ds = self.dataset.rio.write_crs("EPSG:4326", inplace=False)
+
+        if point is not None:
+            lon, lat = point
+            if buffer_km > 0:
+                buffer_deg = buffer_km / 111
+                ds_subset = ds.sel(
+                    lon=slice(lon-buffer_deg, lon+buffer_deg),
+                    lat=slice(lat-buffer_deg, lat+buffer_deg),
+                )
+            else:
+                ds_subset = ds.sel(lon=lon, lat=lat, method="nearest")
+
+        elif box is not None:
+            ds_subset = ds.sel(
+                lon=slice(box["lon_min"], box["lon_max"]),
+                lat=slice(box["lat_min"], box["lat_max"]),
+            )
+
+        elif shapefile is not None:
+            if isinstance(shapefile, str):
+                gdf = gpd.read_file(shapefile)
+            else:
+                gdf = shapefile
+            if buffer_km > 0:
+                gdf = gdf.to_crs(epsg=3857)
+                gdf["geometry"] = gdf.buffer(buffer_km * 1000)
+                gdf = gdf.to_crs(epsg=4326)
+            geom = [mapping(g) for g in gdf.geometry]
+            ds_subset = ds.rio.clip(geom, gdf.crs, drop=True)
+
+        else:
+            raise ValueError("Must provide either point, box, or shapefile.")
+
+        self.dataset = ds_subset.to_dataset()
+        return ds_subset
+
+    # def to_dataframe(self, ds=None):
+    #     if ds is None:
+    #         if self.dataset is None:
+    #             raise ValueError("No dataset loaded. Call `load()` first or pass `ds`.")
+    #         ds = self.dataset
+
+    #     if isinstance(ds, xr.Dataset):
+    #         if len(ds.data_vars) != 1:
+    #             raise ValueError("Dataset has multiple variables. Please select one.")
+    #         ds = ds[list(ds.data_vars)[0]]
+
+    #     df = ds.to_dataframe().reset_index()
+    #     df = df[["time", "lat", "lon", ds.name]]
+    #     df = df.rename(columns={"lat": "latitude", "lon": "longitude", ds.name: "value"})
+    #     return df
+
+    def _format(self, df):
+        """Format dataframe for standardized output."""
+        value_vars = [v for v in self.variables if v in df.columns]
+        id_vars = [c for c in df.columns if c not in value_vars]
+
+        df_long = df.melt(
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name="variable",
+            value_name="value",
+        )
+
+        df_long["units"] = df_long["variable"].map(
+            lambda v: self.dataset[v].attrs.get("units", "unknown")
+            if v in self.dataset.data_vars
+            else "unknown"
+        )
+
+        df_long["source"] = self.cfg.dataset
+
+        cols = [
+            "source",
+            "table",
+            "time",
+            "lat",
+            "lon",
+            "variable",
+            "value",
+            "units",
+        ]
+        df_long = df_long[[c for c in cols if c in df_long.columns]]
+
+        return df_long
+
+    def save_csv(self, filename):
+        if self.dataset is not None:
+            df = self.dataset.to_dataframe().reset_index()
+            df = self._format(df)
+            df.to_csv(filename, index=False)
+            print(f"Saved CSV to {filename}")
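MSWXmirror follows the same pattern but pulls the daily files from Google Drive, so it expects a service-account JSON configured under mappings.mswx.params.google_service_account. A closing sketch of the point-extraction path, assuming the config is composed as in the config.yaml example above with dataset=MSWX; the coordinates and output filename are illustrative only, and box or shapefile extraction works the same way:

from hydra import compose, initialize
from climdata import MSWX  # MSWXmirror, re-exported in climdata/__init__.py

with initialize(version_base=None, config_path="climdata/conf"):
    cfg = compose(config_name="config", overrides=["dataset=MSWX"])

mswx = MSWX(cfg)
mswx.load("pr")                               # fetch missing daily NetCDFs, concat, fix lat/lon ordering
mswx.extract(point=(13.4, 52.5))              # (lon, lat); nearest grid cell
mswx.save_csv("MSWX_pr_LAT52.5_LON13.4.csv")  # long-format table via _format()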