pyreclaim 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/PKG-INFO +9 -3
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/README.md +2 -2
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/pyproject.toml +7 -1
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/setup.py +1 -1
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/PKG-INFO +9 -3
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/SOURCES.txt +1 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/requires.txt +6 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/feature_engineering_and_transformation.py +68 -7
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/catchment_dynamic.py +50 -44
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/reservoir_dynamic.py +68 -70
- pyreclaim-0.5.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +104 -0
- pyreclaim-0.5.0/src/reclaim/generate_features.py +297 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/reclaim.py +18 -5
- pyreclaim-0.5.0/src/reclaim/static_features/catchment_static.py +202 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/reservoir_static.py +47 -8
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/aec_shape.py +2 -2
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/area_perimeter.py +1 -1
- pyreclaim-0.5.0/src/reclaim/static_features/utils/basin_names.py +78 -0
- pyreclaim-0.5.0/src/reclaim/static_features/utils/catchment_agreggate.py +355 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/flow_length.py +65 -1
- pyreclaim-0.3.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +0 -63
- pyreclaim-0.3.0/src/reclaim/generate_features.py +0 -141
- pyreclaim-0.3.0/src/reclaim/static_features/catchment_static.py +0 -127
- pyreclaim-0.3.0/src/reclaim/static_features/utils/catchment_agreggate.py +0 -147
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/LICENSE +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/setup.cfg +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/dependency_links.txt +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/top_level.txt +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/__init__.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/__init__.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/__init__.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/__init__.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/catchment_meteorology.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/inflow_outflow.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/rainfall.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/statistical_metrics.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/__init__.py +0 -0
- {pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/__init__.py +0 -0
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyreclaim
-Version: 0.3.0
+Version: 0.5.0
 Summary: Reservoir Estimation of Capacity Loss using AI based Methods
 Author-email: Sanchit Minocha <msanchit@uw.edu>
 License: GNU GENERAL PUBLIC LICENSE

@@ -698,6 +698,12 @@ Requires-Dist: joblib
 Requires-Dist: xgboost
 Requires-Dist: lightgbm
 Requires-Dist: catboost
+Requires-Dist: openpyxl
+Requires-Dist: netcdf4
+Requires-Dist: dask
+Requires-Dist: rioxarray
+Requires-Dist: matplotlib
+Requires-Dist: tqdm
 Dynamic: license-file
 
 <div align="center">

@@ -737,7 +743,7 @@ pip install pyreclaim
 
 To generate features for reservoirs using the **RECLAIM** framework and the [`pyreclaim`](https://pypi.org/project/pyreclaim/) Python package, you will need the global datasets.
 
-You can download all required global datasets from the
+You can download all required global datasets from the Zenodo Repository:
 
 [Download Global Datasets](https://doi.org/10.5281/zenodo.17230533)
 

@@ -763,7 +769,7 @@ from reclaim.reclaim import Reclaim
 reservoir_static = {
     "obc": 150.0,
     "hgt": 45.0,
-    "mrb":
+    "mrb": 4030033640,
     "lat": 25.6,
     "lon": 81.9,
     "reservoir_polygon": reservoir_polygon,
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/README.md

@@ -35,7 +35,7 @@ pip install pyreclaim
 
 To generate features for reservoirs using the **RECLAIM** framework and the [`pyreclaim`](https://pypi.org/project/pyreclaim/) Python package, you will need the global datasets.
 
-You can download all required global datasets from the
+You can download all required global datasets from the Zenodo Repository:
 
 [Download Global Datasets](https://doi.org/10.5281/zenodo.17230533)
 

@@ -61,7 +61,7 @@ from reclaim.reclaim import Reclaim
 reservoir_static = {
     "obc": 150.0,
     "hgt": 45.0,
-    "mrb":
+    "mrb": 4030033640,
    "lat": 25.6,
     "lon": 81.9,
     "reservoir_polygon": reservoir_polygon,
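For orientation, a minimal sketch of the quick-start these README hunks belong to. Only the `reservoir_static` fragment and the `from reclaim.reclaim import Reclaim` import are confirmed by the diff; the geometry loading and the comments are assumptions:

```python
# Hedged sketch assembled around the README fragment above (not the full quick-start).
import geopandas as gpd
from reclaim.reclaim import Reclaim  # import shown in the hunk context

# Assumption: the reservoir polygon is a shapely geometry, e.g. read from a shapefile.
reservoir_polygon = gpd.read_file("reservoir.shp").geometry.iloc[0]

reservoir_static = {
    "obc": 150.0,           # static reservoir attribute (units per pyreclaim docs)
    "hgt": 45.0,
    "mrb": 4030033640,      # basin identifier filled in by the 0.5.0 README fix
    "lat": 25.6,
    "lon": 81.9,
    "reservoir_polygon": reservoir_polygon,
}
```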
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyreclaim"
-version = "v0.3.0"
+version = "v0.5.0"
 authors = [
   { name="Sanchit Minocha", email="msanchit@uw.edu" },
 ]

@@ -30,6 +30,12 @@ dependencies = [
     "xgboost",
     "lightgbm",
     "catboost",
+    "openpyxl",
+    "netcdf4",
+    "dask",
+    "rioxarray",
+    "matplotlib",
+    "tqdm"
 ]
 
 [project.urls]
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyreclaim
-Version: 0.3.0
+Version: 0.5.0
 Summary: Reservoir Estimation of Capacity Loss using AI based Methods
 Author-email: Sanchit Minocha <msanchit@uw.edu>
 License: GNU GENERAL PUBLIC LICENSE

@@ -698,6 +698,12 @@ Requires-Dist: joblib
 Requires-Dist: xgboost
 Requires-Dist: lightgbm
 Requires-Dist: catboost
+Requires-Dist: openpyxl
+Requires-Dist: netcdf4
+Requires-Dist: dask
+Requires-Dist: rioxarray
+Requires-Dist: matplotlib
+Requires-Dist: tqdm
 Dynamic: license-file
 
 <div align="center">

@@ -737,7 +743,7 @@ pip install pyreclaim
 
 To generate features for reservoirs using the **RECLAIM** framework and the [`pyreclaim`](https://pypi.org/project/pyreclaim/) Python package, you will need the global datasets.
 
-You can download all required global datasets from the
+You can download all required global datasets from the Zenodo Repository:
 
 [Download Global Datasets](https://doi.org/10.5281/zenodo.17230533)
 

@@ -763,7 +769,7 @@ from reclaim.reclaim import Reclaim
 reservoir_static = {
     "obc": 150.0,
     "hgt": 45.0,
-    "mrb":
+    "mrb": 4030033640,
     "lat": 25.6,
     "lon": 81.9,
     "reservoir_polygon": reservoir_polygon,
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/SOURCES.txt

@@ -27,5 +27,6 @@ src/reclaim/static_features/reservoir_static.py
 src/reclaim/static_features/utils/__init__.py
 src/reclaim/static_features/utils/aec_shape.py
 src/reclaim/static_features/utils/area_perimeter.py
+src/reclaim/static_features/utils/basin_names.py
 src/reclaim/static_features/utils/catchment_agreggate.py
 src/reclaim/static_features/utils/flow_length.py
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/feature_engineering_and_transformation.py

@@ -1,6 +1,45 @@
 import pandas as pd
 import numpy as np
 
+ALL_FEATURES = [
+    'log_OBC', 'log_HGT', 'MRB', 'LAT', 'LON',
+    'log_RA', 'log_RP', 'log_FL',
+    'log_CA', 'log_DCA',
+
+    'AECS', 'AECC', 'AECI',
+
+    'log_LCAS', 'log_LCC',
+    'log_LCG', 'log_LCT', 'log_LCS',
+    'log_LCHV', 'log_LCM',
+    'log_LCSV', 'log_LCBS',
+    'log_LCSG', 'log_LCWB', 'DLC',
+
+    'COAR', 'SAND', 'SILT', 'CLAY', 'BULK',
+
+    'ELEV', 'SLOP', 'CURV', 'ASP', 'HILL',
+
+    'log_MAI', 'log_PAI', 'I_cv',
+    'log_I_std', 'I_above_90', 'I_max_persis',
+    'log_MAO', 'log_O_std', 'O_cv',
+    'E_mean', 'E_std',
+    'log_SA_mean', 'log_SA_std', 'SA_cv', 'SA_skew', 'log_SA_kurt',
+    'log_SA_mean_clip', 'SA_above_90',
+    'NSSC1_mean', 'NSSC1_std', 'NSSC1_cv', 'NSSC1_skew', 'NSSC1_kurt',
+    'NSSC2_mean', 'NSSC2_above_90', 'NSSC2_max_persis',
+
+    'log_MAR', '#_rain_above_10', '#_rain_above_50', '#_rain_above_100',
+    'tmin_mean', 'tmax_mean',
+    'wind_mean', 'wind_std', 'wind_cv', 'wind_skew', 'wind_kurt',
+
+    'AGE', 'log_ROBC', 'log_GC',
+    'NVGF',
+    'R_tree_bare', 'R_shrub_bare', 'R_coarse_sand',
+    'log_rel_SA_mean_clip', 'log_R_SA_cap',
+    'log_rain_per_area',
+    'log_TE', 'log_RT', 'log_ECLR', 'ESR',
+    'log_SIN', 'log_SOUT',
+]
+
 def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
     """
     Engineer and transform features in reservoir/catchment dataset.

@@ -54,22 +93,44 @@ def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
         "SOUT": df["MAO"] * df["NSSC2_mean"],
     }
 
-    # Land cover log-area features
-    lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
-    for col in lc_cols:
-        feature_dict[col] = df["CA"] * df[col] / 100
-
     df = pd.concat([df, pd.DataFrame(feature_dict)], axis=1)
 
+    # Land cover log-area features
+    lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
+    # for col in lc_cols:
+    #     df[col] = df["CA"] * df[col] / 100
+    # Calculation is done together with the log transform, as in model training.
+    # Results will differ slightly where the land-cover percentage is 0.
+
     # -------------------------
     # APPLY LOG TRANSFORMATIONS
     # -------------------------
     log_candidates = ['CA','DCA','OBC','HGT','RA','RP','FL',
                       'SA_mean','SA_mean_clip','SA_std','SA_kurt','PAI','MAI','MAO','I_std','O_std','MAR',
-                      'rain_per_area','GC','TE','ECLR','SIN','SOUT'] + lc_cols
+                      'ROBC','rain_per_area','GC','TE','RT','ECLR','SIN','SOUT','rel_SA_mean_clip','R_SA_cap'] + lc_cols
 
     for col in log_candidates:
         log_col = f'log_{col}'  # add prefix to avoid double log
-
+        try:
+            if col in ['ECLR','SIN','SOUT']:
+                # These columns can be zero (up to 15 decimal places); clip at 1e-15
+                df[log_col] = np.log(df[col].clip(lower=1e-15))
+            elif col in ['rain_per_area']:
+                # Rain per area can be zero (up to 10 decimal places); clip at 1e-10
+                df[log_col] = np.log(df[col].clip(lower=1e-10))
+            elif col in lc_cols:
+                df[log_col] = np.log(df["CA"].clip(lower=1e-6)) + np.log(df[col].clip(lower=1e-6)) - np.log(100)
+            else:
+                # All other columns can be zero (up to 6 decimal places); clip at 1e-6
+                df[log_col] = np.log(df[col].clip(lower=1e-6))
+        except Exception as e:
+            raise ValueError(f"Error applying log transform to column '{col}': {e}")
+
+    # Process DLC as a categorical column (fill missing values before casting)
+    df['DLC'] = df['DLC'].fillna(0).astype(int)
+
+    # Add empty columns for any missing features
+    for feature in ALL_FEATURES:
+        if feature not in df.columns:
+            df[feature] = np.nan
 
     return df
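The clip-before-log pattern introduced above keeps zero-valued features finite instead of producing -inf. A standalone sketch of the same transform (toy column names, not the package's API):

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({"CA": [120.0, 0.0], "LCWB": [2.5, 0.0]})

# Clipping to a small positive floor before np.log turns exact zeros into a
# large negative log value rather than -inf.
toy["log_CA"] = np.log(toy["CA"].clip(lower=1e-6))

# Land-cover areas are computed in log space, matching the diff:
# log(CA * LC / 100) == log(CA) + log(LC) - log(100)
toy["log_LCWB"] = (np.log(toy["CA"].clip(lower=1e-6))
                   + np.log(toy["LCWB"].clip(lower=1e-6))
                   - np.log(100))
print(toy)
```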
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/catchment_dynamic.py

@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from typing import Dict, Sequence
+from typing import Dict, Sequence, List
 
 from reclaim.dynamic_features.utils.rainfall import (
     mean_annual_rainfall_mm,

@@ -13,12 +13,34 @@ from reclaim.dynamic_features.utils.statistical_metrics import (
     skewness,
     kurtosis_val,
 )
-from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
+
+VARIABLE_FEATURES = {
+    "precip": {
+        "MAR": mean_annual_rainfall_mm,
+        "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
+        "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
+        "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
+    },
+    "tmin": {
+        "tmin_mean": annual_mean,
+    },
+    "tmax": {
+        "tmax_mean": annual_mean,
+    },
+    "wind": {
+        "wind_mean": annual_mean,
+        "wind_std": annual_std,
+        "wind_cv": coefficient_of_variation,
+        "wind_skew": skewness,
+        "wind_kurt": kurtosis_val,
+    },
+}
 
 
 def catchment_based_dynamic_features(
     variable_info: Dict[str, Dict[str, str]],
-    observation_period: Sequence[int],
+    observation_intervals: List[Sequence[int]],
 ) -> pd.DataFrame:
     """
     Compute dynamic catchment-based features for a single reservoir's catchment,

@@ -41,63 +63,47 @@ def catchment_based_dynamic_features(
             "time_column": str,
             "data_column": str
         }
-
-    observation_period : Sequence[int]
-        [start_year, end_year] interval to compute features over.
+
+    observation_intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
 
     Returns
     -------
     pd.DataFrame
-        A
-
+        A DataFrame containing as many rows as there are observation intervals and columns corresponding to the computed catchment-based features.
+        Missing variables in ``variable_info`` will result in NaN values for their features.
     Notes
     -----
     - Precipitation features are reported as mm/year (for MAR) and counts (rainy days).
     - Wind statistics include mean, std, CV, skewness, kurtosis.
     - Temperature features are simple annual means (°C).
     """
-
-    variable_features = {
-        "precip": {
-            "MAR": mean_annual_rainfall_mm,
-            "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
-            "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
-            "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
-        },
-        "tmin": {
-            "tmin_mean": annual_mean,
-        },
-        "tmax": {
-            "tmax_mean": annual_mean,
-        },
-        "wind": {
-            "wind_mean": annual_mean,
-            "wind_std": annual_std,
-            "wind_cv": coefficient_of_variation,
-            "wind_skew": skewness,
-            "wind_kurt": kurtosis_val,
-        },
-    }
 
-    results = {}
+    all_vars = []
 
-    for var, feat_dict in variable_features.items():
+    for var, feat_dict in VARIABLE_FEATURES.items():
         if var not in variable_info:
-            for feat in feat_dict:
-                results[feat] = np.nan
+            all_vars.append(
+                pd.DataFrame(np.nan, index=range(len(observation_intervals)),
+                             columns=feat_dict.keys())
+            )
             continue
 
         path = variable_info[var]["path"]
         time_col = variable_info[var]["time_column"]
         data_col = variable_info[var]["data_column"]
+
+        try:
+            df_var = compute_ts_aggregates(
+                ts_csv_path=path,
+                time_column=time_col,
+                value_column=data_col,
+                feature_functions=feat_dict,
+                intervals=observation_intervals,
+            )
+            all_vars.append(df_var)
+        except Exception:
+            df_var = pd.DataFrame()
+            all_vars.append(df_var)
 
-        for feat, func in feat_dict.items():
-            try:
-                df_feat = compute_ts_aggregate(
-                    path, time_col, data_col, func, feat, observation_period
-                )
-                results[feat] = df_feat.iloc[0, 0]  # extract scalar
-            except Exception:
-                results[feat] = np.nan
-
-    return pd.DataFrame([results])
+    return pd.concat(all_vars, axis=1)
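A minimal usage sketch for the reworked signature, following the `variable_info` layout documented in the sibling `reservoir_dynamic.py` docstring; the file paths and column names are hypothetical:

```python
from reclaim.dynamic_features.catchment_dynamic import catchment_based_dynamic_features

# Hypothetical CSVs, each with a date column and one value column.
variable_info = {
    "precip": {"path": "data/precip.csv", "time_column": "date", "data_column": "precip (mm/d)"},
    "wind":   {"path": "data/wind.csv",   "time_column": "date", "data_column": "wind (m/s)"},
    # "tmin"/"tmax" omitted on purpose: their features come back as NaN columns.
}

observation_intervals = [[1990, 1999], [2000, 2009], [2010, 2019]]

features = catchment_based_dynamic_features(variable_info, observation_intervals)
print(features.shape)  # one row per interval
```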
{pyreclaim-0.3.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/reservoir_dynamic.py

@@ -1,7 +1,7 @@
 import os
 import pandas as pd
 import numpy as np
-from typing import Dict, Sequence, Union, Callable
+from typing import Dict, Sequence, Union, Callable, List
 
 from reclaim.dynamic_features.utils.statistical_metrics import (
     annual_mean,

@@ -18,11 +18,53 @@ from reclaim.dynamic_features.utils.inflow_outflow import (
     max_annual_flow_m3_per_s,
     mean_annual_flow_variability
 )
-from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
+
+# Define which features depend on which variable
+VARIABLE_FEATURES = {
+    "inflow": {
+        "MAI": mean_annual_flow_m3_per_s,
+        "PAI": max_annual_flow_m3_per_s,
+        "I_cv": mean_annual_flow_variability,
+        "I_std": mean_annual_flow_std_m3_per_s,
+        "I_above_90": max_days_above_90th,
+        "I_max_persis": max_annual_persistence,
+    },
+    "outflow": {
+        "MAO": mean_annual_flow_m3_per_s,
+        "O_std": mean_annual_flow_std_m3_per_s,
+        "O_cv": mean_annual_flow_variability,
+    },
+    "evaporation": {
+        "E_mean": annual_mean,
+        "E_std": annual_std,
+    },
+    "surface_area": {
+        "SA_mean": annual_mean,
+        "SA_std": annual_std,
+        "SA_cv": coefficient_of_variation,
+        "SA_skew": skewness,
+        "SA_kurt": kurtosis_val,
+        "SA_mean_clip": annual_mean,
+        "SA_above_90": max_days_above_90th,
+    },
+    "nssc": {
+        "NSSC1_mean": annual_mean,
+        "NSSC1_std": annual_std,
+        "NSSC1_cv": coefficient_of_variation,
+        "NSSC1_skew": skewness,
+        "NSSC1_kurt": kurtosis_val,
+    },
+    "nssc2": {
+        "NSSC2_mean": annual_mean,
+        "NSSC2_above_90": max_days_above_90th,
+        "NSSC2_max_persis": max_annual_persistence,
+    },
+}
 
 def reservoir_based_dynamic_features(
     variable_info: Dict[str, Dict[str, str]],
-    observation_period: Sequence[int]
+    observation_intervals: List[Sequence[int]]
 ) -> pd.DataFrame:
     """
     Compute dynamic reservoir features for a single reservoir using inflow, outflow,

@@ -57,13 +99,13 @@ def reservoir_based_dynamic_features(
         "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"}
     }
 
-    observation_period : Sequence[int]
-        [start_year, end_year] interval to compute features over.
+    observation_intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
 
     Returns
     -------
     pd.DataFrame
-        A
+        A DataFrame containing as many rows as ``observation_intervals`` and columns corresponding to the computed reservoir dynamic features.
         Missing variables in ``variable_info`` will result in NaN values for their features.
 
     Notes

@@ -74,75 +116,31 @@ def reservoir_based_dynamic_features(
     - If a variable is missing in ``variable_info``, its corresponding features are NaN.
     """
 
-    variable_features = {
-        "inflow": {
-            "MAI": mean_annual_flow_m3_per_s,
-            "PAI": max_annual_flow_m3_per_s,
-            "I_cv": mean_annual_flow_variability,
-            "I_std": mean_annual_flow_std_m3_per_s,
-            "I_above_90": max_days_above_90th,
-            "I_max_persis": max_annual_persistence,
-        },
-        "outflow": {
-            "MAO": mean_annual_flow_m3_per_s,
-            "O_std": mean_annual_flow_std_m3_per_s,
-            "O_cv": mean_annual_flow_variability,
-        },
-        "evaporation": {
-            "E_mean": annual_mean,
-            "E_std": annual_std,
-        },
-        "surface_area": {
-            "SA_mean": annual_mean,
-            "SA_std": annual_std,
-            "SA_cv": coefficient_of_variation,
-            "SA_skew": skewness,
-            "SA_kurt": kurtosis_val,
-            "SA_mean_clip": annual_mean,
-            "SA_above_90": max_days_above_90th,
-        },
-        "nssc": {
-            "NSSC1_mean": annual_mean,
-            "NSSC1_std": annual_std,
-            "NSSC1_cv": coefficient_of_variation,
-            "NSSC1_skew": skewness,
-            "NSSC1_kurt": kurtosis_val,
-        },
-        "nssc2": {
-            "NSSC2_mean": annual_mean,
-            "NSSC2_above_90": max_days_above_90th,
-            "NSSC2_max_persis": max_annual_persistence,
-        },
-    }
-
-    results = {}
+    all_vars = []
 
     # Loop through required variables
-    for var, feat_dict in variable_features.items():
+    for var, feat_dict in VARIABLE_FEATURES.items():
         if var not in variable_info:
-            for feat in feat_dict:
-                results[feat] = np.nan
-
+            all_vars.append(
+                pd.DataFrame(np.nan, index=range(len(observation_intervals)),
+                             columns=feat_dict.keys())
+            )
            continue
 
         path = variable_info[var]["path"]
         time_col = variable_info[var]["time_column"]
         data_col = variable_info[var]["data_column"]
-
-        for feat, func in feat_dict.items():
-            try:
-                df_feat = compute_ts_aggregate(
-                    path, time_col, data_col, func, feat, observation_period
-                )
-                results[feat] = df_feat.iloc[0, 0]  # extract scalar
-            except Exception:
-                results[feat] = np.nan
-
-    return pd.DataFrame([results])
+        try:
+            df_var = compute_ts_aggregates(
+                ts_csv_path=path,
+                time_column=time_col,
+                value_column=data_col,
+                feature_functions=feat_dict,
+                intervals=observation_intervals,
+            )
+            all_vars.append(df_var)
+        except Exception:
+            df_var = pd.DataFrame()
+            all_vars.append(df_var)
+
+    return pd.concat(all_vars, axis=1)
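One subtlety in both rewritten loops: a variable listed in `variable_info` whose CSV fails to load contributes an empty DataFrame, so its feature columns are absent from the concatenated result, while a variable that is simply not provided contributes explicit NaN columns. A tiny sketch of the difference:

```python
import numpy as np
import pandas as pd

intervals = [[1990, 1999], [2000, 2009]]

# Variable not provided at all -> NaN placeholder columns.
missing = pd.DataFrame(np.nan, index=range(len(intervals)), columns=["MAI", "PAI"])
# Variable provided but unreadable -> empty frame from the except branch.
failed = pd.DataFrame()

out = pd.concat([missing, failed], axis=1)
print(out.columns.tolist())  # ['MAI', 'PAI'] -- the failed variable's columns are absent
```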
pyreclaim-0.5.0/src/reclaim/dynamic_features/utils/ts_aggregate.py (new file)

@@ -0,0 +1,104 @@
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import Callable, Union, Sequence, List, Dict
+
+FULL_RECORD_FEATURES = ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt", "NSSC2_max_persis"]
+
+def build_intervals(start_year, end_year, time_interval):
+    total_years = end_year - start_year + 1
+
+    # Case 1: Entire window shorter than interval
+    if total_years <= time_interval:
+        return [[start_year, end_year]]
+
+    remainder = total_years % time_interval
+    outputs = []
+
+    # First interval absorbs remainder (if any)
+    first_len = time_interval + remainder if remainder != 0 else time_interval
+    first_end = min(start_year + first_len - 1, end_year)
+    outputs.append([start_year, first_end])
+
+    # Remaining intervals
+    current_start = first_end + 1
+    while current_start <= end_year:
+        current_end = current_start + time_interval - 1
+        outputs.append([current_start, min(current_end, end_year)])
+        current_start = current_end + 1
+
+    return outputs
+
+def compute_ts_aggregates(
+    ts_csv_path: str,
+    time_column: str,
+    value_column: str,
+    feature_functions: Dict[str, Callable],
+    intervals: List[Sequence[int]],
+) -> pd.DataFrame:
+    """
+    Compute aggregate features from a user-provided time series CSV for a single reservoir.
+
+    Parameters
+    ----------
+    ts_csv_path : str
+        Path to the CSV file containing the time series.
+    time_column : str
+        Name of the column representing dates/timestamps.
+    value_column : str
+        Name of the column representing the variable values.
+    feature_functions : Dict[str, Callable]
+        Dictionary where keys are feature names (column names for the output DataFrame) and values are functions that take a pd.Series and return a single value.
+    intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with one row per interval, containing the computed features under the specified column names.
+    """
+    # --- Read CSV ONCE ---
+    # Check if path exists
+    if not Path(ts_csv_path).is_file():
+        raise FileNotFoundError(f"CSV file not found at path: {ts_csv_path}")
+
+    df = pd.read_csv(ts_csv_path)
+    if df.empty:
+        raise ValueError(f"CSV at {ts_csv_path} is empty.")
+
+    # Ensure columns exist
+    if time_column not in df.columns:
+        raise ValueError(f"Time column '{time_column}' not found in CSV.")
+    if value_column not in df.columns:
+        raise ValueError(f"Value column '{value_column}' not found in CSV.")
+
+    # Ensure time column is datetime
+    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
+    if df[time_column].isna().all():
+        raise ValueError(f"Time column '{time_column}' could not be converted to datetime.")
+
+    # Set index
+    ts = df.set_index(time_column)[value_column].sort_index()
+
+    if ts.empty:
+        raise ValueError("Time series is completely empty. Please check the data or avoid providing this variable.")
+
+    rows = []
+
+    for osy, oey in intervals:
+        ts_clip = ts[(ts.index.year >= osy) & (ts.index.year <= oey)]
+        ts_till_end_year = ts[ts.index.year <= oey]
+
+        row = {}
+        for feat, func in feature_functions.items():
+            try:
+                if feat in FULL_RECORD_FEATURES:
+                    row[feat] = func(ts_till_end_year) if not ts_till_end_year.empty else np.nan
+                else:
+                    row[feat] = func(ts_clip) if not ts_clip.empty else np.nan
+            except Exception:
+                row[feat] = np.nan
+
+        rows.append(row)
+
+    return pd.DataFrame(rows)
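A short sketch of how the two new helpers compose; the CSV name and column names are made up. `build_intervals` splits a window so that the first interval absorbs any remainder years, and `compute_ts_aggregates` then evaluates each feature function once per interval:

```python
import numpy as np
import pandas as pd
from reclaim.dynamic_features.utils.ts_aggregate import build_intervals, compute_ts_aggregates

print(build_intervals(1990, 2012, 10))  # [[1990, 2002], [2003, 2012]] -- first interval takes the 3 extra years

# Synthetic daily series written to a hypothetical CSV.
dates = pd.date_range("1990-01-01", "2012-12-31", freq="D")
pd.DataFrame({"date": dates, "inflow": np.random.rand(len(dates))}).to_csv("inflow.csv", index=False)

out = compute_ts_aggregates(
    ts_csv_path="inflow.csv",
    time_column="date",
    value_column="inflow",
    feature_functions={"inflow_mean": lambda ts: ts.mean()},  # any Series -> scalar callable
    intervals=build_intervals(1990, 2012, 10),
)
print(out)  # two rows, one per interval
```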