TempEst-NEXT 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- NEXT/__init__.py +2 -0
- NEXT/coef_est.py +185 -0
- NEXT/coefs.pickle +0 -0
- NEXT/data.py +654 -0
- NEXT/datatools.py +97 -0
- NEXT/model_manager.py +196 -0
- NEXT/reach_prep.py +414 -0
- NEXT/wforecast.py +181 -0
- tempest_next-0.1.0.dist-info/METADATA +97 -0
- tempest_next-0.1.0.dist-info/RECORD +13 -0
- tempest_next-0.1.0.dist-info/WHEEL +5 -0
- tempest_next-0.1.0.dist-info/licenses/LICENSE +674 -0
- tempest_next-0.1.0.dist-info/top_level.txt +1 -0
NEXT/__init__.py
ADDED
NEXT/coef_est.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Created on Wed Sep 18 11:03:40 2024
|
|
4
|
+
|
|
5
|
+
@author: dphilippus
|
|
6
|
+
|
|
7
|
+
This file handles data preprocessing and coefficient estimation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from pygam import LinearGAM, s, l, te
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import numpy as np
|
|
13
|
+
from NEWT import analysis, statics, Watershed
|
|
14
|
+
|
|
15
|
+
rand = np.random.default_rng()
|
|
16
|
+
|
|
17
|
+
# Used: ['slope', 'elev_min', 'elev', 'area', 'intercept', 'srad_sd', 'cold_prcp', 'prcp', 'prcp_sd', 'srad', 'water', 'wetland', 'developed', 'ssn_phi', 'Intercept', 'ice_snow', 'vp_sd', 'lat', 'tamp', 'frozen', 'lon', 'ssn_index', 'forest']
|
|
18
|
+
|
|
19
|
+
# Covariate columns required on every input row: daily weather (tmax, prcp,
# vp), terrain (area, elev_min, elev, slope), land cover fractions, canopy,
# and calendar fields ("date", "day") used for seasonal summaries.
inp_cols = ["tmax", "prcp", "vp",
            "area", "elev_min", "elev", "slope",
            "wetland", "developed", "ice_snow", "water",
            "canopy", "ws_canopy",
            "date", "day"]
# Prediction additionally requires a site identifier column.
req_cols = inp_cols + ["id"]
# Training additionally requires the observed temperature column.
training_req_cols = req_cols + ["temperature"]
|
|
26
|
+
|
|
27
|
+
def ssn_df(col):
    """
    Build a per-group summarizer for the circular seasonality of `col`.

    The returned callable takes a DataFrame with a "date" column and yields a
    one-row frame holding the season phase (`<col>_phi`) and seasonality
    index (`<col>_index`) computed by NEWT's analysis.circular_season.
    """
    def summarize(group):
        phase, index = analysis.circular_season(group["date"], group[col])
        return pd.DataFrame({col + "_phi": [phase], col + "_index": index})
    return summarize
|
|
32
|
+
|
|
33
|
+
def preprocess(data, allow_no_id=True):
    """
    Convert raw daily input data into the per-site covariate table used for
    coefficient estimation.

    Aggregates by site id into means of inp_cols (plus frozen-day fraction),
    standard deviations of prcp and vp (suffixed "_sd"), a snow fraction
    (share of precipitation on freezing days), and circular-season summaries
    of prcp and tmax (phi/index columns via ssn_df).

    Parameters
    ----------
    data : pandas.DataFrame
        Daily records containing every column in req_cols (the "id" column
        may be absent when allow_no_id is True).
    allow_no_id : bool
        If True and no "id" column exists, all rows are treated as a single
        site with id "null".

    Returns
    -------
    pandas.DataFrame
        One row per site id with all aggregated covariates.

    Raises
    ------
    ValueError
        If any required column is missing.
    """
    # Work on a copy so the caller's frame is not polluted with the
    # temporary "frozen"/"cold_prcp" (and possibly "id") helper columns.
    data = data.copy()
    if "id" not in data.columns and allow_no_id:
        data["id"] = "null"
    if not all(col in data.columns for col in req_cols):
        missing = [col for col in req_cols if col not in data.columns]
        raise ValueError(f"Missing columns in input data; required: {req_cols}; missing: {missing}")
    # Freezing days and a snowfall proxy: precipitation falling on freezing days.
    data["frozen"] = data["tmax"] < 0
    data["cold_prcp"] = data["prcp"] * data["frozen"]
    predictors = data.groupby("id", as_index=False)[
        inp_cols + ["frozen", "cold_prcp"]].mean().assign(
        # Fraction of total precipitation that fell on freezing days.
        snowfrac=lambda x: x["cold_prcp"]/x["prcp"]).drop(
        columns=["cold_prcp"]).merge(
        data.groupby("id", as_index=False)[["prcp", "vp"]].std(),
        on="id", suffixes=["", "_sd"]).merge(
        # Why different grouping? apply was dropping id
        data.groupby("id").apply(ssn_df("prcp"), include_groups=False).reset_index(),
        on="id").merge(
        data.groupby("id").apply(ssn_df("tmax"), include_groups=False).reset_index(),
        on="id"
        )
    return predictors
|
|
57
|
+
|
|
58
|
+
# Per-PCA-component GAM specifications: the covariates used ("vars"), the
# pygam term structure over those covariates by position ("eq"), the
# smoothing penalty ("lam"), and residual noise / variance-scale values
# ("noise", "scale") — presumably from a reference fit; build_model_from_data
# overwrites noise/scale when refitting. TODO confirm provenance of the
# stored noise/scale numbers.
var_sets = [
    {"name": "PCA0", "vars": ['tmax', 'prcp', 'vp', 'slope', 'wetland', 'developed', 'water', 'snowfrac', 'vp_sd', 'tmax_phi', 'elev', 'elev_min', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + te(10, 11) + te(12, 13) + te(14, 15), "lam": 95, "noise": 0.763, "scale": 1.17},
    {"name": "PCA1", "vars": ['tmax', 'area', 'wetland', 'water', 'snowfrac', 'vp_sd', 'tmax_phi', 'elev', 'ws_canopy', 'frozen', 'tmax_index'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + te(9, 10), "lam": 33, "noise": 0.789, "scale": 1.43},
    {"name": "PCA2", "vars": ['tmax', 'vp', 'ice_snow', 'snowfrac', 'elev', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + te(5, 6) + te(7, 8), "lam": 120, "noise": 0.722, "scale": 1.94},
    {"name": "PCA3", "vars": ['tmax', 'prcp', 'vp', 'slope', 'wetland', 'ice_snow', 'water', 'snowfrac', 'vp_sd', 'prcp_index', 'tmax_phi', 'elev_min', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11) + s(12) + te(13, 14), "lam": 300, "noise": 0.649, "scale": 1.81},
    {"name": "PCA4", "vars": ['tmax', 'water', 'vp_sd', 'prcp_phi', 'prcp_index', 'tmax_phi', 'elev', 'elev_min', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + te(6, 7) + te(8, 9) + te(10, 11), "lam": 100, "noise": 0.374, "scale": 1.18},
    {"name": "PCA5", "vars": ['tmax', 'prcp', 'vp', 'area', 'slope', 'wetland', 'developed', 'ice_snow', 'water', 'snowfrac', 'vp_sd', 'prcp_phi', 'prcp_index', 'tmax_phi', 'frozen', 'elev', 'elev_min', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11) + s(12) + s(13) + s(14) + te(15, 16) + te(17, 18), "lam": 900, "noise": 0.498, "scale": 1.89},
]

# All nine PCA component names. Only six have GAMs in var_sets above;
# predict_site_coefficients defaults the unfitted components to zero.
coef_names = ["PCA" + str(i) for i in range(9)]
# NEWT coefficient names in the fixed order the PCA rotation expects.
col_order = ['Intercept', 'Amplitude', 'FallDay', 'WinterDay', 'SpringDay',
             'SummerDay', 'SpringSummer', 'FallWinter', 'at_coef']
# Normalization applied to NEWT coefficients before the PCA rotation:
# normalized = (coefs - offset) / scale.
offset = pd.Series([ 12.74728664, 8.88411531, 326.74835165, 70.17814877,
                     154.06593407, 217.67912088, 0.73641769, 1.39860397,
                     0.60761769],
                   index=col_order)
scale = pd.Series([ 3.99956942, 2.72834273, 1. , 24.9749798 , 1. ,
                    1. , 0.95228087, 0.92086495, 0.16206394],
                  index=col_order)

# 9x9 PCA rotation matrix (components x coefficients in col_order).
# Forward: Y_pca = Y_norm @ pca_components.T; inverse: Y_norm = Y_pca @
# pca_components. Presumably fitted on a reference training set — the
# near-zero 1e-16-scale entries are floating-point residue from that fit.
pca_components = np.array([[ 4.75273148e-01, 3.53017223e-01, -0.00000000e+00,
                             3.89964302e-01, 4.33680869e-19, 6.87080990e-30,
                             -4.33972379e-01, 2.62777555e-01, 4.89936781e-01],
                           [-2.98526209e-01, 5.75733187e-01, 2.22044605e-16,
                             -3.51360414e-01, 1.38777878e-17, 1.73472348e-18,
                             1.40231899e-01, 6.56430019e-01, -7.34432019e-02],
                           [-1.99812934e-01, 1.10527951e-01, -0.00000000e+00,
                             4.05686198e-01, -0.00000000e+00, -1.11022302e-16,
                             7.37245878e-01, -7.40061442e-02, 4.84013097e-01],
                           [-4.53376419e-01, -1.17950445e-03, 1.11022302e-16,
                             7.25830343e-01, 1.38777878e-17, 1.11022302e-16,
                             -2.21043018e-01, 1.82393601e-01, -4.30687013e-01],
                           [ 3.38474188e-01, -6.06349571e-01, 2.77555756e-17,
                             7.62238071e-02, 0.00000000e+00, 1.11022302e-16,
                             2.58643763e-01, 6.62516936e-01, -7.83585718e-02],
                           [-5.70050328e-01, -4.05009279e-01, -3.33066907e-16,
                             -1.65083785e-01, -2.77555756e-17, 2.08166817e-17,
                             -3.64300551e-01, 1.49581284e-01, 5.73295735e-01],
                           [-2.28222730e-17, 2.14447949e-17, 2.44517236e-01,
                             -1.50217073e-17, 6.80988603e-01, 6.90265054e-01,
                             4.62816197e-18, -3.41355305e-17, 6.05612031e-17],
                           [ 7.09295599e-18, -7.35218129e-19, -8.10461498e-02,
                             -1.51829973e-17, 7.23736480e-01, -6.85300685e-01,
                             -5.71988276e-17, 1.01952746e-17, -5.83613217e-17],
                           [-9.10120242e-17, 3.59809750e-17, 9.66251956e-01,
                             2.12446194e-17, -1.11624504e-01, -2.32157548e-01,
                             -3.43148925e-17, 7.42260318e-17, 1.10636100e-16]])
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def build_model_from_data(tr_data):
    """
    Prepare a coefficient-estimation model from the provided training data.

    Training data is assumed to have the NEWT coefficients listed in
    col_order plus the covariates named in var_sets. Coefficients are
    normalized with offset/scale, rotated into PCA space, and one LinearGAM
    is fitted per var-set entry.

    Parameters
    ----------
    tr_data : pandas.DataFrame
        One row per site with an "id" column, col_order coefficient columns,
        and the covariates used by var_sets.

    Returns
    -------
    list of dict
        Copies of the var_sets entries, each augmented with the fitted "gam",
        its residual "noise" (RMSE), and a variance-preserving "scale"
        (target std / prediction std).
    """
    # Copy each var-set dict individually: the previous list.copy() was a
    # shallow copy, so writing "gam"/"noise"/"scale" below clobbered the
    # shared module-level var_sets templates.
    vars_local = [dict(vs) for vs in var_sets]
    # Work on a copy so the caller's frame is not modified by the denoising.
    tr_data = tr_data.copy()
    # To reduce noise, set "weak-anomaly" dates to their mean.
    means = tr_data[col_order].mean()
    fwt = tr_data["FallWinter"].quantile(0.25)
    tr_data.loc[tr_data["FallWinter"] < fwt, "WinterDay"] = means["WinterDay"]
    # NOTE(review): only WinterDay is thresholded; FallDay/SpringDay/SummerDay
    # are replaced unconditionally (and their PCA targets are zeroed below).
    tr_data["FallDay"] = means["FallDay"]
    tr_data["SpringDay"] = means["SpringDay"]
    tr_data["SummerDay"] = means["SummerDay"]
    # Resume analysis: covariates X, normalized + PCA-rotated targets Y.
    X = tr_data.drop(columns=col_order)
    Y = tr_data[["id"] + col_order].set_index("id")
    Y = (Y - offset) / scale  # normalize scale
    Y = Y @ np.transpose(pca_components)
    Y.columns = coef_names
    Y["FallDay"] = 0
    Y["SpringDay"] = 0
    Y["SummerDay"] = 0
    for vs in vars_local:
        vs["gam"] = LinearGAM(vs["eq"], lam=vs["lam"]).fit(X[vs["vars"]], Y[vs["name"]])
        # RMSE of the fitted GAM on the training data.
        vs["noise"] = np.sqrt(np.mean((vs["gam"].predict(X[vs["vars"]]) - Y[vs["name"]])**2))
        # Ratio to restore the target's spread when predictions are shrunk.
        vs["scale"] = Y[vs["name"]].std() / vs["gam"].predict(X[vs["vars"]]).std()
    return vars_local
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def predict_site_coefficients(model, data, draw=False, noise_factor=0.9):
    """
    Predict PCA-space coefficients for one (pre-processed) site, then invert
    the PCA to produce NEWT coefficients on their original scale.

    If draw is True, a random draw is generated: a random quantile of each
    GAM's confidence interval plus Gaussian noise scaled by noise_factor.
    """
    def point_estimate(cols, gam, ws, noise, scale):
        return gam.predict(ws[cols])[0]

    def random_draw(cols, gam, ws, noise, scale):
        base = gam.confidence_intervals(ws[cols], quantiles=[rand.uniform()])[0, 0]
        return base + rand.normal(scale=noise * noise_factor)

    predictor = random_draw if draw else point_estimate
    # Default every component to zero, for when we don't fit all PCAs.
    pcaed = {cn: 0 for cn in coef_names}
    for vs in model:
        pcaed[vs["name"]] = predictor(vs["vars"], vs["gam"], data, vs["noise"], vs["scale"])
    pcaed = pd.DataFrame(pcaed, index=[0])[coef_names]  # ensure correct order
    inverted = pcaed @ pca_components
    inverted.columns = col_order
    return inverted * scale + offset
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def predict_all_coefficients(model, data, draw=False):
    """
    Predict NEWT coefficients for every site in the (pre-processed) data.

    Returns one row per site id, with elevation (and lat/lon when both are
    present in the input) merged back alongside the coefficients.
    """
    if "lat" in data.columns and "lon" in data.columns:
        keep = data[["id", "elev", "lat", "lon"]]
    else:
        keep = data[["id", "elev"]]
    coefs = data.groupby("id").apply(
        lambda grp: predict_site_coefficients(model, grp, draw),
        include_groups=False)
    # apply produces a (id, 0) MultiIndex; drop the inner level before merging.
    return coefs.droplevel(1).merge(keep, how="left", on="id")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def build_training_data(data):
    """
    Prepare a training dataset by fitting watershed models.

    For each site id with enough observations (at least 181 distinct days
    with temperature data and at least 730 rows), fits a NEWT Watershed
    model and extracts its coefficients (dropping R2/RMSE), then joins them
    with the preprocessed covariates.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily records containing every column in training_req_cols.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying site: fitted coefficients plus covariates.

    Raises
    ------
    ValueError
        If any required column is missing (the message lists them,
        consistent with preprocess).
    """
    if not all(col in data.columns for col in training_req_cols):
        missing = [col for col in training_req_cols if col not in data.columns]
        raise ValueError(f"Missing columns in input data; required: {training_req_cols}; missing: {missing}")
    coefs = data.groupby("id").apply(lambda x:
        Watershed.from_data(x).coefs_to_df().drop(columns=["R2", "RMSE"]) if
        # Require at least half a year of distinct observation days...
        (len(x[["day", "temperature"]].dropna()["day"].unique()) >= 181) and
        # ...and at least two years of daily rows.
        len(x) >= 730 else None,
        include_groups=False)
    # Collapse the MultiIndex from apply down to the site id.
    coefs.index = coefs.index.get_level_values("id")
    covar = preprocess(data)
    return coefs.merge(covar, on="id")
|
NEXT/coefs.pickle
ADDED
|
Binary file
|