TempEst-NEXT 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
NEXT/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from NEXT.model_manager import NEXT
2
+ from NEXT.data import full_data, all_data_gpkg
NEXT/coef_est.py ADDED
@@ -0,0 +1,185 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Sep 18 11:03:40 2024
4
+
5
+ @author: dphilippus
6
+
7
+ This file handles data preprocessing and coefficient estimation.
8
+ """
9
+
10
+ from pygam import LinearGAM, s, l, te
11
+ import pandas as pd
12
+ import numpy as np
13
+ from NEWT import analysis, statics, Watershed
14
+
15
# Shared random generator, used for stochastic coefficient draws.
rand = np.random.default_rng()

# Used: ['slope', 'elev_min', 'elev', 'area', 'intercept', 'srad_sd', 'cold_prcp', 'prcp', 'prcp_sd', 'srad', 'water', 'wetland', 'developed', 'ssn_phi', 'Intercept', 'ice_snow', 'vp_sd', 'lat', 'tamp', 'frozen', 'lon', 'ssn_index', 'forest']

# Covariate columns that raw input data must provide.
inp_cols = [
    "tmax", "prcp", "vp",                         # daily weather drivers
    "area", "elev_min", "elev", "slope",          # basin topography
    "wetland", "developed", "ice_snow", "water",  # land-cover fractions
    "canopy", "ws_canopy",                        # canopy cover metrics
    "date", "day",                                # temporal indices
]
# Prediction additionally requires a site identifier.
req_cols = inp_cols + ["id"]
# Training additionally requires observed stream temperature.
training_req_cols = req_cols + ["temperature"]
26
+
27
def ssn_df(col):
    """Build a per-site summarizer of the seasonal signal in *col*.

    Returns a function that maps one site's dataframe to a one-row frame
    holding the circular-season center (``<col>_phi``) and seasonality
    index (``<col>_index``) from NEWT's circular_season analysis.
    """
    def summarize(grp):
        phi, index = analysis.circular_season(grp["date"], grp[col])
        return pd.DataFrame({col + "_phi": [phi], col + "_index": index})
    return summarize
32
+
33
def preprocess(data, allow_no_id=True):
    """
    Convert raw input data into appropriate format, with all required covariates.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw daily records containing every column in ``req_cols``
        (``inp_cols`` plus ``"id"``). If ``allow_no_id`` is True and no
        ``"id"`` column exists, all rows are treated as one dummy site.
    allow_no_id : bool
        Whether a missing ``"id"`` column may be filled with a placeholder.

    Returns
    -------
    pandas.DataFrame
        One row per site: mean covariates, snow fraction, precipitation and
        vapor-pressure variability (``*_sd``), and seasonal phase/index
        columns for precipitation and air temperature.

    Raises
    ------
    ValueError
        If any required column is missing.
    """
    # Work on a copy so the helper columns added below ("id", "frozen",
    # "cold_prcp") do not mutate the caller's frame.
    data = data.copy()
    if "id" not in data.columns and allow_no_id:
        data["id"] = "null"
    missing = [col for col in req_cols if col not in data.columns]
    if missing:
        raise ValueError(f"Missing columns in input data; required: {req_cols}; missing: {missing}")
    data["frozen"] = data["tmax"] < 0                   # freezing days (tmax below 0)
    data["cold_prcp"] = data["prcp"] * data["frozen"]   # precipitation on freezing days
    predictors = data.groupby("id", as_index=False)[
        inp_cols + ["frozen", "cold_prcp"]].mean().assign(
        snowfrac=lambda x: x["cold_prcp"]/x["prcp"]).drop(
        columns=["cold_prcp"]).merge(
        data.groupby("id", as_index=False)[["prcp", "vp"]].std(),
        on="id", suffixes=["", "_sd"]).merge(
        # Why different grouping? apply was dropping id
        data.groupby("id").apply(ssn_df("prcp"), include_groups=False).reset_index(),
        on="id").merge(
        data.groupby("id").apply(ssn_df("tmax"), include_groups=False).reset_index(),
        on="id"
    )
    return predictors
57
+
58
# Candidate GAM specifications, one per retained principal component of the
# NEWT coefficient vector. Each entry defines:
#   name:  target PCA column fitted in build_model_from_data
#   vars:  predictor columns; order matches the term indices in "eq"
#   eq:    pygam term structure (s = spline, te = tensor-product interaction)
#   lam:   smoothing penalty passed to LinearGAM
#   noise: residual RMSE from a prior fit (overwritten by build_model_from_data)
#   scale: target/prediction std-dev ratio (likewise overwritten)
var_sets = [
    {"name": "PCA0", "vars": ['tmax', 'prcp', 'vp', 'slope', 'wetland', 'developed', 'water', 'snowfrac', 'vp_sd', 'tmax_phi', 'elev', 'elev_min', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + te(10, 11) + te(12, 13) + te(14, 15), "lam": 95, "noise": 0.763, "scale": 1.17},
    {"name": "PCA1", "vars": ['tmax', 'area', 'wetland', 'water', 'snowfrac', 'vp_sd', 'tmax_phi', 'elev', 'ws_canopy', 'frozen', 'tmax_index'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + te(9, 10), "lam": 33, "noise": 0.789, "scale": 1.43},
    {"name": "PCA2", "vars": ['tmax', 'vp', 'ice_snow', 'snowfrac', 'elev', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + te(5, 6) + te(7, 8), "lam": 120, "noise": 0.722, "scale": 1.94},
    {"name": "PCA3", "vars": ['tmax', 'prcp', 'vp', 'slope', 'wetland', 'ice_snow', 'water', 'snowfrac', 'vp_sd', 'prcp_index', 'tmax_phi', 'elev_min', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11) + s(12) + te(13, 14), "lam": 300, "noise": 0.649, "scale": 1.81},
    {"name": "PCA4", "vars": ['tmax', 'water', 'vp_sd', 'prcp_phi', 'prcp_index', 'tmax_phi', 'elev', 'elev_min', 'frozen', 'tmax_index', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + te(6, 7) + te(8, 9) + te(10, 11), "lam": 100, "noise": 0.374, "scale": 1.18},
    {"name": "PCA5", "vars": ['tmax', 'prcp', 'vp', 'area', 'slope', 'wetland', 'developed', 'ice_snow', 'water', 'snowfrac', 'vp_sd', 'prcp_phi', 'prcp_index', 'tmax_phi', 'frozen', 'elev', 'elev_min', 'canopy', 'ws_canopy'], "eq": s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11) + s(12) + s(13) + s(14) + te(15, 16) + te(17, 18), "lam": 900, "noise": 0.498, "scale": 1.89},
]
66
+
67
# Names of the 9 PCA-space coefficients. Only PCA0-PCA5 have entries in
# var_sets; the remainder default to 0 in predict_site_coefficients.
coef_names = ["PCA" + str(i) for i in range(9)]
# NEWT coefficient columns, in the order the PCA rotation below expects.
col_order = ['Intercept', 'Amplitude', 'FallDay', 'WinterDay', 'SpringDay',
             'SummerDay', 'SpringSummer', 'FallWinter', 'at_coef']
# Per-coefficient centering (mean) applied before the PCA rotation.
offset = pd.Series([ 12.74728664, 8.88411531, 326.74835165, 70.17814877,
                     154.06593407, 217.67912088, 0.73641769, 1.39860397,
                     0.60761769],
                   index=col_order)
# Per-coefficient scaling (std) applied before the PCA rotation.
scale = pd.Series([ 3.99956942, 2.72834273, 1. , 24.9749798 , 1. ,
                    1. , 0.95228087, 0.92086495, 0.16206394],
                  index=col_order)
77
+
78
# Pre-fitted 9x9 PCA rotation between normalized NEWT coefficients
# (col_order) and PCA space (coef_names). Forward projection uses
# Y @ pca_components.T (build_model_from_data); inversion uses
# pcaed @ pca_components (predict_site_coefficients).
pca_components = np.array([[ 4.75273148e-01, 3.53017223e-01, -0.00000000e+00,
                             3.89964302e-01, 4.33680869e-19, 6.87080990e-30,
                            -4.33972379e-01, 2.62777555e-01, 4.89936781e-01],
                           [-2.98526209e-01, 5.75733187e-01, 2.22044605e-16,
                            -3.51360414e-01, 1.38777878e-17, 1.73472348e-18,
                             1.40231899e-01, 6.56430019e-01, -7.34432019e-02],
                           [-1.99812934e-01, 1.10527951e-01, -0.00000000e+00,
                             4.05686198e-01, -0.00000000e+00, -1.11022302e-16,
                             7.37245878e-01, -7.40061442e-02, 4.84013097e-01],
                           [-4.53376419e-01, -1.17950445e-03, 1.11022302e-16,
                             7.25830343e-01, 1.38777878e-17, 1.11022302e-16,
                            -2.21043018e-01, 1.82393601e-01, -4.30687013e-01],
                           [ 3.38474188e-01, -6.06349571e-01, 2.77555756e-17,
                             7.62238071e-02, 0.00000000e+00, 1.11022302e-16,
                             2.58643763e-01, 6.62516936e-01, -7.83585718e-02],
                           [-5.70050328e-01, -4.05009279e-01, -3.33066907e-16,
                            -1.65083785e-01, -2.77555756e-17, 2.08166817e-17,
                            -3.64300551e-01, 1.49581284e-01, 5.73295735e-01],
                           [-2.28222730e-17, 2.14447949e-17, 2.44517236e-01,
                            -1.50217073e-17, 6.80988603e-01, 6.90265054e-01,
                             4.62816197e-18, -3.41355305e-17, 6.05612031e-17],
                           [ 7.09295599e-18, -7.35218129e-19, -8.10461498e-02,
                            -1.51829973e-17, 7.23736480e-01, -6.85300685e-01,
                            -5.71988276e-17, 1.01952746e-17, -5.83613217e-17],
                           [-9.10120242e-17, 3.59809750e-17, 9.66251956e-01,
                             2.12446194e-17, -1.11624504e-01, -2.32157548e-01,
                            -3.43148925e-17, 7.42260318e-17, 1.10636100e-16]])
105
+
106
+
107
def build_model_from_data(tr_data):
    """
    Prepares a coefficient estimation model from the provided training data.
    Training data is assumed to have coefficients listed in col_order,
    which will be converted through PCA.

    Returns a list like var_sets in which each entry additionally holds a
    fitted "gam" (LinearGAM), a recomputed "noise" (training RMSE), and
    "scale" (target/prediction std-dev ratio).
    """
    # Copy each dict individually: list.copy() alone is shallow, so writing
    # "gam"/"noise"/"scale" below would mutate the module-level var_sets.
    vars_local = [dict(vs) for vs in var_sets]
    # Copy so the weak-anomaly smoothing below does not mutate the caller's frame.
    tr_data = tr_data.copy()
    # To reduce noise, set "weak-anomaly" dates to their mean.
    means = tr_data[col_order].mean()
    fwt = tr_data["FallWinter"].quantile(0.25)
    tr_data.loc[tr_data["FallWinter"] < fwt, "WinterDay"] = means["WinterDay"]
    tr_data["FallDay"] = means["FallDay"]
    tr_data["SpringDay"] = means["SpringDay"]
    tr_data["SummerDay"] = means["SummerDay"]
    # Resume analysis
    X = tr_data.drop(columns=col_order)
    Y = tr_data[["id"] + col_order].set_index("id")
    Y = (Y - offset) / scale  # normalize scale
    Y = Y @ np.transpose(pca_components)  # rotate into PCA space
    Y.columns = coef_names
    # The day-of-year columns were flattened to their means above, so their
    # projections carry no signal; pin them to zero.
    Y["FallDay"] = 0
    Y["SpringDay"] = 0
    Y["SummerDay"] = 0
    for vs in vars_local:
        vs["gam"] = LinearGAM(vs["eq"], lam=vs["lam"]).fit(X[vs["vars"]], Y[vs["name"]])
        vs["noise"] = np.sqrt(np.mean((vs["gam"].predict(X[vs["vars"]]) - Y[vs["name"]])**2))
        vs["scale"] = Y[vs["name"]].std() / vs["gam"].predict(X[vs["vars"]]).std()
    return vars_local
135
+
136
+
137
def predict_site_coefficients(model, data, draw=False, noise_factor=0.9):
    """
    Predicts model coefficients using the provided (pre-processed) data for
    a specific site, then inverts the PCA rotation to recover NEWT
    coefficients. If draw is True, generate a random draw instead of the
    mean prediction.
    """
    if draw:
        def predictor(cols, gam, ws, noise, scale):
            # Sample a random quantile of the GAM confidence band, then add
            # scaled residual noise.
            quantile = rand.uniform()
            center = gam.confidence_intervals(ws[cols], quantiles=[quantile])[0, 0]
            return center + rand.normal(scale=noise * noise_factor)
    else:
        def predictor(cols, gam, ws, noise, scale):
            return gam.predict(ws[cols])[0]
    # Default every component to zero for when we don't fit all PCAs.
    pcaed = dict.fromkeys(coef_names, 0)
    for vs in model:
        pcaed[vs["name"]] = predictor(vs["vars"], vs["gam"], data, vs["noise"], vs["scale"])
    ordered = pd.DataFrame(pcaed, index=[0])[coef_names]  # ensure correct order
    inv = ordered @ pca_components  # invert the PCA rotation
    inv.columns = col_order
    return inv * scale + offset  # undo normalization
158
+
159
+
160
def predict_all_coefficients(model, data, draw=False):
    """
    Predicts model coefficients for all sites, keeping elevation (and
    lat/lon when both are present) alongside the coefficient columns.
    """
    id_cols = ["id", "elev"]
    if "lat" in data.columns and "lon" in data.columns:
        id_cols = ["id", "elev", "lat", "lon"]
    keep = data[id_cols]
    coefs = data.groupby("id").apply(
        lambda grp: predict_site_coefficients(model, grp, draw),
        include_groups=False)
    # apply() leaves a per-group inner index (always 0); drop it before merging.
    return coefs.droplevel(1).merge(keep, how="left", on="id")
170
+
171
+
172
def build_training_data(data):
    """
    Prepare a training dataset by fitting watershed models.

    For each site id with sufficient data (at least 181 distinct observed
    days and at least 730 rows), fits a NEWT Watershed model and joins its
    coefficients with the preprocessed covariates; sites with insufficient
    data are skipped.

    Raises
    ------
    ValueError
        If any required training column is missing.
    """
    # Report the specific missing columns, consistent with preprocess().
    missing = [col for col in training_req_cols if col not in data.columns]
    if missing:
        raise ValueError(f"Missing columns in input data; required: {training_req_cols}; missing: {missing}")
    coefs = data.groupby("id").apply(lambda x:
        Watershed.from_data(x).coefs_to_df().drop(columns=["R2", "RMSE"]) if
        (len(x[["day", "temperature"]].dropna()["day"].unique()) >= 181) and
        len(x) >= 730 else None,
        include_groups=False)
    # Flatten the (id, row) MultiIndex that groupby.apply produces.
    coefs.index = coefs.index.get_level_values("id")
    covar = preprocess(data)
    return coefs.merge(covar, on="id")
NEXT/coefs.pickle ADDED
Binary file