ECOv002-calval-tables 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
+ """
+ This module contains various error and statistical functions for model
+ evaluation, particularly for comparing simulated data with observations.
+
+ The functions cover a range of metrics, including R-squared, Kendall's Tau,
+ linear regression, bias, RMSE, and mean absolute bias. It also includes
+ utility functions for data filtering and generating summary statistics tables.
+ """
+ import numpy as np
+ import pandas as pd
+ import sklearn.metrics as metrics
+ from scipy import stats
+
+
+ def filter_nan(s, o):
+     """
+     Removes data from the simulated and observed arrays wherever either
+     array contains NaN.
+
+     This ensures that all functions operate on valid, comparable data points.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         tuple: A tuple containing the filtered simulated and observed arrays.
+     """
+     s = np.array(s, dtype=float)
+     o = np.array(o, dtype=float)
+     # Keep only positions where both arrays hold valid values
+     mask = ~np.isnan(s) & ~np.isnan(o)
+     return s[mask], o[mask]
+
+
+ def R2_fun(s, o):
+     """
+     Calculates R^2 as the square of the correlation coefficient from a
+     linear regression of the simulated values on the observations.
+
+     It handles cases where the data is constant or contains no valid points
+     after filtering NaNs.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         float: The R^2 value, or NaN if the calculation is not possible.
+     """
+     o = np.array(o)
+     s = np.array(s)
+     valid_mask = ~np.isnan(o) & ~np.isnan(s)
+     if np.sum(valid_mask) == 0:
+         return np.nan
+     o, s = o[valid_mask], s[valid_mask]
+     # A regression is undefined when either array is constant
+     if np.all(o == o[0]) or np.all(s == s[0]):
+         return np.nan
+     _, _, r_value, _, _ = stats.linregress(o, s)
+     return r_value**2
+
+
+ def KT_fun(s, o):
+     """
+     Calculates Kendall's Tau correlation coefficient and p-value.
+
+     This non-parametric test measures the strength of dependence between two
+     variables.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         tuple: A tuple containing the Kendall's Tau correlation coefficient
+             and the p-value.
+     """
+     s, o = filter_nan(s, o)
+     tau, pvalue = stats.kendalltau(s, o)
+     return tau, pvalue
+
+
+ def lin_regress(Y, X):
+     """
+     Performs a linear regression of Y on X and returns the slope and intercept.
+
+     This function uses numpy's least-squares method; NaN pairs are removed
+     before fitting.
+
+     Args:
+         Y (array-like): The dependent variable.
+         X (array-like): The independent variable.
+
+     Returns:
+         tuple: A tuple containing the slope and intercept of the regression line.
+     """
+     x, y = filter_nan(X, Y)
+     A = np.vstack([x, np.ones(len(x))]).T
+     slope, intercept = np.linalg.lstsq(A, y, rcond=None)[0]
+     return slope, intercept
+
+
+ def BIAS_fun(s, o):
+     """
+     Returns the mean bias of the simulated data in relation to the
+     observations.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         float: The mean bias.
+     """
+     s, o = filter_nan(s, o)
+     dif = s - o
+     bias = np.mean(dif)
+     return bias
+
+
+ def rmse(s, o):
+     """
+     Calculates the root mean squared error between simulated and observed
+     values.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         float: The RMSE value.
+     """
+     s, o = filter_nan(s, o)
+     return np.sqrt(np.mean((s - o) ** 2))
+
+
+ def ABS_BIAS_fun(s, o):
+     """
+     Returns the mean absolute difference (bias) between the simulated and
+     observed data.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         float: The mean of the absolute differences.
+     """
+     s, o = filter_nan(s, o)
+     dif = np.absolute(s - o)
+     abs_bias = np.mean(dif)
+     return abs_bias
+
+
+ def get_summary_stats(s, o):
+     """
+     Returns a list of summary statistics for model evaluation.
+
+     Args:
+         s (array-like): Simulated values.
+         o (array-like): Observed values.
+
+     Returns:
+         list: A list containing [mbe, mae, rmse, r2, kt, slope, intercept].
+     """
+     s, o = filter_nan(s, o)
+     mbe = np.mean(s) - np.mean(o)
+     mae = metrics.mean_absolute_error(o, s)
+     mse = metrics.mean_squared_error(o, s)
+     _rmse = np.sqrt(mse)
+     r2 = R2_fun(s, o)
+     kt, _ = KT_fun(s, o)
+     slope, intercept = lin_regress(s, o)
+     return [mbe, mae, _rmse, r2, kt, slope, intercept]
+
+
+ def intersection(lst1, lst2):
+     """
+     Finds the common elements between two lists.
+
+     Args:
+         lst1 (list): The first list.
+         lst2 (list): The second list.
+
+     Returns:
+         list: A new list containing elements that are in both input lists.
+     """
+     return [value for value in lst1 if value in lst2]
+
+
+ def create_sum_stats(in_df, LE_var='LEcorr50'):
+     """
+     Creates a table of statistics for models and ancillary variables.
+
+     This function calculates and populates a pandas DataFrame with key metrics
+     like RMSE, MAB, BIAS, R2, Slope, and Intercept for various model outputs
+     against observed data.
+
+     Args:
+         in_df (pd.DataFrame): A DataFrame containing model and ground
+             observations.
+         LE_var (str, optional): The name of the column in in_df to use as the
+             reference LE variable. Defaults to 'LEcorr50'.
+
+     Returns:
+         pd.DataFrame: A DataFrame containing the calculated statistics.
+     """
+     stats_df = pd.DataFrame(
+         columns=['VAR', 'RMSE', 'MAB', 'BIAS', 'R2', 'Slope', 'Int'],
+     )
+     models = ['SM', 'BESS', 'MOD16', 'Rn', 'Rg', 'Ta', 'RH']
+     # Model-specific observation columns; LE models fall back to LE_var
+     obs_name_map = {
+         'SM': 'SM_surf',
+         'Rn': 'NETRAD_filt',
+         'Rg': 'SW_IN',
+         'Ta': 'AirTempC',
+         'RH': 'RH_percentage',
+     }
+
+     def append_stats(label, sim, obs):
+         # Compute the full metric row and append it to the table
+         stats_df.loc[len(stats_df.index)] = [
+             label,
+             rmse(sim, obs),
+             ABS_BIAS_fun(sim, obs),
+             BIAS_fun(sim, obs),
+             R2_fun(sim, obs),
+             *lin_regress(sim, obs),
+         ]
+
+     for model in models:
+         model_col = model + 'inst' if model in ['BESS', 'MOD16'] else model
+
+         # SM is compared against both surface and root-zone observations
+         if model == 'SM':
+             for obs_n in ['SM_surf', 'SM_rz']:
+                 append_stats(
+                     model + obs_n.split('_')[-1],
+                     in_df[model].to_numpy(),
+                     in_df[obs_n].to_numpy(),
+                 )
+             continue
+
+         obs_name = obs_name_map.get(model, LE_var)
+         append_stats(
+             model,
+             in_df[model_col].to_numpy(),
+             in_df[obs_name].to_numpy(),
+         )
+
+     return stats_df
+
+
+ def create_sum_stats_daily(in_df, LE_var='ETcorr50daily'):
+     """
+     Creates a table of statistics for daily models.
+
+     Args:
+         in_df (pd.DataFrame): DataFrame with model and daily observation data.
+         LE_var (str, optional): The reference variable for daily latent heat
+             flux. Defaults to 'ETcorr50daily'.
+
+     Returns:
+         pd.DataFrame: A DataFrame containing the calculated statistics.
+     """
+     stats_df = pd.DataFrame(
+         columns=['VAR', 'RMSE', 'MAB', 'BIAS', 'R2', 'Slope', 'Int'],
+     )
+     models = ['ETdaily_L3T_JET', 'ETdaily_L3T_ET_ALEXI']
+
+     for model in models:
+         sim = in_df[model].to_numpy()
+         obs = in_df[LE_var].to_numpy()
+         m_slope, m_int = lin_regress(sim, obs)
+         stats_df.loc[len(stats_df.index)] = [
+             model,
+             rmse(sim, obs),
+             ABS_BIAS_fun(sim, obs),
+             BIAS_fun(sim, obs),
+             R2_fun(sim, obs),
+             m_slope,
+             m_int,
+         ]
+
+     return stats_df
+
+
+ def find_ideal(big_df_ss):
+     """
+     Calculates an 'ideal' latent heat (LE) value for each observation.
+
+     The 'ideal' LE is defined as the value from a set of LE estimates
+     (including different correction methods and an energy balance residual)
+     that is closest to the 'JET' model's output for that observation. This
+     can be used to evaluate the consistency of the 'JET' model.
+
+     Args:
+         big_df_ss (pd.DataFrame): DataFrame containing various LE flux
+             estimates.
+
+     Returns:
+         pd.DataFrame: The input DataFrame with a new 'LE_ideal' column.
+     """
+     big_df_ss['LE_residual'] = (
+         big_df_ss['NETRAD_filt'] - big_df_ss['H_filt'] - big_df_ss['G_filt']
+     )
+     le_cols = [
+         'LEcorr25',
+         'LEcorr50',
+         'LEcorr75',
+         'LE_filt',
+         'LEcorr_ann',
+         'LE_residual',
+     ]
+
+     def closest_to_jet(row):
+         # Candidate values: the envelope minimum, maximum, and midpoint
+         le_min = row[le_cols].min()
+         le_max = row[le_cols].max()
+         candidates = [le_min, le_max, (le_min + le_max) / 2]
+         # Pick whichever candidate is nearest the JET estimate
+         return min(candidates, key=lambda x: abs(x - row['JET']))
+
+     big_df_ss['LE_ideal'] = big_df_ss.apply(closest_to_jet, axis=1)
+     return big_df_ss
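
For orientation, here is a minimal sketch of how the statistics helpers above might be driven. The arrays and values are illustrative only, and it is assumed the functions have been imported into scope from this module:

```python
import numpy as np

# Hypothetical observed/simulated LE series; the NaN mimics a filtered record
obs = np.array([100.0, 150.0, np.nan, 200.0, 250.0])
sim = np.array([110.0, 140.0, 180.0, 210.0, 240.0])

# NaN pairs are dropped before each metric is computed
mbe, mae, rmse_val, r2, kt, slope, intercept = get_summary_stats(sim, obs)
print(f"MBE={mbe:.1f} MAE={mae:.1f} RMSE={rmse_val:.1f} R2={r2:.3f}")

# find_ideal picks, per row, whichever of the LE-estimate envelope's
# minimum, maximum, or midpoint lies closest to the 'JET' value, e.g.
# min=90, max=110, JET=101 -> the midpoint 100 is selected.
```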
@@ -0,0 +1,67 @@
+ import os
+
+ import numpy as np
+ import pandas as pd
+ import geopandas as gpd
+
+ from shapely.geometry import Point
+
+
+ def load_combined_eco_flux_ec_filtered() -> pd.DataFrame:
+     """
+     Load the filtered eddy covariance (EC) flux dataset used for ECOSTRESS
+     Collection 2 ET product validation.
+
+     This dataset contains site-level, quality-controlled flux measurements
+     that serve as ground truth for evaluating ECOSTRESS evapotranspiration
+     estimates.
+
+     Returns:
+         pd.DataFrame: DataFrame of filtered EC flux data for validation
+             analysis.
+     """
+     return pd.read_csv(
+         os.path.join(os.path.dirname(__file__), 'combined_eco_flux_EC_filtered.csv')
+     )
+
+
+ def load_metadata_ebc_filt() -> gpd.GeoDataFrame:
+     """
+     Load the metadata for the filtered eddy covariance (EC) flux sites used
+     in the ECOSTRESS Collection 2 validation study.
+
+     This table provides site information (location, climate, land cover,
+     etc.) for interpreting and grouping the flux data in the validation
+     analysis.
+
+     Returns:
+         gpd.GeoDataFrame: GeoDataFrame of site metadata for the filtered EC
+             flux dataset, with point geometries built from the 'Lat' and
+             'Long' columns (EPSG:4326).
+     """
+     df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'metadata_ebc_filt.csv'))
+
+     if 'Lat' not in df.columns or 'Long' not in df.columns:
+         raise ValueError("metadata_ebc_filt.csv must contain 'Lat' and 'Long' columns.")
+
+     geometry = [Point(xy) for xy in zip(df['Long'], df['Lat'])]
+     gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
+
+     return gdf
+
+
+ def load_calval_table() -> gpd.GeoDataFrame:
+     """
+     Load the combined ECOSTRESS Collection 2 validation table, which includes
+     both the filtered eddy covariance flux data and the associated site
+     metadata.
+
+     Returns:
+         gpd.GeoDataFrame: Combined GeoDataFrame of EC flux data and site
+             metadata for validation analysis.
+     """
+     tower_locations_gdf = load_metadata_ebc_filt()
+     tower_data_df = load_combined_eco_flux_ec_filtered()
+
+     # Merge all columns from both tables, matching tower_data_df.ID to
+     # tower_locations_gdf["Site ID"]
+     merged_df = pd.merge(
+         tower_data_df,
+         tower_locations_gdf,
+         left_on="ID",
+         right_on="Site ID",
+         how="left",
+         suffixes=("", "_meta"),
+     )
+
+     # Standardize column names and units for downstream analysis
+     merged_df["time_UTC"] = merged_df["eco_time_utc"]
+     merged_df["ST_K"] = np.array(merged_df.LST)
+     merged_df["ST_C"] = merged_df.ST_K - 273.15
+     merged_df["Ta_C"] = np.array(merged_df.Ta)
+     merged_df["SWin_Wm2"] = np.array(merged_df.Rg)
+     merged_df["emissivity"] = np.array(merged_df.EmisWB)
+
+     # Convert merged DataFrame to GeoDataFrame
+     gdf = gpd.GeoDataFrame(merged_df, geometry=merged_df["geometry"], crs="EPSG:4326")
+
+     return gdf
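
And a minimal sketch of how the new loaders might be used together, assuming the bundled CSV files sit next to the module (as the os.path.dirname(__file__) paths imply); the import path below mirrors the wheel name and is an assumption:

```python
# Hypothetical usage of the loader added in this release
from ECOv002_calval_tables import load_calval_table

gdf = load_calval_table()

# The merged GeoDataFrame carries tower measurements plus site metadata,
# so records can be grouped by site or mapped directly
print(gdf[["ID", "time_UTC", "ST_C", "Ta_C", "SWin_Wm2"]].head())
print(gdf.crs)  # EPSG:4326, set when the GeoDataFrame is constructed
```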