ECOv002-calval-tables 1.4.0-py3-none-any.whl → 1.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ECOv002_calval_tables/ECOv002_calval_tables.py +5 -67
- ECOv002_calval_tables/ec_lib.py +736 -0
- ECOv002_calval_tables/error_funcs.py +376 -0
- ECOv002_calval_tables/load_tables.py +67 -0
- ECOv002_calval_tables/plot_funcs.py +710 -0
- ECOv002_calval_tables/plot_single_model.py +113 -0
- {ecov002_calval_tables-1.4.0.dist-info → ecov002_calval_tables-1.6.0.dist-info}/METADATA +7 -1
- ecov002_calval_tables-1.6.0.dist-info/RECORD +15 -0
- ecov002_calval_tables-1.4.0.dist-info/RECORD +0 -10
- {ecov002_calval_tables-1.4.0.dist-info → ecov002_calval_tables-1.6.0.dist-info}/WHEEL +0 -0
- {ecov002_calval_tables-1.4.0.dist-info → ecov002_calval_tables-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {ecov002_calval_tables-1.4.0.dist-info → ecov002_calval_tables-1.6.0.dist-info}/top_level.txt +0 -0
ECOv002_calval_tables/ec_lib.py
@@ -0,0 +1,736 @@
"""
This module contains functions for processing and quality-controlling
AmeriFlux eddy covariance data.

It includes utilities for filtering sites based on metadata, handling time
conversions (UTC, local, and solar time), and performing energy balance
closure corrections. The module also provides functions for reading and
cleaning raw AmeriFlux data, as well as converting latent heat flux to
evapotranspiration.
"""
import os
import sys
import warnings
from datetime import timedelta

import numpy as np
import pandas as pd
from tables import NaturalNameWarning

# Ignore specific warnings to prevent clutter
warnings.filterwarnings(action='ignore', category=NaturalNameWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.filterwarnings(action='ignore', message='All-NaN slice encountered')
pd.options.mode.chained_assignment = None

REL_PATH = os.getcwd() + '/'
DATA_PATH = REL_PATH + 'data/AMF_metadata/'


# --- SITE METADATA FILTERING ---
def limit_cols(sites):
    """
    Limits a DataFrame of AmeriFlux sites to useful information.

    Cleans up column names and sets a new index based on the site ID.

    Args:
        sites (pd.DataFrame): DataFrame of AmeriFlux sites with metadata.

    Returns:
        pd.DataFrame: A new DataFrame with a limited set of columns and a
            cleaned index.
    """
    new_index = [s.replace('\xa0', '') for s in sites.index]
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sites['new_index'] = new_index
    sites.set_index(sites.new_index, inplace=True)
    out_df = sites[
        ['Name', 'Lat', 'Long', 'Elev_(m)', 'Clim', 'Veg', 'MAT_(°C)', 'MAP_(mm)']
    ]
    return out_df


def filter_sites(filename=DATA_PATH + 'ameriflux_meta.csv'):
    """
    Filters AmeriFlux sites based on specific criteria for ECOSTRESS
    observations.

    The criteria are:
    - Latitude between 53.6N and 53.6S.
    - Open access license CC-BY-4.0.
    - End date is NaN or more recent than 2018.

    Args:
        filename (str): Path to the AmeriFlux metadata file.

    Returns:
        pd.DataFrame: A DataFrame of filtered sites.
    """
    table = pd.read_csv(filename)
    table.set_index(table.columns[0], inplace=True)

    lat_f_sites = table[(table[table.columns[2]] > -53.6) & (table[table.columns[2]] < 53.6)]
    out_cols = [c.split('\xa0')[0].replace(' ', '_') for c in lat_f_sites.columns]
    lat_f_sites.columns = out_cols

    license_filter = lat_f_sites['Data_Use_Policy1'] == 'CC-BY-4.0'
    lat_lic_sites = lat_f_sites[license_filter]
    out_df = limit_cols(lat_lic_sites)
    out_df.index.rename('Sites', inplace=True)

    return out_df


def get_dois(in_path=DATA_PATH):
    """
    Retrieves the DOI for each AmeriFlux site.

    Args:
        in_path (str): Path to the metadata directory.

    Returns:
        pd.DataFrame: A DataFrame with site IDs as the index and a 'doi' column.
    """
    citation_name = in_path + 'ameriflux_citations.csv'
    cite_meta = pd.read_csv(citation_name)
    cite_meta.set_index('site_id', inplace=True)
    return cite_meta[['doi']]


def create_table_1(save_to_csv=False, out_dir=''):
    """
    Generates a table of sites, merging filtered metadata with DOIs.

    Args:
        save_to_csv (bool): If True, saves the table to a CSV file.
        out_dir (str): The directory to save the CSV file.

    Returns:
        pd.DataFrame: The merged DataFrame.
    """
    site_df = filter_sites()
    doi_df = get_dois()
    table1 = pd.merge(site_df, doi_df, left_index=True, right_index=True)

    if save_to_csv:
        table1.to_csv(out_dir + 'table1.csv')
    return table1
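

# --- Editor's illustration (not part of the released module) ---
# A minimal usage sketch for the site-table builder above. It assumes the
# default metadata CSVs exist under data/AMF_metadata/ and that an output/
# directory exists for the CSV.
def _example_create_table_1():
    table1 = create_table_1(save_to_csv=True, out_dir='output/')
    print(table1.head())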


# --- TIME CONVERSION FUNCTIONS ---
def get_utc_hr_offset(site_meta_fname):
    """
    Returns the UTC offset in hours from a site's metadata file.

    Args:
        site_meta_fname (str): Path to the site metadata file.

    Returns:
        int: The UTC offset in hours.
    """
    site_meta = pd.read_excel(site_meta_fname)
    utc_offset_s = site_meta.DATAVALUE[site_meta['VARIABLE'] == 'UTC_OFFSET']
    utc_offset_it = iter(np.array(utc_offset_s))
    utc_offset_first = next(utc_offset_it)
    utc_offset = int(float(utc_offset_first))
    print(f'\tutc offset is:\t{utc_offset}')
    return utc_offset


def change_to_utc(times, utc_offset):
    """
    Converts a time series to UTC by subtracting the UTC offset.

    Args:
        times (pd.DatetimeIndex): Time series to convert.
        utc_offset (int): The UTC offset in hours.

    Returns:
        pd.DatetimeIndex: The converted time series in UTC.
    """
    return times - pd.DateOffset(hours=utc_offset)


def change_to_local(times, utc_offset):
    """
    Converts a time series to local time by adding the UTC offset.

    Args:
        times (pd.DatetimeIndex): Time series to convert.
        utc_offset (int): The UTC offset in hours.

    Returns:
        pd.DatetimeIndex: The converted time series in local time.
    """
    print('creating local time columns')
    return times + pd.DateOffset(hours=utc_offset)


def get_lon(site_meta_fname):
    """
    Returns the longitude of a site from its metadata file.

    Args:
        site_meta_fname (str): Path to the site metadata file.

    Returns:
        float: The longitude in degrees.
    """
    site_meta = pd.read_excel(site_meta_fname)
    long = site_meta.DATAVALUE[site_meta['VARIABLE'] == 'LOCATION_LONG']
    return np.array(long).astype(float)[0]


def longitude_to_offset(longitude_deg):
    """
    Converts longitude to a time offset (15 degrees per hour).

    Args:
        longitude_deg (float): Longitude in degrees.

    Returns:
        timedelta: The time offset corresponding to the longitude.
    """
    return timedelta(hours=(np.radians(longitude_deg) / np.pi * 12))


def utc_to_solar(datetime_utc, longitude_deg):
    """
    Converts UTC datetime to solar apparent time.

    Args:
        datetime_utc (pd.DatetimeIndex): Time series in UTC.
        longitude_deg (float): Longitude in degrees.

    Returns:
        pd.DatetimeIndex: The converted time series in solar time.
    """
    return datetime_utc + longitude_to_offset(longitude_deg)
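

# --- Editor's illustration (not part of the released module) ---
# Chains the helpers above: local time -> UTC -> solar apparent time. The
# offset (-8 h) and longitude (-120.0 deg) are made-up values; for a site at
# the centre of its time zone, solar and local clocks nearly coincide.
def _example_time_conversion():
    local_times = pd.date_range('2019-07-01 00:00', periods=4, freq='30min')
    utc_times = change_to_utc(local_times, utc_offset=-8)        # local + 8 h
    solar_times = utc_to_solar(utc_times, longitude_deg=-120.0)  # UTC - 8 h
    print(solar_times - local_times)  # ~0 here, since -120 deg / 15 = -8 h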


# --- VARIABLE CALCULATION FUNCTIONS ---
def calc_SWin(in_df):
    """
    Calculates the mean shortwave incoming radiation from available columns.
    """
    print('\treading SW_IN')
    final_list = [c for c in in_df.columns if c.startswith('SW_IN') and '_F' not in c]
    return in_df[final_list].mean(axis=1).values


def calc_H(in_df):
    """
    Calculates the mean sensible heat flux (H) from available columns.
    """
    print('\treading H')
    final_list = [c for c in in_df.columns if c.startswith('H')]
    final_list_filt = [c for c in final_list if 'H2O' not in c]
    final_list_filt2 = [c for c in final_list_filt if 'SSITC' not in c]
    final_list_filt3 = [c for c in final_list_filt2 if '_F' not in c]
    return in_df[final_list_filt3].mean(axis=1).values


def calc_G(in_df):
    """
    Calculates the mean ground heat flux (G) from available columns.
    """
    print('\treading G')
    final_list = [c for c in in_df.columns if c.startswith('G')]
    final_list_filt = [c for c in final_list if 'GPP' not in c]
    final_list_filt2 = [c for c in final_list_filt if '_F' not in c]
    return in_df[final_list_filt2].mean(axis=1).values


def calc_NETRAD(in_df):
    """
    Calculates the mean net radiation (NETRAD) from available columns.
    """
    print('\treading NETRAD')
    final_list = [c for c in in_df.columns if c.startswith('NETRAD')]
    final_list_filt = [c for c in final_list if '_F' not in c]
    return in_df[final_list_filt].mean(axis=1).values


def calc_LE(in_df):
    """
    Calculates the mean latent heat flux (LE) from available columns.
    """
    print('\treading LE')
    final_list = [c for c in in_df.columns if c.startswith('LE')]
    final_list_filt2 = [c for c in final_list if 'SSITC' not in c]
    final_list_filt3 = [c for c in final_list_filt2 if 'LEAF' not in c]
    final_list_filt4 = [c for c in final_list_filt3 if '_F' not in c]
    return in_df[final_list_filt4].mean(axis=1).values


def calc_SWC(in_df):
    """
    Calculates the mean surface soil water content (SWC) from available
    columns.
    """
    print('\treading SWC surface')
    final_list = []
    for i in np.arange(1, 9):
        try:
            final_list.append(
                list(in_df.columns[(in_df.columns.str.startswith(f'SWC_{i}_1'))])[0],
            )
        except IndexError:
            continue
    final_list_filt2 = [c for c in final_list if '_PI' not in c]
    return in_df[final_list_filt2].mean(axis=1).values


def calc_all_SWC(in_df):
    """
    Calculates the mean soil water content (SWC) for all observations.
    """
    print('\treading SWC all')
    final_list = in_df.columns[in_df.columns.str.startswith('SWC_')]
    final_list_filt2 = [c for c in final_list if '_PI' not in c]
    return in_df[final_list_filt2].mean(axis=1).values


def calc_RH(in_df):
    """
    Calculates the mean relative humidity (RH) from available columns.
    """
    print('\treading RH')
    final_list = [c for c in in_df.columns if c.startswith('RH')]
    final_list_filt2 = [c for c in final_list if '_PI' not in c]
    return in_df[final_list_filt2].mean(axis=1).values


def calc_AirTemp(in_df):
    """
    Calculates the mean air temperature from available columns.
    """
    print('\treading Air Temperature')
    final_list = [c for c in in_df.columns if c.startswith('TA')]
    final_list_filt2 = [c for c in final_list if 'TAU' not in c]
    final_list_filt3 = [c for c in final_list_filt2 if '_PI' not in c]
    return in_df[final_list_filt3].mean(axis=1).values
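

# --- Editor's illustration (not part of the released module) ---
# The calc_* helpers all follow the same pattern: select the raw AmeriFlux
# columns for one variable, drop gap-filled or unrelated variants ('_F',
# '_PI', 'TAU', ...), and average across sensor replicates. The toy column
# names and values below are invented.
def _example_column_averaging():
    toy = pd.DataFrame({
        'TA_1_1_1': [20.0, 21.0],  # replicate air-temperature sensors
        'TA_1_2_1': [20.4, 21.2],
        'TAU': [0.1, 0.2],         # excluded: momentum flux, not air temp
    })
    print(calc_AirTemp(toy))  # per-row mean of the two TA replicates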


# --- QAQC AND ENERGY BALANCE CLOSURE ---
def remove_spikes(in_df, varnames=['LE'], z=6.5):
    """
    Removes spikes in data using the median of absolute deviation about the
    median, as described in Papale et al. (2006).

    Args:
        in_df (pd.DataFrame): DataFrame with AmeriFlux data.
        varnames (list): List of variable names to filter.
        z (float): The threshold for outlier detection. Larger numbers are
            more conservative.

    Returns:
        pd.DataFrame: The DataFrame with spikes removed, creating new
            filtered columns (e.g., 'LE_filt').
    """
    df_temp = in_df.copy()
    df_day = df_temp[
        (df_temp.NETRAD > 0)
        | (df_temp.NETRAD.isnull()
           & ((df_temp.index.hour >= 7) & (df_temp.index.hour < 17)))
    ]
    df_night = df_temp[
        (df_temp.NETRAD <= 0)
        | (df_temp.NETRAD.isnull()
           & ((df_temp.index.hour < 7) | (df_temp.index.hour >= 17)))
    ]

    for var in varnames:
        # Double-differenced series of Papale et al. (2006):
        # d_i = (x_i - x_{i-1}) - (x_{i+1} - x_i)
        di_n = df_night[var].diff() - (df_night[var].diff(periods=-1) * -1.0)
        di_d = df_day[var].diff() - (df_day[var].diff(periods=-1) * -1.0)
        md_n = np.nanmedian(di_n)
        md_d = np.nanmedian(di_d)
        mad_n = np.nanmedian(np.abs(di_n - md_n))
        mad_d = np.nanmedian(np.abs(di_d - md_d))

        mask_nh = di_n < md_n - (z * mad_n / 0.6745)
        mask_nl = di_n > md_n + (z * mad_n / 0.6745)
        df_night.loc[mask_nh | mask_nl, var] = np.nan

        mask_dh = di_d < md_d - (z * mad_d / 0.6745)
        mask_dl = di_d > md_d + (z * mad_d / 0.6745)
        df_day.loc[mask_dh | mask_dl, var] = np.nan

        df_out = pd.concat([df_night, df_day], verify_integrity=True).sort_index()
        vnameout = var + '_filt'
        in_df[vnameout] = df_out[var]
        print(f'\t{var}_filt created')
    return in_df
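

# --- Editor's illustration (not part of the released module) ---
# A minimal despiking sketch on synthetic half-hourly data with one injected
# spike. The constant daytime NETRAD and the flux values are invented.
def _example_remove_spikes():
    idx = pd.date_range('2019-07-01', periods=48 * 4, freq='30min')
    rng = np.random.default_rng(0)
    df = pd.DataFrame({'LE': 100 + rng.normal(0, 5, len(idx)),
                       'NETRAD': 400.0}, index=idx)
    df.iloc[100, df.columns.get_loc('LE')] = 900.0  # injected spike
    out = remove_spikes(df, varnames=['LE'], z=6.5)
    print(out['LE_filt'].isna().sum())  # spike and its immediate neighbor masked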


def rolling_quantile_filter(in_df, _var_='LE'):
    """
    Applies a conservative rolling 15-day quantile filter to remove outliers
    that weren't caught by the spike removal algorithm.
    """
    df = in_df.copy()
    df['IQR'] = (
        df[_var_].rolling('15D', min_periods=int(48 * 5)).quantile(0.75)
        - df[_var_].rolling('15D', min_periods=int(48 * 5)).quantile(0.25)
    )
    df['max'] = (
        df['IQR'] * 2.5 + df[_var_].rolling('15D', min_periods=int(48 * 5)).quantile(0.75)
    )
    df['min'] = (
        df[_var_].rolling('15D', min_periods=int(48 * 5)).quantile(0.25) - df['IQR'] * 2.5
    )

    df.loc[df[_var_] > df['max'], _var_] = np.nan
    df.loc[df[_var_] < df['min'], _var_] = np.nan
    df.drop(['IQR', 'max', 'min'], inplace=True, axis=1)

    return df
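

# --- Editor's illustration (not part of the released module) ---
# The fences above are Tukey-style, [Q1 - 2.5*IQR, Q3 + 2.5*IQR], computed
# over a rolling 15-day window. A worked example with invented quantiles:
def _example_iqr_fences():
    q1, q3 = 20.0, 180.0  # hypothetical rolling quantiles (W/m^2)
    iqr = q3 - q1
    print(q1 - 2.5 * iqr, q3 + 2.5 * iqr)  # fences: -380.0, 580.0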


def filter_based_on_threshs(
    in_df,
    LE_threshes=[-150, 1200],
    H_threshes=[-150, 1200],
    NETRAD_threshes=[-250, 1400],
    G_threshes=[-250, 500],
    filtered=True,
):
    """
    Removes data that falls outside of specified physical thresholds.

    Args:
        in_df (pd.DataFrame): DataFrame containing flux data.
        LE_threshes (list): Min and max thresholds for LE.
        H_threshes (list): Min and max thresholds for H.
        NETRAD_threshes (list): Min and max thresholds for NETRAD.
        G_threshes (list): Min and max thresholds for G.
        filtered (bool): If True, applies filters to '_filt' columns.

    Returns:
        pd.DataFrame: The DataFrame with values outside the thresholds set
            to NaN.
    """
    _f_ = '_filt' if filtered else ''
    df_amf = in_df.copy()

    df_amf.loc[df_amf['LE' + _f_] < LE_threshes[0], 'LE' + _f_] = np.nan
    df_amf.loc[df_amf['LE' + _f_] > LE_threshes[1], 'LE' + _f_] = np.nan
    df_amf.loc[df_amf['NETRAD' + _f_] < NETRAD_threshes[0], 'NETRAD' + _f_] = np.nan
    df_amf.loc[df_amf['NETRAD' + _f_] > NETRAD_threshes[1], 'NETRAD' + _f_] = np.nan
    df_amf.loc[df_amf['G' + _f_] < G_threshes[0], 'G' + _f_] = np.nan
    df_amf.loc[df_amf['G' + _f_] > G_threshes[1], 'G' + _f_] = np.nan
    df_amf.loc[df_amf['H' + _f_] < H_threshes[0], 'H' + _f_] = np.nan
    df_amf.loc[df_amf['H' + _f_] > H_threshes[1], 'H' + _f_] = np.nan

    return df_amf


def force_close_fluxnet(in_df, filtered=False, verbose=True):
    """
    Performs Energy Balance Forced Closure according to Fluxnet methods.

    This function calculates a correction factor and applies it to LE and H.

    Args:
        in_df (pd.DataFrame): AmeriFlux data frame with energy balance variables.
        filtered (bool): If True, uses filtered columns.
        verbose (bool): If True, prints status messages.

    Returns:
        pd.DataFrame: DataFrame with adjusted LE and H variables.
    """
    _f_ = '_filt' if filtered else ''
    df = in_df.copy()

    vars_to_use = ['LE' + _f_, 'H' + _f_, 'NETRAD' + _f_, 'G' + _f_]
    df = df[vars_to_use].astype(float).copy()

    # Fall back to NETRAD alone when G is missing or sparse (< 30% coverage).
    if int(df['G' + _f_].count()) == 0 or df['G' + _f_].count() / len(df.index) < 0.3:
        df['_RadFlux_'] = df['NETRAD' + _f_]
        df['no_G_flag'] = 1
        if verbose:
            print('\tno valid G data available')
    else:
        df['_RadFlux_'] = df['NETRAD' + _f_] - df['G' + _f_]
        df['no_G_flag'] = 0

    # Energy balance closure correction factor: (Rn - G) / (H + LE).
    df['ebc_cf'] = df['_RadFlux_'] / (df['H' + _f_] + df['LE' + _f_])
    Q1 = df['ebc_cf'].quantile(0.25)
    Q3 = df['ebc_cf'].quantile(0.75)
    IQR = Q3 - Q1

    filtered_df = df.query('(@Q1 - 1.5 * @IQR) <= ebc_cf <= (@Q3 + 1.5 * @IQR)')
    removed_mask = set(df.index) - set(filtered_df.index)
    removed_mask = pd.to_datetime(list(removed_mask))
    df.loc[removed_mask, 'ebc_cf'] = np.nan

    if verbose:
        print(f'\tmean correction factor is: {np.round(np.nanmean(df.ebc_cf.values), 2)}')
        print(f'\tclosure ratio mean is: {1 / df.ebc_cf.mean()}')
        print(
            f'\tpercent of valid closure crs is: {100 * df["ebc_cf"].count() / len(df.index)}',
        )

    df['ebc_cf_all'] = df.ebc_cf.median()
    df['ebc_cf_stable'] = df.ebc_cf.copy()

    # Keep only stable periods (near-midnight and near-midday hours) for the
    # rolling correction-factor quantiles.
    min_period_thresh = 48
    night_or_day_mask = (
        (df.index.hour > 20)
        | (df.index.hour <= 3)
        | ((df.index.hour > 10) & (df.index.hour <= 14))
    )
    df.loc[~night_or_day_mask, 'ebc_cf_stable'] = np.nan
    df['ebc_cf_25'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.25,
        interpolation='nearest',
    )
    df['ebc_cf_50'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.5,
        interpolation='nearest',
    )
    df['ebc_cf_75'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.75,
        interpolation='nearest',
    )

    df['LEcorr25'] = df.ebc_cf_25 * df['LE' + _f_]
    df['LEcorr50'] = df.ebc_cf_50 * df['LE' + _f_]
    df['LEcorr75'] = df.ebc_cf_75 * df['LE' + _f_]
    df['LEcorr_ann'] = df.ebc_cf_all * df['LE' + _f_]

    le_lims = [-100, 800]
    for col in ['LEcorr_ann', 'LEcorr25', 'LEcorr50', 'LEcorr75']:
        df.loc[(df[col] >= le_lims[1]) | (df[col] <= le_lims[0]), col] = np.nan

    cf_lims = [0.5, 2]
    for col in ['ebc_cf_all', 'ebc_cf_25', 'ebc_cf_50', 'ebc_cf_75']:
        df.loc[(df[col] >= cf_lims[1]) | (df[col] <= cf_lims[0]), col] = np.nan

    df['Hcorr25'] = df.ebc_cf_25 * df['H' + _f_]
    df['Hcorr50'] = df.ebc_cf_50 * df['H' + _f_]
    df['Hcorr75'] = df.ebc_cf_75 * df['H' + _f_]
    df['Hcorr_ann'] = df.ebc_cf_all * df['H' + _f_]

    out_vars = [
        'LEcorr_ann',
        'LEcorr25',
        'LEcorr50',
        'LEcorr75',
        'ebc_cf',
        'Hcorr_ann',
        'Hcorr25',
        'Hcorr50',
        'Hcorr75',
    ]
    df_out = df[out_vars]

    cr = 1.0 / np.round(np.nanmean(df.ebc_cf_stable.values), 5)
    cf = np.round(np.nanmean(df.ebc_cf_stable.values), 5)
    if verbose:
        print(f'\n\tmean stable correction factor\n\t{cr} closure\n\t{cf} correction factor\n')
        print(f'\tclosure at site when filtered for stable conditions is:\t{cr}')

    return df_out
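

# --- Editor's illustration (not part of the released module) ---
# The core arithmetic of the correction: ebc_cf = (Rn - G) / (H + LE). With
# made-up half-hour values Rn = 500, G = 50, H = 150, LE = 225 W/m^2:
def _example_closure_factor():
    rn, g, h, le = 500.0, 50.0, 150.0, 225.0
    ebc_cf = (rn - g) / (h + le)  # 1.2: turbulent fluxes fall short of Rn - G
    print(ebc_cf, ebc_cf * le)    # corrected LE = 270 W/m^2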


def force_close_br_daily(in_df, filtered=True):
    """
    Performs forced closure using a daily Bowen ratio approach.
    """
    _f_ = '_filt' if filtered else ''
    df = in_df.copy()
    vars_to_use = ['LE' + _f_, 'H' + _f_, 'NETRAD' + _f_, 'G' + _f_]
    df = df[vars_to_use].astype(float).copy()

    # Fall back to NETRAD alone when G is missing or sparse (< 30% coverage),
    # matching the guard in force_close_fluxnet.
    if int(df['G' + _f_].count()) == 0 or df['G' + _f_].count() / len(df.index) < 0.3:
        df['_RadFlux_'] = df['NETRAD' + _f_]
        df['no_G_flag'] = 1
    else:
        df['_RadFlux_'] = df['NETRAD' + _f_] - df['G' + _f_]
        df['no_G_flag'] = 0

    min_period_thresh = int(12 * 3)
    df['cf'] = df['_RadFlux_'] / (df['LE' + _f_] + df['H' + _f_])
    df['cf_1day'] = df.cf.rolling('3D', min_periods=min_period_thresh, center=True).median()

    df['LEcorr_br'] = df['cf_1day'] * df['LE' + _f_]
    df.loc[(df.LEcorr_br >= 1200) | (df.LEcorr_br <= -150), 'LEcorr_br'] = np.nan

    df['Hcorr_br'] = df['cf_1day'] * df['H' + _f_]
    df.loc[(df.Hcorr_br >= 1200) | (df.Hcorr_br <= -150), 'Hcorr_br'] = np.nan

    out_vars = ['LEcorr_br', 'Hcorr_br']
    return df[out_vars]


def read_amflx_data(filename, site_meta_fname, filtered=True, gapfill_interp=True, verbose=True):
    """
    Reads, cleans, and processes an AmeriFlux data file.

    This is a comprehensive function that handles multiple steps:
    1. Reads the file and sets the time index.
    2. Identifies and extracts key variables (e.g., LE, H, NETRAD).
    3. Applies QAQC filters like spike removal and quantile filtering.
    4. Performs energy balance closure corrections.
    5. Converts time to UTC and solar time.

    Args:
        filename (str): Path to the AmeriFlux data file.
        site_meta_fname (str): Path to the site metadata file.
        filtered (bool): If True, applies QAQC filtering.
        gapfill_interp (bool): If True, interpolates small data gaps.
        verbose (bool): If True, prints status messages.

    Returns:
        pd.DataFrame: The fully processed and cleaned DataFrame.
    """
    site = filename.split('/')[-1].split('_')[1]
    if verbose:
        print(f'starting to process & clean:\t{site}')

    df_amf = pd.read_csv(filename, skiprows=2, header=0)
    df_amf['local_time'] = pd.to_datetime(df_amf['TIMESTAMP_END'], format='%Y%m%d%H%M')
    df_amf.set_index(['local_time'], inplace=True)
    if verbose:
        print('\tfile read and time set to local')

    df_amf = df_amf[df_amf.index >= '2018-10-01']
    df_amf.replace(-9999, np.nan, inplace=True)

    g_exists = False
    try:
        df_amf['G'] = calc_G(df_amf)
        g_exists = True
    except (KeyError, IndexError):
        print('\tno ground heat flux\n\tassigning 0 to G for energy balance closure')
        df_amf['G'] = 0
        df_amf['G_filt'] = 0

    if len([c for c in df_amf.columns if c.startswith('NETRAD')]) >= 1:
        df_amf['NETRAD'] = calc_NETRAD(df_amf)
    elif site == 'US-MMS':
        # No NETRAD column at US-MMS; reconstruct net radiation from the
        # four-component radiometer: SW_IN - SW_OUT + LW_IN - LW_OUT.
        df_amf['NETRAD'] = (
            df_amf['SW_IN_1_1_1'] - df_amf['SW_OUT_1_1_1']
            + df_amf['LW_IN_1_1_1'] - df_amf['LW_OUT_1_1_1']
        )
    else:
        df_amf['NETRAD'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('LE')]) >= 1:
        df_amf['LE'] = calc_LE(df_amf)
    else:
        df_amf['LE'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('H')]) >= 1:
        df_amf['H'] = calc_H(df_amf)
    else:
        df_amf['H'] = np.nan

    if verbose:
        print('\tchecked for energy balance variables')

    if len([c for c in df_amf.columns if c.startswith('SWC')]) >= 1:
        df_amf['SM_surf'] = calc_SWC(df_amf)
        df_amf['SM_rz'] = calc_all_SWC(df_amf)
    else:
        df_amf['SM_surf'], df_amf['SM_rz'] = np.nan, np.nan

    if len([c for c in df_amf.columns if c.startswith('RH')]) >= 1:
        df_amf['RH'] = calc_RH(df_amf)
    else:
        df_amf['RH'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('TA')]) >= 1:
        df_amf['AirTempC'] = calc_AirTemp(df_amf)
    else:
        df_amf['AirTempC'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('SW_IN')]) >= 1:
        df_amf['SW_IN'] = calc_SWin(df_amf)
    else:
        df_amf['SW_IN'] = np.nan

    if verbose:
        print('\tchecked for ancillary variables')

    if filtered:
        for var in ['LE', 'H', 'NETRAD']:
            df_amf = remove_spikes(df_amf, varnames=[var])
            df_amf = rolling_quantile_filter(df_amf, f'{var}_filt')

        if g_exists:
            df_amf = rolling_quantile_filter(df_amf, 'G')
            df_amf = remove_spikes(df_amf, varnames=['G'])

    if gapfill_interp:
        for var in ['LE_filt', 'H_filt', 'G_filt', 'NETRAD_filt']:
            if var in df_amf.columns:
                df_amf[var] = df_amf[var].interpolate('linear', limit=8)

    df_amf = filter_based_on_threshs(df_amf, filtered=True)
    df_corr_flux = force_close_fluxnet(df_amf, filtered=True)
    df_corr_br_daily = force_close_br_daily(df_amf, filtered=True)
    df_out = pd.concat([df_amf, df_corr_flux, df_corr_br_daily], axis=1)

    df_out['LE_std'] = df_out.LE.rolling(4, min_periods=3).std()
    df_out['LE_2hr_med'] = df_out.LE.rolling(4, min_periods=3).median()
    df_out['LE_2hr_avg'] = df_out.LE.rolling(4, min_periods=3).mean()

    print('\tmeta data read to access utc offset')
    offset = get_utc_hr_offset(site_meta_fname)
    out_times = change_to_utc(df_out.index, offset)
    df_out['time_utc'] = out_times

    site_long = get_lon(site_meta_fname)
    df_out['solar_time'] = utc_to_solar(df_out.time_utc, site_long)
    df_out['solar_hour'] = df_out['solar_time'].dt.hour
    df_out.set_index(['time_utc'], inplace=True)
    df_out['local_time'] = pd.to_datetime(df_out['TIMESTAMP_END'], format='%Y%m%d%H%M')

    return df_out
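

# --- Editor's illustration (not part of the released module) ---
# A typical call, with hypothetical file names following the AmeriFlux BASE
# convention (AMF_<SITE>_BASE_HH_<version>.csv) plus a BADM metadata sheet;
# neither path is shipped with the package.
def _example_read_site():
    df = read_amflx_data(
        'data/AMF_US-Ton_BASE_HH_17-5.csv',   # hypothetical data file
        'data/AMF_US-Ton_BIF_LATEST.xlsx',    # hypothetical metadata file
        filtered=True,
    )
    print(df[['LE_filt', 'LEcorr50', 'solar_hour']].describe())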


# --- OTHER UTILITIES ---
def LE_2_ETmm(LE_Wm2, freq='day'):
    """
    Converts Latent Energy (LE) flux to Evapotranspiration (ET).

    Args:
        LE_Wm2 (np.ndarray): Latent energy flux in W/m^2.
        freq (str): The time frequency ('30 min' or 'day').

    Returns:
        np.ndarray: Evapotranspiration in mm.
    """
    lambda_e = 2.460 * 10**6  # latent heat of vaporization (J/kg)
    roe_w = 1000              # density of water (kg/m^3)
    m_2_mm = 1000             # metres to millimetres

    if freq == '30 min':
        sec_conv = 60 * 30        # seconds per half hour
    elif freq == 'day':
        sec_conv = 60 * 30 * 48   # seconds per day
    else:
        raise ValueError("Invalid frequency. Choose '30 min' or 'day'.")

    mask = ~np.isnan(LE_Wm2)
    ET_mm = np.empty(LE_Wm2.shape)
    ET_mm[:] = np.nan
    ET_mm[mask] = LE_Wm2[mask] * (m_2_mm * sec_conv) / (lambda_e * roe_w)
    return ET_mm
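

# --- Editor's illustration (not part of the released module) ---
# Worked example: a daily-mean LE of 100 W/m^2 integrates to
# 100 * 86400 / (2.46e6 * 1000) * 1000 ≈ 3.51 mm/day of ET.
def _example_le_to_et():
    le = np.array([100.0, np.nan, 250.0])  # hypothetical daily means, W/m^2
    print(LE_2_ETmm(le, freq='day'))       # ~[3.51, nan, 8.78] mm/day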


def assign_time(in_df, time_col='time_UTC'):
    """
    Converts a specified time column to a datetime index.

    Args:
        in_df (pd.DataFrame): DataFrame with a time column.
        time_col (str): The name of the time column to use.

    Returns:
        pd.DataFrame: A new DataFrame with the time column set as the index.
    """
    df_test_var = in_df.copy()
    df_test_var['time'] = pd.to_datetime(df_test_var[time_col])
    df_test_var.set_index('time', inplace=True)
    df_test_var.drop(time_col, axis=1, inplace=True)
    return df_test_var.copy()
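

# --- Editor's illustration (not part of the released module) ---
# Minimal usage sketch with invented values.
def _example_assign_time():
    df = pd.DataFrame({'time_UTC': ['2019-07-01 00:00', '2019-07-01 00:30'],
                       'LE': [50.0, 60.0]})
    print(assign_time(df).index)  # DatetimeIndex named 'time'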