ECOv002-calval-tables 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,736 @@
1
+ """
2
+ This module contains functions for processing and quality-controlling
3
+ AmeriFlux eddy covariance data.
4
+
5
+ It includes utilities for filtering sites based on metadata, handling time
6
+ conversions (UTC, local, and solar time), and performing energy balance
7
+ closure corrections. The module also provides functions for reading and
8
+ cleaning raw AmeriFlux data, as well as converting latent heat flux to
9
+ evapotranspiration.
10
+ """
11
+ import os
12
+ import sys
13
+ import warnings
14
+ from datetime import timedelta
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from tables import NaturalNameWarning
19
+
20
# Ignore specific warnings to prevent clutter
# NaturalNameWarning comes from PyTables when HDF keys are not valid Python
# identifiers; RuntimeWarnings and all-NaN-slice messages occur routinely in
# the np.nanmedian/np.nanmean calls used during QA/QC below.
warnings.filterwarnings(action='ignore', category=NaturalNameWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.filterwarnings(action='ignore', message='All-NaN slice encountered')
# Several functions below assign into DataFrame slices; silence pandas'
# SettingWithCopyWarning for those chained assignments.
pd.options.mode.chained_assignment = None

# Paths are resolved from the current working directory, so this module
# assumes it is executed from the project root that contains 'data/'.
REL_PATH = os.getcwd() + '/'
DATA_PATH = REL_PATH + 'data/AMF_metadata/'
28
+
29
+
30
+ # --- SITE METADATA FILTERING ---
31
def limit_cols(sites):
    """
    Reduce an AmeriFlux site-metadata DataFrame to the descriptive columns.

    Strips non-breaking spaces from the site IDs and re-indexes the frame
    before selecting the name/location/climate columns.

    Args:
        sites (pd.DataFrame): AmeriFlux site metadata. Note: modified in
            place — a 'new_index' column is added and the index replaced.

    Returns:
        pd.DataFrame: The same data limited to the selected columns, with a
        cleaned site-ID index.
    """
    cleaned_ids = [site_id.replace('\xa0', '') for site_id in sites.index]
    keep = ['Name', 'Lat', 'Long', 'Elev_(m)', 'Clim', 'Veg', 'MAT_(°C)', 'MAP_(mm)']
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sites['new_index'] = cleaned_ids
        sites.set_index(sites.new_index, inplace=True)
        out_df = sites[keep]
    return out_df
53
+
54
+
55
def filter_sites(filename=DATA_PATH + 'ameriflux_meta.csv'):
    """
    Filter AmeriFlux sites to those usable for ECOSTRESS comparisons.

    Keeps sites whose latitude lies strictly between 53.6S and 53.6N and
    whose data-use policy is CC-BY-4.0, then trims the metadata columns.

    Args:
        filename (str): Path to the AmeriFlux metadata CSV.

    Returns:
        pd.DataFrame: Filtered site metadata indexed by site ID ('Sites').
    """
    table = pd.read_csv(filename)
    table.set_index(table.columns[0], inplace=True)

    # The third remaining column holds latitude in this metadata export.
    lat_col = table.columns[2]
    in_lat_band = table[(table[lat_col] > -53.6) & (table[lat_col] < 53.6)]
    # Column headers carry footnote markers after a non-breaking space;
    # keep only the leading token and normalize spaces to underscores.
    in_lat_band.columns = [
        c.split('\xa0')[0].replace(' ', '_') for c in in_lat_band.columns
    ]

    open_license = in_lat_band[in_lat_band['Data_Use_Policy1'] == 'CC-BY-4.0']
    out_df = limit_cols(open_license)
    out_df.index.rename('Sites', inplace=True)

    return out_df
84
+
85
+
86
def get_dois(in_path=DATA_PATH):
    """
    Load the DOI associated with each AmeriFlux site.

    Args:
        in_path (str): Directory containing 'ameriflux_citations.csv'.

    Returns:
        pd.DataFrame: Indexed by 'site_id' with a single 'doi' column.
    """
    citations = pd.read_csv(in_path + 'ameriflux_citations.csv')
    citations.set_index('site_id', inplace=True)
    return citations[['doi']]
100
+
101
+
102
def create_table_1(save_to_csv=False, out_dir=''):
    """
    Build the site table by joining filtered site metadata with DOIs.

    Args:
        save_to_csv (bool): When True, also write '<out_dir>table1.csv'.
        out_dir (str): Directory prefix for the CSV output.

    Returns:
        pd.DataFrame: Filtered site metadata merged with DOIs on site ID.
    """
    merged = filter_sites().merge(get_dois(), left_index=True, right_index=True)
    if save_to_csv:
        merged.to_csv(out_dir + 'table1.csv')
    return merged
120
+
121
+
122
+ # --- TIME CONVERSION FUNCTIONS ---
123
def get_utc_hr_offset(site_meta_fname):
    """
    Read a site's UTC offset in whole hours from its metadata Excel file.

    The first 'UTC_OFFSET' entry in the metadata is used; fractional
    offsets are truncated toward zero by int().

    Args:
        site_meta_fname (str): Path to the site metadata Excel file.

    Returns:
        int: The UTC offset in hours.
    """
    site_meta = pd.read_excel(site_meta_fname)
    offsets = site_meta.DATAVALUE[site_meta['VARIABLE'] == 'UTC_OFFSET']
    first_value = next(iter(np.array(offsets)))
    utc_offset = int(float(first_value))
    print(f'\tutc offset is:\t{utc_offset}')
    return utc_offset
140
+
141
+
142
def change_to_utc(times, utc_offset):
    """
    Shift a local-time series to UTC by removing the site's UTC offset.

    Args:
        times (pd.DatetimeIndex): Local timestamps.
        utc_offset (int): Site UTC offset in hours.

    Returns:
        pd.DatetimeIndex: Timestamps shifted to UTC.
    """
    offset = pd.DateOffset(hours=utc_offset)
    return times - offset
154
+
155
+
156
def change_to_local(times, utc_offset):
    """
    Shift a UTC time series to local time by adding the site's UTC offset.

    Args:
        times (pd.DatetimeIndex): UTC timestamps.
        utc_offset (int): Site UTC offset in hours.

    Returns:
        pd.DatetimeIndex: Timestamps shifted to local time.
    """
    print('creating local time columns')
    offset = pd.DateOffset(hours=utc_offset)
    return times + offset
169
+
170
+
171
def get_lon(site_meta_fname):
    """
    Read a site's longitude from its metadata Excel file.

    Args:
        site_meta_fname (str): Path to the site metadata Excel file.

    Returns:
        float: The first 'LOCATION_LONG' entry, in decimal degrees.
    """
    site_meta = pd.read_excel(site_meta_fname)
    longitudes = site_meta.DATAVALUE[site_meta['VARIABLE'] == 'LOCATION_LONG']
    return np.array(longitudes).astype(float)[0]
184
+
185
+
186
def longitude_to_offset(longitude_deg):
    """
    Convert a longitude to its mean-solar-time offset from UTC.

    Earth rotates 180 degrees per 12 hours, so the offset is
    longitude / 15 hours (positive east of Greenwich).

    Args:
        longitude_deg (float): Longitude in decimal degrees.

    Returns:
        timedelta: Solar-time offset corresponding to the longitude.
    """
    solar_hours = np.radians(longitude_deg) / np.pi * 12
    return timedelta(hours=solar_hours)
197
+
198
+
199
def utc_to_solar(datetime_utc, longitude_deg):
    """
    Convert UTC timestamps to mean solar time at the given longitude.

    No equation-of-time correction is applied; the shift is purely the
    longitude-based offset.

    Args:
        datetime_utc (pd.DatetimeIndex): Timestamps in UTC.
        longitude_deg (float): Longitude in decimal degrees.

    Returns:
        pd.DatetimeIndex: Timestamps shifted to solar time.
    """
    solar_offset = longitude_to_offset(longitude_deg)
    return datetime_utc + solar_offset
211
+
212
+
213
+ # --- VARIABLE CALCULATION FUNCTIONS ---
214
def calc_SWin(in_df):
    """
    Mean incoming shortwave radiation across the raw SW_IN columns.

    Columns starting with 'SW_IN' are averaged row-wise; any column whose
    name contains '_F' is excluded.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading SW_IN')
    selected = [
        name for name in in_df.columns
        if name.startswith('SW_IN') and '_F' not in name
    ]
    return in_df[selected].mean(axis=1).to_numpy()
221
+
222
+
223
def calc_H(in_df):
    """
    Mean sensible heat flux across the raw H columns.

    Columns starting with 'H' are averaged row-wise after excluding names
    containing 'H2O', 'SSITC', or '_F'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading H')
    excluded_tags = ('H2O', 'SSITC', '_F')
    selected = [
        name for name in in_df.columns
        if name.startswith('H') and not any(tag in name for tag in excluded_tags)
    ]
    return in_df[selected].mean(axis=1).to_numpy()
233
+
234
+
235
def calc_G(in_df):
    """
    Mean ground heat flux across the raw G columns.

    Columns starting with 'G' are averaged row-wise after excluding names
    containing 'GPP' or '_F'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading G')
    selected = [
        name for name in in_df.columns
        if name.startswith('G') and 'GPP' not in name and '_F' not in name
    ]
    return in_df[selected].mean(axis=1).to_numpy()
244
+
245
+
246
def calc_NETRAD(in_df):
    """
    Mean net radiation across the raw NETRAD columns.

    Columns starting with 'NETRAD' are averaged row-wise, excluding any
    containing '_F'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading NETRAD')
    selected = [
        name for name in in_df.columns
        if name.startswith('NETRAD') and '_F' not in name
    ]
    return in_df[selected].mean(axis=1).to_numpy()
254
+
255
+
256
def calc_LE(in_df):
    """
    Mean latent heat flux across the raw LE columns.

    Columns starting with 'LE' are averaged row-wise after excluding names
    containing 'SSITC', 'LEAF', or '_F'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading LE')
    selected = [
        c for c in in_df.columns
        if c.startswith('LE') and 'SSITC' not in c and 'LEAF' not in c and '_F' not in c
    ]
    # FIX: a leftover debug statement printed the full LE array on every
    # call; removed so this helper only announces itself like its siblings.
    return in_df[selected].mean(axis=1).values
268
+
269
+
270
def calc_SWC(in_df):
    """
    Mean surface soil water content across sensor positions.

    For each horizontal position 1-8, the first column matching
    'SWC_<i>_1' (the '_1' vertical position) is taken; '_PI' columns are
    then dropped and the remainder averaged row-wise.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading SWC surface')
    surface_cols = []
    for position in range(1, 9):
        matches = in_df.columns[in_df.columns.str.startswith(f'SWC_{position}_1')]
        if len(matches) > 0:
            surface_cols.append(matches[0])
    keep = [name for name in surface_cols if '_PI' not in name]
    return in_df[keep].mean(axis=1).to_numpy()
286
+
287
+
288
def calc_all_SWC(in_df):
    """
    Mean soil water content across every SWC sensor column.

    All columns starting with 'SWC_' are averaged row-wise, excluding any
    containing '_PI'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading SWC all')
    swc_cols = in_df.columns[in_df.columns.str.startswith('SWC_')]
    keep = [name for name in swc_cols if '_PI' not in name]
    return in_df[keep].mean(axis=1).to_numpy()
296
+
297
+
298
def calc_RH(in_df):
    """
    Mean relative humidity across the raw RH columns.

    Columns starting with 'RH' are averaged row-wise, excluding any
    containing '_PI'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading RH')
    selected = [
        name for name in in_df.columns
        if name.startswith('RH') and '_PI' not in name
    ]
    return in_df[selected].mean(axis=1).to_numpy()
306
+
307
+
308
def calc_AirTemp(in_df):
    """
    Mean air temperature across the raw TA columns.

    Columns starting with 'TA' are averaged row-wise after excluding names
    containing 'TAU' or '_PI'.

    Args:
        in_df (pd.DataFrame): AmeriFlux data.

    Returns:
        np.ndarray: Row-wise mean of the selected columns.
    """
    print('\treading Air Temperature')
    selected = [
        name for name in in_df.columns
        if name.startswith('TA') and 'TAU' not in name and '_PI' not in name
    ]
    return in_df[selected].mean(axis=1).to_numpy()
317
+
318
+
319
+ # --- QAQC AND ENERGY BALANCE CLOSURE ---
320
def remove_spikes(in_df, varnames=('LE',), z=6.5):
    """
    Remove spikes using the median-of-absolute-deviation test of
    Papale et al. (2006).

    Day and night records are despiked separately: for each variable the
    double-differenced series d_i = (x_i - x_{i-1}) - (x_{i+1} - x_i) is
    compared against z * MAD / 0.6745 around its median, and values outside
    that band are set to NaN. The result is stored in a new '<var>_filt'
    column added to *in_df*.

    Args:
        in_df (pd.DataFrame): AmeriFlux data with a datetime index and a
            'NETRAD' column. Modified in place: '<var>_filt' columns are
            added.
        varnames (iterable of str): Variables to despike. FIX: the default
            was previously a mutable list shared across calls; a tuple is
            used instead (backward-compatible, iteration only).
        z (float): Outlier threshold; larger values are more conservative.

    Returns:
        pd.DataFrame: *in_df* with the filtered columns added.
    """
    df_temp = in_df.copy()
    # Day: positive NETRAD, or missing NETRAD during hours 07-17.
    # (& binds tighter than |, so the hour test only applies to NaN NETRAD.)
    df_day = df_temp[
        (df_temp.NETRAD > 0)
        | (df_temp.NETRAD.isnull())
        & ((df_temp.index.hour >= 7) & (df_temp.index.hour < 17))
    ]
    # Night: the complement — non-positive NETRAD, or missing outside 07-17.
    df_night = df_temp[
        (df_temp.NETRAD <= 0)
        | (df_temp.NETRAD.isnull())
        & ((df_temp.index.hour < 7) | ((df_temp.index.hour >= 17)))
    ]

    for var in varnames:
        # Double-differenced series: an isolated spike yields a large |d_i|.
        di_n = df_night[var].diff() - (df_night[var].diff(periods=-1) * -1.0)
        di_d = df_day[var].diff() - (df_day[var].diff(periods=-1) * -1.0)
        md_n = np.nanmedian(di_n)
        md_d = np.nanmedian(di_d)
        # Median absolute deviation; 0.6745 scales MAD to a normal sigma.
        mad_n = np.nanmedian(np.abs(di_n - md_n))
        mad_d = np.nanmedian(np.abs(di_d - md_d))

        mask_nh = di_n < md_n - (z * mad_n / 0.6745)
        mask_nl = di_n > md_n + (z * mad_n / 0.6745)
        df_night.loc[mask_nh | mask_nl, var] = np.nan

        mask_dh = di_d < md_d - (z * mad_d / 0.6745)
        mask_dl = di_d > md_d + (z * mad_d / 0.6745)
        df_day.loc[mask_dh | mask_dl, var] = np.nan

        # Day and night partitions are disjoint, so verify_integrity holds.
        df_out = pd.concat([df_night, df_day], verify_integrity=True).sort_index()
        vnameout = var + '_filt'
        in_df[vnameout] = df_out[var]
        print(f'\t{var}_filt created')
    return in_df
368
+
369
+
370
def rolling_quantile_filter(in_df, _var_='LE'):
    """
    Remove residual outliers with a rolling 15-day inter-quartile filter.

    Values of *_var_* outside [Q25 - 2.5*IQR, Q75 + 2.5*IQR], where the
    quartiles are computed over a 15-day rolling window requiring at least
    5 days of half-hourly records (240 observations), are set to NaN.

    Args:
        in_df (pd.DataFrame): Data with a datetime index.
        _var_ (str): Name of the column to filter.

    Returns:
        pd.DataFrame: Copy of *in_df* with outliers in *_var_* set to NaN.
    """
    df = in_df.copy()
    min_obs = int(48 * 5)
    rolling = df[_var_].rolling('15D', min_periods=min_obs)
    q25 = rolling.quantile(0.25)
    q75 = rolling.quantile(0.75)
    spread = q75 - q25
    upper = spread * 2.5 + q75
    lower = q25 - spread * 2.5

    df.loc[df[_var_] > upper, _var_] = np.nan
    df.loc[df[_var_] < lower, _var_] = np.nan

    return df
392
+
393
+
394
def filter_based_on_threshs(
    in_df,
    LE_threshes=(-150, 1200),
    H_threshes=(-150, 1200),
    NETRAD_threshes=(-250, 1400),
    G_threshes=(-250, 500),
    filtered=True,
):
    """
    Set flux values that fall outside physical thresholds to NaN.

    Args:
        in_df (pd.DataFrame): Flux data containing LE/H/NETRAD/G columns
            (with a '_filt' suffix when *filtered* is True).
        LE_threshes (sequence): (min, max) bounds for LE.
        H_threshes (sequence): (min, max) bounds for H.
        NETRAD_threshes (sequence): (min, max) bounds for NETRAD.
        G_threshes (sequence): (min, max) bounds for G.
        filtered (bool): When True, operate on the '_filt' columns.

    Returns:
        pd.DataFrame: Copy of *in_df* with out-of-range values set to NaN.

    Notes:
        FIX: H was previously bounded with *LE_threshes*, silently ignoring
        any *H_threshes* passed by the caller (defaults are identical, so
        default behavior is unchanged). Mutable list defaults were also
        replaced with tuples.
    """
    _f_ = '_filt' if filtered else ''
    df_amf = in_df.copy()

    bounds = {
        'LE' + _f_: LE_threshes,
        'NETRAD' + _f_: NETRAD_threshes,
        'G' + _f_: G_threshes,
        'H' + _f_: H_threshes,
    }
    for col, (low, high) in bounds.items():
        df_amf.loc[df_amf[col] < low, col] = np.nan
        df_amf.loc[df_amf[col] > high, col] = np.nan

    return df_amf
430
+
431
+
432
def force_close_fluxnet(in_df, filtered=False, verbose=True):
    """
    Perform Energy Balance Forced Closure following the Fluxnet approach.

    Computes the half-hourly closure correction factor
    ebc_cf = (NETRAD - G) / (H + LE), discards IQR outliers from it, and
    applies both a site-wide median factor and rolling 15-day 25/50/75
    percentile factors to LE and H.

    Args:
        in_df (pd.DataFrame): AmeriFlux data with a datetime index and
            LE/H/NETRAD/G columns ('_filt' suffix when *filtered*).
        filtered (bool): If True, use the '_filt' columns.
        verbose (bool): If True, print closure statistics.

    Returns:
        pd.DataFrame: The corrected flux columns ('LEcorr_ann', 'LEcorr25',
        'LEcorr50', 'LEcorr75', 'Hcorr_ann', 'Hcorr25', 'Hcorr50',
        'Hcorr75') plus the raw 'ebc_cf' factor.
    """
    _f_ = '_filt' if filtered else ''
    df = in_df.copy()

    vars_to_use = ['LE' + _f_, 'H' + _f_, 'NETRAD' + _f_, 'G' + _f_]
    df = df[vars_to_use].astype(float).copy()

    # Available energy: NETRAD - G, falling back to NETRAD alone when G is
    # entirely absent or covers less than 30% of the record.
    if int(df['G' + _f_].count()) == 0 or df['G' + _f_].count() / len(df.index) < 0.3:
        df['_RadFlux_'] = df['NETRAD' + _f_]
        df['no_G_flag'] = 1
        if verbose:
            print('\tno valid G data available')
    else:
        df['_RadFlux_'] = df['NETRAD' + _f_] - df['G' + _f_]
        df['no_G_flag'] = 0

    # Half-hourly correction factor: available energy / turbulent fluxes.
    df['ebc_cf'] = df['_RadFlux_'] / (df['H' + _f_] + df['LE' + _f_])
    Q1 = df['ebc_cf'].quantile(0.25)
    Q3 = df['ebc_cf'].quantile(0.75)
    IQR = Q3 - Q1

    # Drop correction factors beyond 1.5*IQR of the quartiles.
    filtered_df = df.query('(@Q1 - 1.5 * @IQR) <= ebc_cf <= (@Q3 + 1.5 * @IQR)')
    removed_mask = set(df.index) - set(filtered_df.index)
    removed_mask = pd.to_datetime(list(removed_mask))
    # NOTE(review): chained assignment — relies on the module-level
    # pd.options.mode.chained_assignment = None to stay silent.
    df.ebc_cf.loc[removed_mask] = np.nan

    if verbose:
        print(f'\tmean correction factor is: {np.round(np.nanmean(df.ebc_cf.values), 2)}')
        print(f'\tclosure ratio mean is: {1 / df.ebc_cf.mean()}')
        print(
            f'\tpercent of valid closure crs is: {100 * df["ebc_cf"].count() / len(df.index)}',
        )

    # Site-wide (annual) factor and a copy restricted to 'stable' hours.
    df['ebc_cf_all'] = df.ebc_cf.median()
    df['ebc_cf_stable'] = df.ebc_cf.copy()

    min_period_thresh = 48
    # Keep only near-midnight (21-03) and near-midday (11-14) hours for the
    # 'stable' factor; everything else is set to NaN.
    night_or_day_mask = (df.index.hour > 20) | (df.index.hour <= 3) | ((df.index.hour > 10) & (df.index.hour <= 14))
    df.loc[~night_or_day_mask, 'ebc_cf_stable'] = np.nan
    # Rolling 15-day quantiles of the stable factor (centered windows).
    df['ebc_cf_25'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.25,
        interpolation='nearest',
    )
    df['ebc_cf_50'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.5,
        interpolation='nearest',
    )
    df['ebc_cf_75'] = df.ebc_cf_stable.rolling('15D', min_periods=min_period_thresh, center=True).quantile(
        0.75,
        interpolation='nearest',
    )

    # Scale LE by each version of the correction factor.
    df['LEcorr25'] = df.ebc_cf_25 * df['LE' + _f_]
    df['LEcorr50'] = df.ebc_cf_50 * df['LE' + _f_]
    df['LEcorr75'] = df.ebc_cf_75 * df['LE' + _f_]
    df['LEcorr_ann'] = df.ebc_cf_all * df['LE' + _f_]

    # Discard corrected LE outside plausible limits.
    le_lims = [-100, 800]
    for col in ['LEcorr_ann', 'LEcorr25', 'LEcorr50', 'LEcorr75']:
        df.loc[(df[col] >= le_lims[1]) | (df[col] <= le_lims[0]), col] = np.nan

    # Discard extreme correction factors (note: applied AFTER the LE
    # corrections but BEFORE the H corrections below).
    cf_lims = [0.5, 2]
    for col in ['ebc_cf_all', 'ebc_cf_25', 'ebc_cf_50', 'ebc_cf_75']:
        df.loc[(df[col] >= cf_lims[1]) | (df[col] <= cf_lims[0]), col] = np.nan

    df['Hcorr25'] = df.ebc_cf_25 * df['H' + _f_]
    df['Hcorr50'] = df.ebc_cf_50 * df['H' + _f_]
    df['Hcorr75'] = df.ebc_cf_75 * df['H' + _f_]
    df['Hcorr_ann'] = df.ebc_cf_all * df['H' + _f_]

    out_vars = [
        'LEcorr_ann',
        'LEcorr25',
        'LEcorr50',
        'LEcorr75',
        'ebc_cf',
        'Hcorr_ann',
        'Hcorr25',
        'Hcorr50',
        'Hcorr75',
    ]
    df_out = df[out_vars]

    # Summary statistics of the stable factor (cr = closure ratio).
    cr = 1.0 / np.round(np.nanmean(df.ebc_cf_stable.values), 5)
    cf = np.round(np.nanmean(df.ebc_cf_stable.values), 5)
    if verbose:
        print(f'\n\tmean stable correction factor\n\t{cr} closure\n\t{cf} correction factor\n')
        print(f'\tclosure at site when filtered for stable conditions is:\t{cr}')

    return df_out
535
+
536
+
537
def force_close_br_daily(in_df, filtered=True):
    """
    Force energy balance closure with a rolling daily Bowen-ratio factor.

    Computes cf = (NETRAD - G) / (LE + H) per half-hour, takes a centered
    3-day rolling median (requiring at least 36 observations), and scales
    LE and H by it. Corrected values outside (-150, 1200) are set to NaN.

    Args:
        in_df (pd.DataFrame): AmeriFlux data with a datetime index and
            LE/H/NETRAD/G columns ('_filt' suffix when *filtered*).
        filtered (bool): If True, use the '_filt' columns.

    Returns:
        pd.DataFrame: Columns 'LEcorr_br' and 'Hcorr_br'.

    Notes:
        FIX: the G-availability test was inverted ('> 0.3'), which ignored
        G exactly when enough G data existed. It now matches the
        force_close_fluxnet convention: fall back to NETRAD alone only when
        G is absent or covers less than 30% of the record.
    """
    _f_ = '_filt' if filtered else ''
    df = in_df.copy()
    vars_to_use = ['LE' + _f_, 'H' + _f_, 'NETRAD' + _f_, 'G' + _f_]
    df = df[vars_to_use].astype(float).copy()

    if int(df['G' + _f_].count()) == 0 or df['G' + _f_].count() / len(df.index) < 0.3:
        df['_RadFlux_'] = df['NETRAD' + _f_]
        df['no_G_flag'] = 1
    else:
        df['_RadFlux_'] = df['NETRAD' + _f_] - df['G' + _f_]
        df['no_G_flag'] = 0

    # Require 36 half-hours (18 hours of data) inside the 3-day window.
    min_period_thresh = int(12 * 3)
    df['cf'] = df['_RadFlux_'] / (df['LE' + _f_] + df['H' + _f_])
    df['cf_1day'] = df.cf.rolling('3D', min_periods=min_period_thresh, center=True).median()

    df['LEcorr_br'] = df['cf_1day'] * df['LE' + _f_]
    df.loc[(df.LEcorr_br >= 1200) | (df.LEcorr_br <= -150), 'LEcorr_br'] = np.nan

    df['Hcorr_br'] = df['cf_1day'] * df['H' + _f_]
    df.loc[(df.Hcorr_br >= 1200) | (df.Hcorr_br <= -150), 'Hcorr_br'] = np.nan

    out_vars = ['LEcorr_br', 'Hcorr_br']
    return df[out_vars]
565
+
566
+
567
def read_amflx_data(filename, site_meta_fname, filtered=True, gapfill_interp=True, verbose=True):
    """
    Read, clean, and process an AmeriFlux data file.

    This is a comprehensive function that handles multiple steps:
    1. Reads the file and sets the time index from TIMESTAMP_END.
    2. Extracts key variables (G, NETRAD, LE, H, SWC, RH, TA, SW_IN).
    3. Applies QAQC filters (spike removal, rolling quantile filter).
    4. Performs energy balance closure corrections (Fluxnet and daily
       Bowen-ratio approaches).
    5. Converts the index to UTC and derives solar time.

    Args:
        filename (str): Path to the AmeriFlux CSV data file; the site ID is
            parsed from the second '_'-separated token of the basename.
        site_meta_fname (str): Path to the site metadata Excel file.
        filtered (bool): If True, applies QAQC filtering.
        gapfill_interp (bool): If True, interpolates gaps of up to 8 steps.
        verbose (bool): If True, prints status messages.

    Returns:
        pd.DataFrame: The processed DataFrame, indexed by UTC time, with
        raw, filtered, and closure-corrected flux columns.
    """
    site = filename.split('/')[-1].split('_')[1]
    if verbose:
        print(f'starting to process & clean:\t{site}')

    # First two rows of AmeriFlux BASE CSVs are header/metadata lines.
    df_amf = pd.read_csv(filename, skiprows=2, header=0)
    df_amf['local_time'] = pd.to_datetime(df_amf['TIMESTAMP_END'], format='%Y%m%d%H%M')
    df_amf.set_index(['local_time'], inplace=True)
    if verbose:
        print('\tfile read and time set to local')

    # Keep only records from Oct 2018 onward; -9999 is the missing-data
    # sentinel in AmeriFlux files.
    df_amf = df_amf[df_amf.index >= '2018-10-01']
    df_amf.replace(-9999, np.nan, inplace=True)

    g_exists = False
    try:
        df_amf['G'] = calc_G(df_amf)
        g_exists = True
    except (KeyError, IndexError):
        # No ground heat flux columns: use 0 so closure can still run.
        print('\tno ground heat flux\nassigning 0 to G for energy balance closure')
        df_amf['G'] = 0
        df_amf['G_filt'] = 0

    if len([c for c in df_amf.columns if c.startswith('NETRAD')]) >= 1:
        df_amf['NETRAD'] = calc_NETRAD(df_amf)
    elif site == 'US-MMS':
        # Site-specific fallback: reconstruct net radiation from the four
        # component radiation streams.
        df_amf['NETRAD'] = df_amf['SW_IN_1_1_1'] - df_amf['SW_OUT_1_1_1'] + df_amf['LW_IN_1_1_1'] - df_amf['LW_OUT_1_1_1']
    else:
        df_amf['NETRAD'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('LE')]) >= 1:
        df_amf['LE'] = calc_LE(df_amf)
    else:
        df_amf['LE'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('H')]) >= 1:
        df_amf['H'] = calc_H(df_amf)
    else:
        df_amf['H'] = np.nan

    if verbose:
        print('\tchecked for energy balance variables')

    if len([c for c in df_amf.columns if c.startswith('SWC')]) >= 1:
        df_amf['SM_surf'] = calc_SWC(df_amf)
        df_amf['SM_rz'] = calc_all_SWC(df_amf)
    else:
        df_amf['SM_surf'], df_amf['SM_rz'] = np.nan, np.nan

    if len([c for c in df_amf.columns if c.startswith('RH')]) >= 1:
        df_amf['RH'] = calc_RH(df_amf)
    else:
        df_amf['RH'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('TA')]) >= 1:
        df_amf['AirTempC'] = calc_AirTemp(df_amf)
    else:
        df_amf['AirTempC'] = np.nan

    if len([c for c in df_amf.columns if c.startswith('SW_IN')]) >= 1:
        df_amf['SW_IN'] = calc_SWin(df_amf)
    else:
        df_amf['SW_IN'] = np.nan

    if verbose:
        print('\tchecked for ancillary variables')

    if filtered:
        # Despike then quantile-filter each turbulent/radiation variable.
        for var in ['LE', 'H', 'NETRAD']:
            df_amf = remove_spikes(df_amf, varnames=[var])
            df_amf = rolling_quantile_filter(df_amf, f'{var}_filt')

        if g_exists:
            # NOTE(review): G is quantile-filtered before despiking,
            # opposite order to LE/H/NETRAD above — confirm intentional.
            df_amf = rolling_quantile_filter(df_amf, 'G')
            df_amf = remove_spikes(df_amf, varnames=['G'])

    if gapfill_interp:
        # Fill gaps of up to 8 consecutive half-hours (4 hours) linearly.
        for var in ['LE_filt', 'H_filt', 'G_filt', 'NETRAD_filt']:
            if var in df_amf.columns:
                df_amf[var].interpolate('linear', limit=8, inplace=True)

    # NOTE(review): these closure steps hard-code filtered=True and so
    # require the '_filt' columns created above — calling this function
    # with filtered=False likely fails here; confirm against callers.
    df_amf = filter_based_on_threshs(df_amf, filtered=True)
    df_corr_flux = force_close_fluxnet(df_amf, filtered=True)
    df_corr_br_daily = force_close_br_daily(df_amf, filtered=True)
    df_out = pd.concat([df_amf, df_corr_flux, df_corr_br_daily], axis=1)

    # 2-hour (4 half-hour) rolling statistics of the raw LE series.
    df_out['LE_std'] = df_out.LE.rolling(4, min_periods=3).std()
    df_out['LE_2hr_med'] = df_out.LE.rolling(4, min_periods=3).median()
    df_out['LE_2hr_avg'] = df_out.LE.rolling(4, min_periods=3).mean()

    print('\tmeta data read to access utc offset')
    offset = get_utc_hr_offset(site_meta_fname)
    out_times = change_to_utc(df_out.index, offset)
    df_out['time_utc'] = out_times

    # Derive solar time from UTC and the site longitude.
    site_long = get_lon(site_meta_fname)
    df_out['solar_time'] = utc_to_solar(df_out.time_utc, site_long)
    df_out['solar_hour'] = df_out['solar_time'].dt.hour
    df_out.set_index(['time_utc'], inplace=True)
    # Re-create the local_time column (the original one became the index
    # and was consumed by set_index above).
    df_out['local_time'] = pd.to_datetime(df_out['TIMESTAMP_END'], format='%Y%m%d%H%M')

    return df_out
689
+
690
+
691
+ # --- OTHER UTILITIES ---
692
def LE_2_ETmm(LE_Wm2, freq='day'):
    """
    Convert latent heat flux to an evapotranspiration depth in mm.

    Divides by the latent heat of vaporization and the density of water,
    then scales by the number of seconds each value represents.

    Args:
        LE_Wm2 (np.ndarray): Latent heat flux in W/m^2 (NaNs preserved).
        freq (str): '30 min' for half-hourly values, 'day' for daily means.

    Returns:
        np.ndarray: ET in mm per period, NaN where the input is NaN.

    Raises:
        ValueError: If *freq* is neither '30 min' nor 'day'.
    """
    lambda_e = 2.460 * 10**6  # latent heat of vaporization, J/kg
    roe_w = 1000              # density of water, kg/m^3
    m_2_mm = 1000             # metres to millimetres

    if freq == '30 min':
        sec_conv = 60 * 30
    elif freq == 'day':
        sec_conv = 60 * 30 * 48
    else:
        raise ValueError("Invalid frequency. Choose '30 min' or 'day'.")

    valid = ~np.isnan(LE_Wm2)
    ET_mm = np.full(LE_Wm2.shape, np.nan)
    ET_mm[valid] = LE_Wm2[valid] * (m_2_mm * sec_conv) / (lambda_e * roe_w)
    return ET_mm
719
+
720
+
721
def assign_time(in_df, time_col='time_UTC'):
    """
    Promote a time column to a datetime index.

    Args:
        in_df (pd.DataFrame): DataFrame containing *time_col*.
        time_col (str): Name of the column holding timestamps.

    Returns:
        pd.DataFrame: A copy of *in_df* indexed by the parsed timestamps
        (index named 'time'), with *time_col* removed.
    """
    out = in_df.copy()
    out['time'] = pd.to_datetime(out[time_col])
    out.set_index('time', inplace=True)
    out = out.drop(columns=[time_col])
    return out.copy()