loone-data-prep 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff shows the content of publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
- loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +2 -1
- loone_data_prep/LOONE_DATA_PREP.py +115 -2
- loone_data_prep/flow_data/S65E_total.py +71 -6
- loone_data_prep/flow_data/forecast_bias_correction.py +193 -8
- loone_data_prep/flow_data/get_inflows.py +130 -41
- loone_data_prep/flow_data/get_outflows.py +110 -26
- loone_data_prep/flow_data/hydro.py +121 -27
- loone_data_prep/utils.py +339 -62
- loone_data_prep/water_level_data/get_all.py +208 -11
- loone_data_prep/water_level_data/hydro.py +71 -3
- loone_data_prep/water_quality_data/get_inflows.py +88 -3
- loone_data_prep/water_quality_data/get_lake_wq.py +85 -3
- loone_data_prep/water_quality_data/wq.py +44 -0
- loone_data_prep/weather_data/get_all.py +126 -3
- loone_data_prep/weather_data/weather.py +185 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.8.dist-info}/METADATA +2 -1
- loone_data_prep-0.1.8.dist-info/RECORD +27 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.8.dist-info}/WHEEL +1 -1
- loone_data_prep-0.1.6.dist-info/RECORD +0 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.8.dist-info}/LICENSE +0 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.8.dist-info}/top_level.txt +0 -0
loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py

@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
     LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
     # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
     LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+    LO_Stage.index = LO_Stage["date"]
     # Calculate average
     if "Average_Stage" not in LO_Stage.columns:
         LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
-        LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+        LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
     LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
     LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
     LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
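The fix above indexes the stage table by its date column and excludes that column from the row-wise mean; calling `DataFrame.mean(axis=1)` on a frame that still contains the date column can either raise or silently skip values depending on the pandas version. A minimal sketch of the corrected pattern, with invented column names:

```python
import pandas as pd

# Toy stage table: one date column plus two gauge columns (names invented).
stages = pd.DataFrame({
    "date": pd.date_range("2023-01-01", periods=3, freq="D"),
    "gauge_a_ft": [12.0, 12.2, 12.1],
    "gauge_b_ft": [11.8, 12.0, 12.3],
})

# Average across the gauge columns only; dropping "date" first keeps the
# row-wise mean purely numeric regardless of pandas version.
stages["Average_Stage"] = stages.drop(columns=["date"]).mean(axis=1)
print(stages)
```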
loone_data_prep/LOONE_DATA_PREP.py

@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
     LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
     LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
     LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
-    LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()

     # RFVol acft
     # Create File (RF_Volume)
@@ -581,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
     LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
     LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+    # Create File (N_OP) (L001, L005, L008)
+    n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+    n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+    n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+    n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+    # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+    s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+    s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+    s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+    s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)

     # Interpolated NH4 Observations in Lake
     # Create File (LO_Avg_NH4)
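Both new files follow the same recipe: select a station subset of the interpolated OP table, average across the stations row-wise, and convert mg/L to mg/m³ by multiplying by 1,000. A hedged sketch of that recipe as a reusable helper; the function name and example column names are illustrative, not part of the package:

```python
import pandas as pd

def regional_op(df: pd.DataFrame, station_cols: list) -> pd.DataFrame:
    """Row-wise mean of the given station columns, converted from mg/L to mg/m3."""
    out = df[["date"] + station_cols].copy()
    out["OP"] = out[station_cols].mean(axis=1) * 1000  # 1 mg/L = 1000 mg/m3
    return out.drop(columns=station_cols)

# Hypothetical usage mirroring the N_OP case:
# n_op = regional_op(LO_OP_data_Inter,
#                    ["Data_L001_OP_Inter", "Data_L005_OP_Inter", "Data_L008_OP_Inter"])
# n_op.to_csv(f"{output_dir}/N_OP.csv", index=False)
```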
@@ -663,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
     LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
     LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+    # Create File (N_DIN) (L001, L005, L008)
+    n_din = pd.DataFrame(date_DIN, columns=['date'])
+    n_din.set_index('date', inplace=True)
+    n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+    n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+    n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+    n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+    # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+    s_din = pd.DataFrame(date_DIN, columns=['date'])
+    s_din.set_index('date', inplace=True)
+    s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+    s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+    s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+    s_din.to_csv(f'{output_dir}/S_DIN.csv')

     # Interpolated DO Observations in Lake
     # Create File (LO_Avg_DO)
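The DIN files combine the station-averaged NH4 and NO series for each station group; dissolved inorganic nitrogen is simply their sum, with a factor of 1,000 applied when converting mg/L to mg/m³. A toy check of that arithmetic, with invented values:

```python
import pandas as pd

# Invented concentrations in mg/L.
din = pd.DataFrame({"NH4": [0.02, 0.03], "NO": [0.10, 0.12]})

# DIN = NH4 + NO, converted to mg/m3 (1 mg/L = 1000 mg/m3).
din["DIN_mg_m3"] = din[["NH4", "NO"]].sum(axis=1) * 1000
print(din)  # DIN_mg_m3 -> 120.0 and 150.0
```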
@@ -822,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
     LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')

+    # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+    L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+    L001_Chla.drop(columns=['days'], inplace=True)
+    L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+    L004_Chla.drop(columns=['days'], inplace=True)
+    L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+    L005_Chla.drop(columns=['days'], inplace=True)
+    L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+    L006_Chla.drop(columns=['days'], inplace=True)
+    L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+    L007_Chla.drop(columns=['days'], inplace=True)
+    L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+    L008_Chla.drop(columns=['days'], inplace=True)
+    LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+    LZ40_Chla.drop(columns=['days'], inplace=True)
+    L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+    L001_Chla_LC.drop(columns=['days'], inplace=True)
+    L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+    L004_Chla_LC.drop(columns=['days'], inplace=True)
+    L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+    L005_Chla_LC.drop(columns=['days'], inplace=True)
+    L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+    L006_Chla_LC.drop(columns=['days'], inplace=True)
+    L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+    L007_Chla_LC.drop(columns=['days'], inplace=True)
+    L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+    L008_Chla_LC.drop(columns=['days'], inplace=True)
+    LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+    LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+    LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = LO_Chla.set_index('date')
+    LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+    LO_Chla = LO_Chla.reset_index()
+    LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+    LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L','L008_CHLOROPHYLL-A, CORRECTED_ug/L','LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+    LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = LO_Chla_LC.set_index('date')
+    LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+    LO_Chla_LC = LO_Chla_LC.reset_index()
+    LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+    LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L','L008_CHLOROPHYLL-A(LC)_ug/L','LZ40_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+    LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+    LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+    LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+    LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+    LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+    LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+    LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+    LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+    LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+    LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+    LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+    LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+    LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+    LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
     # Create Files S65E_Avg_Chla
     S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
     S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
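The chlorophyll-a block above repeats the same read, drop('days'), left-merge on 'date', and strip-'Unnamed' sequence for seven stations and two parameters. A condensed sketch of that pattern using a loop; the station list and file-name template mirror the diff, while the helper functions themselves are illustrative rather than package code:

```python
from functools import reduce
import pandas as pd

STATIONS = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]

def read_station(input_dir: str, station: str, parameter: str) -> pd.DataFrame:
    """Read one station's water-quality file and drop helper columns."""
    df = pd.read_csv(f"{input_dir}/water_quality_{station}_{parameter}.csv")
    df = df.drop(columns=["days"], errors="ignore")
    return df.loc[:, ~df.columns.str.startswith("Unnamed")]

def merge_stations(input_dir: str, parameter: str) -> pd.DataFrame:
    """Left-merge all station files on 'date', as the diff does pairwise."""
    frames = [read_station(input_dir, s, parameter) for s in STATIONS]
    return reduce(lambda left, right: pd.merge(left, right, how="left", on="date"), frames)

# e.g. lo_chla = merge_stations(input_dir, "CHLOROPHYLL-A, CORRECTED")
```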
@@ -897,7 +1011,6 @@ def main(input_dir: str, output_dir: str) -> None:
     # Write Data into csv files
     # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
     LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
-    LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
     # Write S65 TP concentrations (mg/L)
     S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
     # TP External Loads 3 Months Lag (mg)
loone_data_prep/flow_data/S65E_total.py

@@ -2,22 +2,87 @@ import sys
 from retry import retry
 from rpy2.robjects import r
 from rpy2.rinterface_lib.embedded import RRuntimeError
+import pandas as pd


 @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
-def get(
+def get(
+    workspace,
+    date_min: str = "1972-01-01",
+    date_max: str = "2023-06-30"
+) -> None:
     r(
         f"""
         # Load the required libraries
         library(dbhydroR)
-
-
-
-
-
+        library(dplyr)
+
+        # Helper Functions
+        retrieve_data <- function(dbkey, date_min, date_max)
+        {{
+            # Get the data from dbhydro
+            df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+            # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+            colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+            # Add a type and units column to data so it can be cleaned using the clean_hydro function
+            df$type <- "FLOW"
+            df$units <- "cfs"
+
+            # Clean the data.frame
+            df <- clean_hydro(df)
+
+            # Drop the " _FLOW_cfs" column
+            df <- df %>% select(-` _FLOW_cfs`)
+
+            # Convert Flow rate from cfs to m³/day
+            df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+            # Return resulting data.frame
+            return(df)
+        }}
+
+        # S65E_S
+        S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+        # Wait five seconds before next request to avoid "too many requests" error
+        Sys.sleep(5)
+
+        # S65EX1_S
+        S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+        # Merge the data from each dbkey
+        result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+        # Write the data to a file
+        write.csv(result, file = '{workspace}/S65E_total.csv')
         """
     )
+
+    _reformat_s65e_total_file(workspace)

+def _reformat_s65e_total_file(workspace: str):
+    # Read in the data
+    df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+    # Drop unused columns
+    df.drop('Unnamed: 0', axis=1, inplace=True)
+
+    # Convert date column to datetime
+    df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+    # Sort the data by date
+    df.sort_values('date', inplace=True)
+
+    # Renumber the index
+    df.reset_index(drop=True, inplace=True)
+
+    # Drop rows that are missing all their values
+    df.dropna(how='all', inplace=True)
+
+    # Write the updated data back to the file
+    df.to_csv(f"{workspace}/S65E_total.csv")

 if __name__ == "__main__":
     workspace = sys.argv[1].rstrip("/")
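The new R helper converts flows from cfs to m³/day with the factor 0.0283168466 m³ per ft³ times 86,400 s per day, roughly 2,446.6 m³/day per cfs. The same conversion expressed in Python, as a quick sanity check rather than package code:

```python
# Cubic feet per second -> cubic metres per day.
CFS_TO_M3_PER_DAY = 0.0283168466 * 86400  # ~2446.58

def cfs_to_m3_per_day(flow_cfs: float) -> float:
    return flow_cfs * CFS_TO_M3_PER_DAY

print(cfs_to_m3_per_day(100.0))  # ~244658 m3/day
```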
loone_data_prep/flow_data/forecast_bias_correction.py

@@ -1,7 +1,10 @@
 import sys
 import os
+import math
+import numpy as np
 import pandas as pd
 import geoglows
+from scipy import interpolate


 SECONDS_IN_DAY = 86400
@@ -37,29 +40,42 @@ def get_bias_corrected_data(

     # Get the historical simulation data for the given reach ID
     historical_data = None
-
+
     if cache_path is None:
         historical_data = geoglows.streamflow.historic_simulation(reach_id)
     else:
         # Create the geoglows cache directory if it doesn't exist
-        geoglows_cache_path = os.path.join(cache_path,
+        geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
         if not os.path.exists(geoglows_cache_path):
             os.makedirs(geoglows_cache_path)
-
+
         # Check if the historical simulation data is already cached
-        if os.path.exists(
-
+        if os.path.exists(
+            os.path.join(
+                geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+            )
+        ):
+            historical_data = pd.read_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                ),
+                index_col=0,
+            )
             historical_data.index = pd.to_datetime(historical_data.index)
         else:
             historical_data = geoglows.streamflow.historic_simulation(reach_id)
-            historical_data.to_csv(
+            historical_data.to_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                )
+            )

     # Correct the forecast bias in the station ensembles
-    station_ensembles =
+    station_ensembles = bias_correct_forecast(
         station_ensembles, historical_data, prepared_od
     )
     # Correct the forecast bias in the station stats
-    station_stats =
+    station_stats = bias_correct_forecast(
         station_stats, historical_data, prepared_od
     )

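The reworked else-branch caches the GEOGloWS historic simulation under `geoglows_cache/<reach_id>_historic_simulation.csv` and only calls the API when no cached copy exists. A stripped-down sketch of that cache-or-fetch pattern; `fetch` here is a stand-in for whatever function retrieves the data, not the geoglows client itself:

```python
import os
import pandas as pd

def load_or_fetch(cache_dir: str, reach_id: int, fetch) -> pd.DataFrame:
    """Return the cached historic simulation if present, otherwise fetch and cache it."""
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, f"{reach_id}_historic_simulation.csv")
    if os.path.exists(path):
        df = pd.read_csv(path, index_col=0)
        df.index = pd.to_datetime(df.index)  # restore the datetime index
        return df
    df = fetch(reach_id)  # stand-in for the API call
    df.to_csv(path)
    return df
```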
@@ -92,6 +108,175 @@ def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
     return observed_data


+def bias_correct_historical(
+    simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+    simulation on a monthly basis.
+
+    Args:
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+    Returns:
+        pandas DataFrame with a datetime index and a single column of streamflow values
+    """
+    # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+    unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+    dates = []
+    values = []
+
+    for month in unique_simulation_months:
+        # filter historic data to only be current month
+        monthly_simulated = simulated_data[
+            simulated_data.index.month == int(month)
+        ].dropna()
+        to_prob = _flow_and_probability_mapper(
+            monthly_simulated, to_probability=True
+        )
+        # filter the observations to current month
+        monthly_observed = observed_data[
+            observed_data.index.month == int(month)
+        ].dropna()
+        to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+        dates += monthly_simulated.index.to_list()
+        value = to_flow(to_prob(monthly_simulated.values))
+        values += value.tolist()
+
+    corrected = pd.DataFrame(
+        data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+    )
+    corrected.sort_index(inplace=True)
+    return corrected
+
+
+def bias_correct_forecast(
+    forecast_data: pd.DataFrame,
+    simulated_data: pd.DataFrame,
+    observed_data: pd.DataFrame,
+    use_month: int = 0,
+) -> pd.DataFrame:
+    """
+    Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+    to correct biases in the forecasted data
+
+    Args:
+        forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+            forecast_stats, forecast_ensembles, forecast_records
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+        use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+            you want to correct based on the ending month of the forecast data
+
+    Returns:
+        pandas DataFrame with a copy of forecasted data with values updated in each column
+    """
+    # make a copy of the forecasts which we update and return so the original data is not changed
+    forecast_copy = forecast_data.copy()
+
+    # make the flow and probability interpolation functions
+    monthly_simulated = simulated_data[
+        simulated_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    monthly_observed = observed_data[
+        observed_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    to_prob = _flow_and_probability_mapper(
+        monthly_simulated, to_probability=True, extrapolate=True
+    )
+    to_flow = _flow_and_probability_mapper(
+        monthly_observed, to_flow=True, extrapolate=True
+    )
+
+    # for each column of forecast data, make the interpolation function and update the dataframe
+    for column in forecast_copy.columns:
+        tmp = forecast_copy[column].dropna()
+        forecast_copy.update(
+            pd.DataFrame(
+                to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+            )
+        )
+
+    return forecast_copy
+
+
+def _flow_and_probability_mapper(
+    monthly_data: pd.DataFrame,
+    to_probability: bool = False,
+    to_flow: bool = False,
+    extrapolate: bool = False,
+) -> interpolate.interp1d:
+    if not to_flow and not to_probability:
+        raise ValueError(
+            "You need to specify either to_probability or to_flow as True"
+        )
+
+    # get maximum value to bound histogram
+    max_val = math.ceil(np.max(monthly_data.max()))
+    min_val = math.floor(np.min(monthly_data.min()))
+
+    if max_val == min_val:
+        max_val += 0.1
+
+    # determine number of histograms bins needed
+    number_of_points = len(monthly_data.values)
+    number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+    # specify the bin width for histogram (in m3/s)
+    step_width = (max_val - min_val) / number_of_classes
+
+    # specify histogram bins
+    bins = np.arange(
+        -np.min(step_width),
+        max_val + 2 * np.min(step_width),
+        np.min(step_width),
+    )
+
+    if bins[0] == 0:
+        bins = np.concatenate((-bins[1], bins))
+    elif bins[0] > 0:
+        bins = np.concatenate((-bins[0], bins))
+
+    # make the histogram
+    counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+    # adjust the bins to be the center
+    bin_edges = bin_edges[1:]
+
+    # normalize the histograms
+    counts = counts.astype(float) / monthly_data.size
+
+    # calculate the cdfs
+    cdf = np.cumsum(counts)
+
+    # Identify indices where consecutive values are the same
+    duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+    # Adjust duplicate value to be an extrapolation of the previous value
+    for idx in duplicate_indices:
+        if idx > 0:
+            cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+    # interpolated function to convert simulated streamflow to prob
+    if to_probability:
+        if extrapolate:
+            func = interpolate.interp1d(
+                bin_edges, cdf, fill_value="extrapolate"
+            )
+        else:
+            func = interpolate.interp1d(bin_edges, cdf)
+        return lambda x: np.clip(func(x), 0, 1)
+    # interpolated function to convert simulated prob to observed streamflow
+    elif to_flow:
+        if extrapolate:
+            return interpolate.interp1d(
+                cdf, bin_edges, fill_value="extrapolate"
+            )
+        return interpolate.interp1d(cdf, bin_edges)
+
+
 if __name__ == "__main__":
     station_id = sys.argv[1]
     reach_id = sys.argv[2]