loone-data-prep 0.1.7.tar.gz → 0.1.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/PKG-INFO +2 -1
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +2 -1
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/LOONE_DATA_PREP.py +115 -0
- loone_data_prep-0.1.8/loone_data_prep/flow_data/forecast_bias_correction.py +293 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/utils.py +286 -78
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/PKG-INFO +2 -1
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/requires.txt +1 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/pyproject.toml +10 -5
- loone_data_prep-0.1.7/loone_data_prep/flow_data/forecast_bias_correction.py +0 -108
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/LICENSE +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/README.md +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/__init__.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/data_analyses_fns.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/S65E_total.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/__init__.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_forecast_flows.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_inflows.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_outflows.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/hydro.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/__init__.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/get_all.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/hydro.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/__init__.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/get_inflows.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/get_lake_wq.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/wq.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/__init__.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/get_all.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/weather.py +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/SOURCES.txt +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/dependency_links.txt +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/top_level.txt +0 -0
- {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: loone_data_prep
-Version: 0.1.7
+Version: 0.1.8
 Summary: Prepare data to run the LOONE model.
 Author-email: Osama Tarabih <osamatarabih@usf.edu>
 Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: rpy2
 Requires-Dist: retry
+Requires-Dist: numpy<2
 Requires-Dist: pandas
 Requires-Dist: scipy
 Requires-Dist: geoglows==0.27.1
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None:  # , hi
     LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
     # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
     LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+    LO_Stage.index = LO_Stage["date"]
     # Calculate average
     if "Average_Stage" not in LO_Stage.columns:
         LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
-        LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+        LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
     LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
     LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
     LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
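
A minimal sketch (not from the package) of why 0.1.8 drops the non-numeric "date" column before the row-wise mean; the gauge column names here are hypothetical:

import pandas as pd

# Toy stand-in for LO_Stage after DF_Date_Range: a string "date" column plus
# numeric stage columns (hypothetical gauge names).
stage = pd.DataFrame({
    "date": ["2008-01-01", "2008-01-02"],
    "Gauge_A": [10.0, 11.0],
    "Gauge_B": [12.0, 13.0],
})

# On recent pandas releases, mean(axis=1) no longer silently skips non-numeric
# columns, so averaging with "date" still present fails; dropping it first,
# as the new code does, keeps the row-wise mean purely numeric.
stage["Average_Stage"] = stage.drop(columns=["date"]).mean(axis=1)
print(stage["Average_Stage"].tolist())  # [11.0, 12.0]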
@@ -580,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
     LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
     LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+    # Create File (N_OP) (L001, L005, L008)
+    n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+    n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+    n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+    n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+    # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+    s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+    s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+    s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+    s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)
 
     # Interpolated NH4 Observations in Lake
     # Create File (LO_Avg_NH4)
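
The N_OP/S_OP additions above follow one pattern: subset the station columns, average them, convert mg/L to mg/m3, and write a two-column date/value CSV. A hypothetical helper, not part of the release, that expresses the same steps:

import pandas as pd

def write_regional_mean_op(df: pd.DataFrame, stations: list, out_path: str) -> None:
    # Column names follow the Data_<station>_OP_Inter convention used above.
    cols = [f"Data_{s}_OP_Inter" for s in stations]
    out = df[["date"] + cols].copy()           # copy() avoids pandas chained-assignment warnings
    out["OP"] = out[cols].mean(axis=1) * 1000  # mg/L to mg/m3
    out.drop(columns=cols, inplace=True)
    out.to_csv(out_path, index=False)

# e.g., inside main(): write_regional_mean_op(LO_OP_data_Inter, ["L001", "L005", "L008"], f"{output_dir}/N_OP.csv")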
@@ -662,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
     LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
     LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+    # Create File (N_DIN) (L001, L005, L008)
+    n_din = pd.DataFrame(date_DIN, columns=['date'])
+    n_din.set_index('date', inplace=True)
+    n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+    n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+    n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+    n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+    # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+    s_din = pd.DataFrame(date_DIN, columns=['date'])
+    s_din.set_index('date', inplace=True)
+    s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+    s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+    s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+    s_din.to_csv(f'{output_dir}/S_DIN.csv')
 
     # Interpolated DO Observations in Lake
     # Create File (LO_Avg_DO)
@@ -821,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
     LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')
 
+    # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+    L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+    L001_Chla.drop(columns=['days'], inplace=True)
+    L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+    L004_Chla.drop(columns=['days'], inplace=True)
+    L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+    L005_Chla.drop(columns=['days'], inplace=True)
+    L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+    L006_Chla.drop(columns=['days'], inplace=True)
+    L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+    L007_Chla.drop(columns=['days'], inplace=True)
+    L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+    L008_Chla.drop(columns=['days'], inplace=True)
+    LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+    LZ40_Chla.drop(columns=['days'], inplace=True)
+    L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+    L001_Chla_LC.drop(columns=['days'], inplace=True)
+    L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+    L004_Chla_LC.drop(columns=['days'], inplace=True)
+    L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+    L005_Chla_LC.drop(columns=['days'], inplace=True)
+    L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+    L006_Chla_LC.drop(columns=['days'], inplace=True)
+    L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+    L007_Chla_LC.drop(columns=['days'], inplace=True)
+    L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+    L008_Chla_LC.drop(columns=['days'], inplace=True)
+    LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+    LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+    LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = LO_Chla.set_index('date')
+    LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+    LO_Chla = LO_Chla.reset_index()
+    LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+    LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L','L008_CHLOROPHYLL-A, CORRECTED_ug/L','LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+    LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = LO_Chla_LC.set_index('date')
+    LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+    LO_Chla_LC = LO_Chla_LC.reset_index()
+    LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+    LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L','L008_CHLOROPHYLL-A(LC)_ug/L','LZ40_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+    LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+    LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+    LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+    LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+    LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+    LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+    LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+    LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+    LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+    LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+    LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+    LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+    LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+    LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
     # Create Files S65E_Avg_Chla
     S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
     S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
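
The per-station read, drop of 'days', left-merge on 'date', and Unnamed-column filter in the hunk above all repeat one pattern. A more compact hypothetical sketch of that same pattern (not part of the release; input_dir is a placeholder standing in for main()'s argument):

from functools import reduce
import pandas as pd

input_dir = "."  # placeholder; the real code uses main()'s input_dir
stations = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]

def read_chla(station: str) -> pd.DataFrame:
    # File naming matches the water_quality_<station>_CHLOROPHYLL-A, CORRECTED.csv inputs above.
    df = pd.read_csv(f"{input_dir}/water_quality_{station}_CHLOROPHYLL-A, CORRECTED.csv")
    return df.drop(columns=["days"])

frames = [read_chla(s) for s in stations]
lo_chla = reduce(lambda left, right: pd.merge(left, right, how="left", on="date"), frames)
lo_chla = lo_chla.loc[:, ~lo_chla.columns.str.startswith("Unnamed")]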
@@ -0,0 +1,293 @@
+import sys
+import os
+import math
+import numpy as np
+import pandas as pd
+import geoglows
+from scipy import interpolate
+
+
+SECONDS_IN_DAY = 86400
+
+
+def get_bias_corrected_data(
+    station_id: str,
+    reach_id: str,
+    observed_data_path: str,
+    station_ensembles: pd.DataFrame,
+    station_stats: pd.DataFrame,
+    cache_path: str = None,
+) -> dict:
+    # Load the observed data from a CSV file
+    observed_data = pd.read_csv(
+        observed_data_path,
+        index_col=0,
+        usecols=["date", f"{station_id}_FLOW_cmd"],
+    )
+    # Convert the index to datetime and localize it to UTC
+    observed_data.index = pd.to_datetime(observed_data.index).tz_localize(
+        "UTC"
+    )
+    # Transform the data by dividing it by the number of seconds in a day
+    observed_data = observed_data.transform(lambda x: x / SECONDS_IN_DAY)
+    # Rename the value column to "Streamflow (m3/s)"
+    observed_data.rename(
+        columns={f"{station_id}_FLOW_cmd": "Streamflow (m3/s)"}, inplace=True
+    )
+
+    # Prepare the observed data by filling NaN values with the 10yr average
+    prepared_od = prep_observed_data(observed_data)
+
+    # Get the historical simulation data for the given reach ID
+    historical_data = None
+
+    if cache_path is None:
+        historical_data = geoglows.streamflow.historic_simulation(reach_id)
+    else:
+        # Create the geoglows cache directory if it doesn't exist
+        geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+        if not os.path.exists(geoglows_cache_path):
+            os.makedirs(geoglows_cache_path)
+
+        # Check if the historical simulation data is already cached
+        if os.path.exists(
+            os.path.join(
+                geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+            )
+        ):
+            historical_data = pd.read_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                ),
+                index_col=0,
+            )
+            historical_data.index = pd.to_datetime(historical_data.index)
+        else:
+            historical_data = geoglows.streamflow.historic_simulation(reach_id)
+            historical_data.to_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                )
+            )
+
+    # Correct the forecast bias in the station ensembles
+    station_ensembles = bias_correct_forecast(
+        station_ensembles, historical_data, prepared_od
+    )
+    # Correct the forecast bias in the station stats
+    station_stats = bias_correct_forecast(
+        station_stats, historical_data, prepared_od
+    )
+
+    # Return the bias-corrected station ensembles and station stats
+    return station_ensembles, station_stats
+
+
+def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
+    # Group the data by month and day
+    grouped_data = observed_data.groupby(
+        [observed_data.index.month, observed_data.index.day]
+    )
+
+    # Calculate the rolling average of 'Streamflow (m3/s)' for each group
+    daily_10yr_avg = (
+        grouped_data["Streamflow (m3/s)"]
+        .rolling(window=10, min_periods=1, center=True)
+        .mean()
+    )
+
+    # Reset the multi-index of daily_10yr_avg and sort it by index
+    fill_val = daily_10yr_avg.reset_index(level=[0, 1], drop=True).sort_index()
+
+    # Fill NaN in 'Streamflow (m3/s)' with corresponding values from fill_val
+    observed_data["Streamflow (m3/s)"] = observed_data[
+        "Streamflow (m3/s)"
+    ].fillna(fill_val)
+
+    # Return the modified observed_data DataFrame
+    return observed_data
+
+
+def bias_correct_historical(
+    simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+    simulation on a monthly basis.
+
+    Args:
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+    Returns:
+        pandas DataFrame with a datetime index and a single column of streamflow values
+    """
+    # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+    unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+    dates = []
+    values = []
+
+    for month in unique_simulation_months:
+        # filter historic data to only be current month
+        monthly_simulated = simulated_data[
+            simulated_data.index.month == int(month)
+        ].dropna()
+        to_prob = _flow_and_probability_mapper(
+            monthly_simulated, to_probability=True
+        )
+        # filter the observations to current month
+        monthly_observed = observed_data[
+            observed_data.index.month == int(month)
+        ].dropna()
+        to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+        dates += monthly_simulated.index.to_list()
+        value = to_flow(to_prob(monthly_simulated.values))
+        values += value.tolist()
+
+    corrected = pd.DataFrame(
+        data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+    )
+    corrected.sort_index(inplace=True)
+    return corrected
+
+
+def bias_correct_forecast(
+    forecast_data: pd.DataFrame,
+    simulated_data: pd.DataFrame,
+    observed_data: pd.DataFrame,
+    use_month: int = 0,
+) -> pd.DataFrame:
+    """
+    Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+    to correct biases in the forecasted data
+
+    Args:
+        forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+            forecast_stats, forecast_ensembles, forecast_records
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+        use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+            you want to correct based on the ending month of the forecast data
+
+    Returns:
+        pandas DataFrame with a copy of forecasted data with values updated in each column
+    """
+    # make a copy of the forecasts which we update and return so the original data is not changed
+    forecast_copy = forecast_data.copy()
+
+    # make the flow and probability interpolation functions
+    monthly_simulated = simulated_data[
+        simulated_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    monthly_observed = observed_data[
+        observed_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    to_prob = _flow_and_probability_mapper(
+        monthly_simulated, to_probability=True, extrapolate=True
+    )
+    to_flow = _flow_and_probability_mapper(
+        monthly_observed, to_flow=True, extrapolate=True
+    )
+
+    # for each column of forecast data, make the interpolation function and update the dataframe
+    for column in forecast_copy.columns:
+        tmp = forecast_copy[column].dropna()
+        forecast_copy.update(
+            pd.DataFrame(
+                to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+            )
+        )
+
+    return forecast_copy
+
+
+def _flow_and_probability_mapper(
+    monthly_data: pd.DataFrame,
+    to_probability: bool = False,
+    to_flow: bool = False,
+    extrapolate: bool = False,
+) -> interpolate.interp1d:
+    if not to_flow and not to_probability:
+        raise ValueError(
+            "You need to specify either to_probability or to_flow as True"
+        )
+
+    # get maximum value to bound histogram
+    max_val = math.ceil(np.max(monthly_data.max()))
+    min_val = math.floor(np.min(monthly_data.min()))
+
+    if max_val == min_val:
+        max_val += 0.1
+
+    # determine number of histograms bins needed
+    number_of_points = len(monthly_data.values)
+    number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+    # specify the bin width for histogram (in m3/s)
+    step_width = (max_val - min_val) / number_of_classes
+
+    # specify histogram bins
+    bins = np.arange(
+        -np.min(step_width),
+        max_val + 2 * np.min(step_width),
+        np.min(step_width),
+    )
+
+    if bins[0] == 0:
+        bins = np.concatenate((-bins[1], bins))
+    elif bins[0] > 0:
+        bins = np.concatenate((-bins[0], bins))
+
+    # make the histogram
+    counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+    # adjust the bins to be the center
+    bin_edges = bin_edges[1:]
+
+    # normalize the histograms
+    counts = counts.astype(float) / monthly_data.size
+
+    # calculate the cdfs
+    cdf = np.cumsum(counts)
+
+    # Identify indices where consecutive values are the same
+    duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+    # Adjust duplicate value to be an extrapolation of the previous value
+    for idx in duplicate_indices:
+        if idx > 0:
+            cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+    # interpolated function to convert simulated streamflow to prob
+    if to_probability:
+        if extrapolate:
+            func = interpolate.interp1d(
+                bin_edges, cdf, fill_value="extrapolate"
+            )
+        else:
+            func = interpolate.interp1d(bin_edges, cdf)
+        return lambda x: np.clip(func(x), 0, 1)
+    # interpolated function to convert simulated prob to observed streamflow
+    elif to_flow:
+        if extrapolate:
+            return interpolate.interp1d(
+                cdf, bin_edges, fill_value="extrapolate"
+            )
+        return interpolate.interp1d(cdf, bin_edges)
+
+
+if __name__ == "__main__":
+    station_id = sys.argv[1]
+    reach_id = sys.argv[2]
+    observed_data_path = sys.argv[3].rstrip("/")
+    station_ensembles = sys.argv[4]
+    station_stats = sys.argv[5]
+
+    get_bias_corrected_data(
+        station_id,
+        reach_id,
+        observed_data_path,
+        station_ensembles,
+        station_stats,
+    )
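
A hedged usage sketch for the rewritten module (not from the package): the reach and station IDs and the CSV path are placeholders, and the forecast calls assume the geoglows 0.x streamflow API that the module itself imports.

import geoglows
from loone_data_prep.flow_data.forecast_bias_correction import get_bias_corrected_data

reach_id = 12345678   # placeholder GEOGloWS reach ID
station_id = "S65E"   # placeholder; the observed CSV must contain a f"{station_id}_FLOW_cmd" column
ensembles = geoglows.streamflow.forecast_ensembles(reach_id)
stats = geoglows.streamflow.forecast_stats(reach_id)

# The function returns the two bias-corrected frames as a tuple
# (despite the dict annotation in the signature above).
corrected_ensembles, corrected_stats = get_bias_corrected_data(
    station_id,
    reach_id,
    "/path/to/observed_flows.csv",  # placeholder path to the observed-flow CSV
    ensembles,
    stats,
    cache_path="/tmp",              # optional: caches the historic simulation CSV
)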