loone-data-prep 0.1.6__tar.gz → 0.1.8__tar.gz

This diff compares the contents of two publicly released versions of the package as published to their public registry; it is provided for informational purposes only.
Files changed (43)
  1. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/PKG-INFO +2 -1
  2. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +2 -1
  3. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/LOONE_DATA_PREP.py +115 -2
  4. loone_data_prep-0.1.8/loone_data_prep/flow_data/S65E_total.py +89 -0
  5. loone_data_prep-0.1.8/loone_data_prep/flow_data/forecast_bias_correction.py +293 -0
  6. loone_data_prep-0.1.8/loone_data_prep/flow_data/get_inflows.py +159 -0
  7. loone_data_prep-0.1.8/loone_data_prep/flow_data/get_outflows.py +164 -0
  8. loone_data_prep-0.1.8/loone_data_prep/flow_data/hydro.py +155 -0
  9. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/utils.py +339 -62
  10. loone_data_prep-0.1.8/loone_data_prep/water_level_data/get_all.py +232 -0
  11. loone_data_prep-0.1.8/loone_data_prep/water_level_data/hydro.py +114 -0
  12. loone_data_prep-0.1.8/loone_data_prep/water_quality_data/get_inflows.py +127 -0
  13. loone_data_prep-0.1.8/loone_data_prep/water_quality_data/get_lake_wq.py +129 -0
  14. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/wq.py +44 -0
  15. loone_data_prep-0.1.8/loone_data_prep/weather_data/get_all.py +157 -0
  16. loone_data_prep-0.1.8/loone_data_prep/weather_data/weather.py +280 -0
  17. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/PKG-INFO +2 -1
  18. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/requires.txt +1 -0
  19. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/pyproject.toml +10 -5
  20. loone_data_prep-0.1.6/loone_data_prep/flow_data/S65E_total.py +0 -24
  21. loone_data_prep-0.1.6/loone_data_prep/flow_data/forecast_bias_correction.py +0 -108
  22. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_inflows.py +0 -70
  23. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_outflows.py +0 -80
  24. loone_data_prep-0.1.6/loone_data_prep/flow_data/hydro.py +0 -61
  25. loone_data_prep-0.1.6/loone_data_prep/water_level_data/get_all.py +0 -35
  26. loone_data_prep-0.1.6/loone_data_prep/water_level_data/hydro.py +0 -46
  27. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_inflows.py +0 -42
  28. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_lake_wq.py +0 -47
  29. loone_data_prep-0.1.6/loone_data_prep/weather_data/get_all.py +0 -34
  30. loone_data_prep-0.1.6/loone_data_prep/weather_data/weather.py +0 -122
  31. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/LICENSE +0 -0
  32. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/README.md +0 -0
  33. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/__init__.py +0 -0
  34. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/data_analyses_fns.py +0 -0
  35. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/__init__.py +0 -0
  36. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_forecast_flows.py +0 -0
  37. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/__init__.py +0 -0
  38. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/__init__.py +0 -0
  39. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/__init__.py +0 -0
  40. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/SOURCES.txt +0 -0
  41. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/dependency_links.txt +0 -0
  42. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/top_level.txt +0 -0
  43. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: loone_data_prep
- Version: 0.1.6
+ Version: 0.1.8
  Summary: Prepare data to run the LOONE model.
  Author-email: Osama Tarabih <osamatarabih@usf.edu>
  Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: rpy2
  Requires-Dist: retry
+ Requires-Dist: numpy<2
  Requires-Dist: pandas
  Requires-Dist: scipy
  Requires-Dist: geoglows==0.27.1
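
Note on the new `numpy<2` requirement: it constrains installs to the NumPy 1.x line, presumably because this release has not been validated against NumPy 2.0's API changes. A quick, hypothetical sanity check for a resolved environment:

    import numpy

    # loone_data_prep 0.1.8 declares numpy<2, so a correctly resolved
    # environment should report a 1.x version here.
    assert numpy.__version__.startswith("1."), numpy.__version__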
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
  LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
  # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
  LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+ LO_Stage.index = LO_Stage["date"]
  # Calculate average
  if "Average_Stage" not in LO_Stage.columns:
      LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
-     LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+     LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
  LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
  LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
  LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
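
Why the stage fix above matters: once `date` is kept as a column, a bare `DataFrame.mean(axis=1)` tries to average it too, which fails with a TypeError on pandas >= 2.0 (older pandas silently skipped non-numeric columns). A minimal sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({
        "date": ["2023-01-01", "2023-01-02"],  # non-numeric column
        "S1": [12.1, 12.3],
        "S2": [12.0, 12.4],
    })

    # df.mean(axis=1) would choke on the string 'date' column;
    # dropping it first keeps the row-wise mean purely numeric.
    avg = df.drop(columns=["date"]).mean(axis=1)  # [12.05, 12.35]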
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
  LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
  LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
  LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
- LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()

  # RFVol acft
  # Create File (RF_Volume)
@@ -581,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
  LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
  LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+ # Create File (N_OP) (L001, L005, L008)
+ n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+ n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+ n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+ n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+ # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+ s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+ s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+ s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+ s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)

  # Interpolated NH4 Observations in Lake
  # Create File (LO_Avg_NH4)
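
A unit note on the N_OP/S_OP additions above: the `* 1000` factor converts mg/L to mg/m3, since 1 m3 = 1000 L. Also, because `n_op` and `s_op` start as column slices of `LO_OP_data_Inter`, assigning new columns to them can trigger pandas' SettingWithCopyWarning; a `.copy()` on the slice avoids that. A hypothetical equivalent of the northern block:

    import pandas as pd

    LO_OP_data_Inter = pd.DataFrame({
        "date": ["2023-01-01"],
        "Data_L001_OP_Inter": [0.05],  # mg/L
        "Data_L005_OP_Inter": [0.07],
        "Data_L008_OP_Inter": [0.06],
    })

    # .copy() makes the slice independent before new columns are added.
    n_op = LO_OP_data_Inter[["date", "Data_L001_OP_Inter",
                             "Data_L005_OP_Inter", "Data_L008_OP_Inter"]].copy()
    n_op["OP"] = n_op.mean(axis=1, numeric_only=True) * 1000  # 0.06 mg/L -> 60.0 mg/m3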
@@ -663,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
  LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
  LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+ # Create File (N_DIN) (L001, L005, L008)
+ n_din = pd.DataFrame(date_DIN, columns=['date'])
+ n_din.set_index('date', inplace=True)
+ n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+ n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+ n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+ n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+ # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+ s_din = pd.DataFrame(date_DIN, columns=['date'])
+ s_din.set_index('date', inplace=True)
+ s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+ s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+ s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+ s_din.to_csv(f'{output_dir}/S_DIN.csv')

  # Interpolated DO Observations in Lake
  # Create File (LO_Avg_DO)
@@ -822,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
  LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')

+ # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+ L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+ L001_Chla.drop(columns=['days'], inplace=True)
+ L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+ L004_Chla.drop(columns=['days'], inplace=True)
+ L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+ L005_Chla.drop(columns=['days'], inplace=True)
+ L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+ L006_Chla.drop(columns=['days'], inplace=True)
+ L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+ L007_Chla.drop(columns=['days'], inplace=True)
+ L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+ L008_Chla.drop(columns=['days'], inplace=True)
+ LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+ LZ40_Chla.drop(columns=['days'], inplace=True)
+ L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+ L001_Chla_LC.drop(columns=['days'], inplace=True)
+ L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+ L004_Chla_LC.drop(columns=['days'], inplace=True)
+ L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+ L005_Chla_LC.drop(columns=['days'], inplace=True)
+ L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+ L006_Chla_LC.drop(columns=['days'], inplace=True)
+ L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+ L007_Chla_LC.drop(columns=['days'], inplace=True)
+ L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+ L008_Chla_LC.drop(columns=['days'], inplace=True)
+ LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+ LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+ LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = LO_Chla.set_index('date')
+ LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+ LO_Chla = LO_Chla.reset_index()
+ LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+ LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L', 'LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+ LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = LO_Chla_LC.set_index('date')
+ LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+ LO_Chla_LC = LO_Chla_LC.reset_index()
+ LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+ LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L', 'LZ40_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+ LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+ LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+ LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+ LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+ LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+ LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+ LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+ LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+ LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+ LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+ LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+ LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+ LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+ LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
  # Create Files S65E_Avg_Chla
  S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
  S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
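
The station-merge pattern above repeats merge-then-strip-'Unnamed' seven times per dataset; for readers following along, the same result can be expressed once with a fold (a hypothetical refactor, not what the package ships):

    from functools import reduce
    import pandas as pd

    def merge_stations(frames):
        # Left-merge a list of per-station DataFrames on 'date',
        # then drop any 'Unnamed' index-artifact columns.
        merged = reduce(lambda a, b: pd.merge(a, b, how='left', on='date'), frames)
        return merged.loc[:, ~merged.columns.str.startswith('Unnamed')]

    # e.g. merge_stations([L001_Chla, L004_Chla, L005_Chla, L006_Chla,
    #                      L007_Chla, L008_Chla, LZ40_Chla])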
@@ -897,7 +1011,6 @@ def main(input_dir: str, output_dir: str) -> None:
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
- LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()
  # Write S65 TP concentrations (mg/L)
  S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
  # TP External Loads 3 Months Lag (mg)
@@ -0,0 +1,89 @@
+ import sys
+ from retry import retry
+ from rpy2.robjects import r
+ from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
+
+
+ @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+ def get(
+     workspace,
+     date_min: str = "1972-01-01",
+     date_max: str = "2023-06-30"
+ ) -> None:
+     r(
+         f"""
+         # Load the required libraries
+         library(dbhydroR)
+         library(dplyr)
+
+         # Helper Functions
+         retrieve_data <- function(dbkey, date_min, date_max)
+         {{
+             # Get the data from dbhydro
+             df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+             # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+             colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+             # Add a type and units column to data so it can be cleaned using the clean_hydro function
+             df$type <- "FLOW"
+             df$units <- "cfs"
+
+             # Clean the data.frame
+             df <- clean_hydro(df)
+
+             # Drop the " _FLOW_cfs" column
+             df <- df %>% select(-` _FLOW_cfs`)
+
+             # Convert Flow rate from cfs to m³/day
+             df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+             # Return resulting data.frame
+             return(df)
+         }}
+
+         # S65E_S
+         S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Wait five seconds before next request to avoid "too many requests" error
+         Sys.sleep(5)
+
+         # S65EX1_S
+         S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Merge the data from each dbkey
+         result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+         # Write the data to a file
+         write.csv(result, file = '{workspace}/S65E_total.csv')
+         """
+     )
+
+     _reformat_s65e_total_file(workspace)
+
+ def _reformat_s65e_total_file(workspace: str):
+     # Read in the data
+     df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+     # Drop unused columns
+     df.drop('Unnamed: 0', axis=1, inplace=True)
+
+     # Convert date column to datetime
+     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+     # Sort the data by date
+     df.sort_values('date', inplace=True)
+
+     # Renumber the index
+     df.reset_index(drop=True, inplace=True)
+
+     # Drop rows that are missing all their values
+     df.dropna(how='all', inplace=True)
+
+     # Write the updated data back to the file
+     df.to_csv(f"{workspace}/S65E_total.csv")
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     get(workspace)
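
For context, the rewritten S65E_total module can be driven from Python as well as the command line; a hypothetical invocation (it requires an R installation with the dbhydroR and dplyr packages available to rpy2):

    from loone_data_prep.flow_data import S65E_total

    # Pulls the S65E_S (dbkey 91656) and S65EX1_S (dbkey AL760) flows from
    # DBHYDRO, converts cfs to m3/day, and writes <workspace>/S65E_total.csv.
    S65E_total.get("/tmp/workspace", date_min="2020-01-01", date_max="2020-12-31")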
@@ -0,0 +1,293 @@
+ import sys
+ import os
+ import math
+ import numpy as np
+ import pandas as pd
+ import geoglows
+ from scipy import interpolate
+
+
+ SECONDS_IN_DAY = 86400
+
+
+ def get_bias_corrected_data(
+     station_id: str,
+     reach_id: str,
+     observed_data_path: str,
+     station_ensembles: pd.DataFrame,
+     station_stats: pd.DataFrame,
+     cache_path: str = None,
+ ) -> dict:
+     # Load the observed data from a CSV file
+     observed_data = pd.read_csv(
+         observed_data_path,
+         index_col=0,
+         usecols=["date", f"{station_id}_FLOW_cmd"],
+     )
+     # Convert the index to datetime and localize it to UTC
+     observed_data.index = pd.to_datetime(observed_data.index).tz_localize(
+         "UTC"
+     )
+     # Transform the data by dividing it by the number of seconds in a day
+     observed_data = observed_data.transform(lambda x: x / SECONDS_IN_DAY)
+     # Rename the value column to "Streamflow (m3/s)"
+     observed_data.rename(
+         columns={f"{station_id}_FLOW_cmd": "Streamflow (m3/s)"}, inplace=True
+     )
+
+     # Prepare the observed data by filling NaN values with the 10yr average
+     prepared_od = prep_observed_data(observed_data)
+
+     # Get the historical simulation data for the given reach ID
+     historical_data = None
+
+     if cache_path is None:
+         historical_data = geoglows.streamflow.historic_simulation(reach_id)
+     else:
+         # Create the geoglows cache directory if it doesn't exist
+         geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+         if not os.path.exists(geoglows_cache_path):
+             os.makedirs(geoglows_cache_path)
+
+         # Check if the historical simulation data is already cached
+         if os.path.exists(
+             os.path.join(
+                 geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+             )
+         ):
+             historical_data = pd.read_csv(
+                 os.path.join(
+                     geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                 ),
+                 index_col=0,
+             )
+             historical_data.index = pd.to_datetime(historical_data.index)
+         else:
+             historical_data = geoglows.streamflow.historic_simulation(reach_id)
+             historical_data.to_csv(
+                 os.path.join(
+                     geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                 )
+             )
+
+     # Correct the forecast bias in the station ensembles
+     station_ensembles = bias_correct_forecast(
+         station_ensembles, historical_data, prepared_od
+     )
+     # Correct the forecast bias in the station stats
+     station_stats = bias_correct_forecast(
+         station_stats, historical_data, prepared_od
+     )
+
+     # Return the bias-corrected station ensembles and station stats
+     return station_ensembles, station_stats
+
+
+ def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
+     # Group the data by month and day
+     grouped_data = observed_data.groupby(
+         [observed_data.index.month, observed_data.index.day]
+     )
+
+     # Calculate the rolling average of 'Streamflow (m3/s)' for each group
+     daily_10yr_avg = (
+         grouped_data["Streamflow (m3/s)"]
+         .rolling(window=10, min_periods=1, center=True)
+         .mean()
+     )
+
+     # Reset the multi-index of daily_10yr_avg and sort it by index
+     fill_val = daily_10yr_avg.reset_index(level=[0, 1], drop=True).sort_index()
+
+     # Fill NaN in 'Streamflow (m3/s)' with corresponding values from fill_val
+     observed_data["Streamflow (m3/s)"] = observed_data[
+         "Streamflow (m3/s)"
+     ].fillna(fill_val)
+
+     # Return the modified observed_data DataFrame
+     return observed_data
+
+
+ def bias_correct_historical(
+     simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+ ) -> pd.DataFrame:
+     """
+     Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+     simulation on a monthly basis.
+
+     Args:
+         simulated_data: A dataframe with a datetime index and a single column of streamflow values
+         observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+     Returns:
+         pandas DataFrame with a datetime index and a single column of streamflow values
+     """
+     # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+     unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+     dates = []
+     values = []
+
+     for month in unique_simulation_months:
+         # filter historic data to only be current month
+         monthly_simulated = simulated_data[
+             simulated_data.index.month == int(month)
+         ].dropna()
+         to_prob = _flow_and_probability_mapper(
+             monthly_simulated, to_probability=True
+         )
+         # filter the observations to current month
+         monthly_observed = observed_data[
+             observed_data.index.month == int(month)
+         ].dropna()
+         to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+         dates += monthly_simulated.index.to_list()
+         value = to_flow(to_prob(monthly_simulated.values))
+         values += value.tolist()
+
+     corrected = pd.DataFrame(
+         data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+     )
+     corrected.sort_index(inplace=True)
+     return corrected
+
+
+ def bias_correct_forecast(
+     forecast_data: pd.DataFrame,
+     simulated_data: pd.DataFrame,
+     observed_data: pd.DataFrame,
+     use_month: int = 0,
+ ) -> pd.DataFrame:
+     """
+     Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+     to correct biases in the forecasted data
+
+     Args:
+         forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+             forecast_stats, forecast_ensembles, forecast_records
+         simulated_data: A dataframe with a datetime index and a single column of streamflow values
+         observed_data: A dataframe with a datetime index and a single column of streamflow values
+         use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+             you want to correct based on the ending month of the forecast data
+
+     Returns:
+         pandas DataFrame with a copy of forecasted data with values updated in each column
+     """
+     # make a copy of the forecasts which we update and return so the original data is not changed
+     forecast_copy = forecast_data.copy()
+
+     # make the flow and probability interpolation functions
+     monthly_simulated = simulated_data[
+         simulated_data.index.month == forecast_copy.index[use_month].month
+     ].dropna()
+     monthly_observed = observed_data[
+         observed_data.index.month == forecast_copy.index[use_month].month
+     ].dropna()
+     to_prob = _flow_and_probability_mapper(
+         monthly_simulated, to_probability=True, extrapolate=True
+     )
+     to_flow = _flow_and_probability_mapper(
+         monthly_observed, to_flow=True, extrapolate=True
+     )
+
+     # for each column of forecast data, make the interpolation function and update the dataframe
+     for column in forecast_copy.columns:
+         tmp = forecast_copy[column].dropna()
+         forecast_copy.update(
+             pd.DataFrame(
+                 to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+             )
+         )
+
+     return forecast_copy
+
+
+ def _flow_and_probability_mapper(
+     monthly_data: pd.DataFrame,
+     to_probability: bool = False,
+     to_flow: bool = False,
+     extrapolate: bool = False,
+ ) -> interpolate.interp1d:
+     if not to_flow and not to_probability:
+         raise ValueError(
+             "You need to specify either to_probability or to_flow as True"
+         )
+
+     # get maximum value to bound histogram
+     max_val = math.ceil(np.max(monthly_data.max()))
+     min_val = math.floor(np.min(monthly_data.min()))
+
+     if max_val == min_val:
+         max_val += 0.1
+
+     # determine number of histograms bins needed
+     number_of_points = len(monthly_data.values)
+     number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+     # specify the bin width for histogram (in m3/s)
+     step_width = (max_val - min_val) / number_of_classes
+
+     # specify histogram bins
+     bins = np.arange(
+         -np.min(step_width),
+         max_val + 2 * np.min(step_width),
+         np.min(step_width),
+     )
+
+     if bins[0] == 0:
+         bins = np.concatenate((-bins[1], bins))
+     elif bins[0] > 0:
+         bins = np.concatenate((-bins[0], bins))
+
+     # make the histogram
+     counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+     # adjust the bins to be the center
+     bin_edges = bin_edges[1:]
+
+     # normalize the histograms
+     counts = counts.astype(float) / monthly_data.size
+
+     # calculate the cdfs
+     cdf = np.cumsum(counts)
+
+     # Identify indices where consecutive values are the same
+     duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+     # Adjust duplicate value to be an extrapolation of the previous value
+     for idx in duplicate_indices:
+         if idx > 0:
+             cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+     # interpolated function to convert simulated streamflow to prob
+     if to_probability:
+         if extrapolate:
+             func = interpolate.interp1d(
+                 bin_edges, cdf, fill_value="extrapolate"
+             )
+         else:
+             func = interpolate.interp1d(bin_edges, cdf)
+         return lambda x: np.clip(func(x), 0, 1)
+     # interpolated function to convert simulated prob to observed streamflow
+     elif to_flow:
+         if extrapolate:
+             return interpolate.interp1d(
+                 cdf, bin_edges, fill_value="extrapolate"
+             )
+         return interpolate.interp1d(cdf, bin_edges)
+
+
+ if __name__ == "__main__":
+     station_id = sys.argv[1]
+     reach_id = sys.argv[2]
+     observed_data_path = sys.argv[3].rstrip("/")
+     station_ensembles = sys.argv[4]
+     station_stats = sys.argv[5]
+
+     get_bias_corrected_data(
+         station_id,
+         reach_id,
+         observed_data_path,
+         station_ensembles,
+         station_stats,
+     )
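
Two notes on the new bias-correction module for readers skimming the diff. `_flow_and_probability_mapper` sizes its histogram with Sturges' rule, k = ceil(1 + 3.322 * log10(n)), so n = 1000 samples gives k = 11 bins, and the correction itself is standard quantile mapping: simulated flow is mapped to its simulated-CDF probability, then to the observed flow at that same probability. Also note that `get_bias_corrected_data` is annotated `-> dict` but actually returns a 2-tuple. A hypothetical end-to-end call (station, reach, and paths are placeholders; the observed CSV needs a `date` index and a `{station_id}_FLOW_cmd` column):

    import geoglows
    from loone_data_prep.flow_data import forecast_bias_correction as fbc

    # Forecast data fetched via the pinned geoglows 0.27.x streamflow API.
    ensembles = geoglows.streamflow.forecast_ensembles(750059975)
    stats = geoglows.streamflow.forecast_stats(750059975)

    corrected_ens, corrected_stats = fbc.get_bias_corrected_data(
        station_id="S65E",                 # column prefix in the observed CSV
        reach_id=750059975,
        observed_data_path="/data/S65E_flows.csv",
        station_ensembles=ensembles,
        station_stats=stats,
        cache_path="/tmp",                 # caches the historic simulation CSV
    )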