loone-data-prep 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
  LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
  # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
  LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+ LO_Stage.index = LO_Stage["date"]
  # Calculate average
  if "Average_Stage" not in LO_Stage.columns:
  LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
- LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+ LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
  LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
  LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
  LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
  LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
  LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
  LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
- LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
 
  # RFVol acft
  # Create File (RF_Volume)
@@ -581,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
  LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
  LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+ # Create File (N_OP) (L001, L005, L008)
+ n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+ n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+ n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+ n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+ # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+ s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+ s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+ s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+ s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)
 
  # Interpolated NH4 Observations in Lake
  # Create File (LO_Avg_NH4)
@@ -663,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
  LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
  LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+ # Create File (N_DIN) (L001, L005, L008)
+ n_din = pd.DataFrame(date_DIN, columns=['date'])
+ n_din.set_index('date', inplace=True)
+ n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+ n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+ n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+ n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+ # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+ s_din = pd.DataFrame(date_DIN, columns=['date'])
+ s_din.set_index('date', inplace=True)
+ s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+ s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+ s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+ s_din.to_csv(f'{output_dir}/S_DIN.csv')
 
  # Interpolated DO Observations in Lake
  # Create File (LO_Avg_DO)
@@ -822,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
  LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')
 
+ # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+ L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+ L001_Chla.drop(columns=['days'], inplace=True)
+ L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+ L004_Chla.drop(columns=['days'], inplace=True)
+ L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+ L005_Chla.drop(columns=['days'], inplace=True)
+ L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+ L006_Chla.drop(columns=['days'], inplace=True)
+ L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+ L007_Chla.drop(columns=['days'], inplace=True)
+ L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+ L008_Chla.drop(columns=['days'], inplace=True)
+ LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+ LZ40_Chla.drop(columns=['days'], inplace=True)
+ L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+ L001_Chla_LC.drop(columns=['days'], inplace=True)
+ L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+ L004_Chla_LC.drop(columns=['days'], inplace=True)
+ L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+ L005_Chla_LC.drop(columns=['days'], inplace=True)
+ L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+ L006_Chla_LC.drop(columns=['days'], inplace=True)
+ L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+ L007_Chla_LC.drop(columns=['days'], inplace=True)
+ L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+ L008_Chla_LC.drop(columns=['days'], inplace=True)
+ LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+ LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+ LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = LO_Chla.set_index('date')
+ LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+ LO_Chla = LO_Chla.reset_index()
+ LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+ LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L','L008_CHLOROPHYLL-A, CORRECTED_ug/L','LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+ LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = LO_Chla_LC.set_index('date')
+ LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+ LO_Chla_LC = LO_Chla_LC.reset_index()
+ LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+ LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L','L008_CHLOROPHYLL-A(LC)_ug/L','LZ40_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+ LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+ LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+ LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+ LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+ LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+ LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+ LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+ LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+ LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+ LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+ LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+ LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+ LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+ LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
  # Create Files S65E_Avg_Chla
  S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
  S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
@@ -897,7 +1011,6 @@ def main(input_dir: str, output_dir: str) -> None:
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
- LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
  # Write S65 TP concentrations (mg/L)
  S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
  # TP External Loads 3 Months Lag (mg)
@@ -2,22 +2,87 @@ import sys
  from retry import retry
  from rpy2.robjects import r
  from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
 
 
  @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
- def get(workspace):
+ def get(
+ workspace,
+ date_min: str = "1972-01-01",
+ date_max: str = "2023-06-30"
+ ) -> None:
  r(
  f"""
  # Load the required libraries
  library(dbhydroR)
-
- #S65E_Total
- S65E_total = get_hydro(dbkey = c("91656", "AL760"), date_min = "1972-01-01", date_max = "2023-06-30")
- S65E_total[, -1] <- S65E_total[, -1] * (0.0283168466 * 86400)
- write.csv(S65E_total,file ='{workspace}/S65E_total.csv')
+ library(dplyr)
+
+ # Helper Functions
+ retrieve_data <- function(dbkey, date_min, date_max)
+ {{
+ # Get the data from dbhydro
+ df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+ # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+ colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+ # Add a type and units column to data so it can be cleaned using the clean_hydro function
+ df$type <- "FLOW"
+ df$units <- "cfs"
+
+ # Clean the data.frame
+ df <- clean_hydro(df)
+
+ # Drop the " _FLOW_cfs" column
+ df <- df %>% select(-` _FLOW_cfs`)
+
+ # Convert Flow rate from cfs to m³/day
+ df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+ # Return resulting data.frame
+ return(df)
+ }}
+
+ # S65E_S
+ S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+ # Wait five seconds before next request to avoid "too many requests" error
+ Sys.sleep(5)
+
+ # S65EX1_S
+ S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+ # Merge the data from each dbkey
+ result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+ # Write the data to a file
+ write.csv(result, file = '{workspace}/S65E_total.csv')
  """
  )
+
+ _reformat_s65e_total_file(workspace)
 
+ def _reformat_s65e_total_file(workspace: str):
+ # Read in the data
+ df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+ # Drop unused columns
+ df.drop('Unnamed: 0', axis=1, inplace=True)
+
+ # Convert date column to datetime
+ df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+ # Sort the data by date
+ df.sort_values('date', inplace=True)
+
+ # Renumber the index
+ df.reset_index(drop=True, inplace=True)
+
+ # Drop rows that are missing all their values
+ df.dropna(how='all', inplace=True)
+
+ # Write the updated data back to the file
+ df.to_csv(f"{workspace}/S65E_total.csv")
 
  if __name__ == "__main__":
  workspace = sys.argv[1].rstrip("/")
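
For reference, a minimal usage sketch of the reworked retrieval entry point above. It assumes `get` (as defined in this diff) is already importable into the calling script; the workspace path and date window below are placeholders, not package defaults. The script's conversion factor of 0.0283168466 * 86400 ≈ 2446.58 reflects 1 cfs = 0.0283168466 m³/s times 86,400 s/day.

    import pandas as pd

    # Illustrative only: `get` is the function added above; its module path is
    # not shown in this diff, so the call assumes it is already in scope.
    workspace = "/tmp/loone_workspace"  # placeholder path
    get(workspace, date_min="2000-01-01", date_max="2020-12-31")

    # S65E_total.csv now holds one row per date, sorted, with flows in m3/day.
    s65e = pd.read_csv(f"{workspace}/S65E_total.csv", index_col=0, parse_dates=["date"])
    print(s65e.head())
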
@@ -1,7 +1,10 @@
  import sys
  import os
+ import math
+ import numpy as np
  import pandas as pd
  import geoglows
+ from scipy import interpolate
 
 
  SECONDS_IN_DAY = 86400
@@ -37,29 +40,42 @@ def get_bias_corrected_data(
 
  # Get the historical simulation data for the given reach ID
  historical_data = None
-
+
  if cache_path is None:
  historical_data = geoglows.streamflow.historic_simulation(reach_id)
  else:
  # Create the geoglows cache directory if it doesn't exist
- geoglows_cache_path = os.path.join(cache_path, 'geoglows_cache')
+ geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
  if not os.path.exists(geoglows_cache_path):
  os.makedirs(geoglows_cache_path)
-
+
  # Check if the historical simulation data is already cached
- if os.path.exists(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv')):
- historical_data = pd.read_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'), index_col=0)
+ if os.path.exists(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ )
+ ):
+ historical_data = pd.read_csv(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ ),
+ index_col=0,
+ )
  historical_data.index = pd.to_datetime(historical_data.index)
  else:
  historical_data = geoglows.streamflow.historic_simulation(reach_id)
- historical_data.to_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'))
+ historical_data.to_csv(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ )
+ )
 
  # Correct the forecast bias in the station ensembles
- station_ensembles = geoglows.bias.correct_forecast(
+ station_ensembles = bias_correct_forecast(
  station_ensembles, historical_data, prepared_od
  )
  # Correct the forecast bias in the station stats
- station_stats = geoglows.bias.correct_forecast(
+ station_stats = bias_correct_forecast(
  station_stats, historical_data, prepared_od
  )
 
@@ -92,6 +108,175 @@ def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
  return observed_data
 
 
+ def bias_correct_historical(
+ simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+ ) -> pd.DataFrame:
+ """
+ Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+ simulation on a monthly basis.
+
+ Args:
+ simulated_data: A dataframe with a datetime index and a single column of streamflow values
+ observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+ Returns:
+ pandas DataFrame with a datetime index and a single column of streamflow values
+ """
+ # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+ unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+ dates = []
+ values = []
+
+ for month in unique_simulation_months:
+ # filter historic data to only be current month
+ monthly_simulated = simulated_data[
+ simulated_data.index.month == int(month)
+ ].dropna()
+ to_prob = _flow_and_probability_mapper(
+ monthly_simulated, to_probability=True
+ )
+ # filter the observations to current month
+ monthly_observed = observed_data[
+ observed_data.index.month == int(month)
+ ].dropna()
+ to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+ dates += monthly_simulated.index.to_list()
+ value = to_flow(to_prob(monthly_simulated.values))
+ values += value.tolist()
+
+ corrected = pd.DataFrame(
+ data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+ )
+ corrected.sort_index(inplace=True)
+ return corrected
+
+
+ def bias_correct_forecast(
+ forecast_data: pd.DataFrame,
+ simulated_data: pd.DataFrame,
+ observed_data: pd.DataFrame,
+ use_month: int = 0,
+ ) -> pd.DataFrame:
+ """
+ Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+ to correct biases in the forecasted data
+
+ Args:
+ forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+ forecast_stats, forecast_ensembles, forecast_records
+ simulated_data: A dataframe with a datetime index and a single column of streamflow values
+ observed_data: A dataframe with a datetime index and a single column of streamflow values
+ use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+ you want to correct based on the ending month of the forecast data
+
+ Returns:
+ pandas DataFrame with a copy of forecasted data with values updated in each column
+ """
+ # make a copy of the forecasts which we update and return so the original data is not changed
+ forecast_copy = forecast_data.copy()
+
+ # make the flow and probability interpolation functions
+ monthly_simulated = simulated_data[
+ simulated_data.index.month == forecast_copy.index[use_month].month
+ ].dropna()
+ monthly_observed = observed_data[
+ observed_data.index.month == forecast_copy.index[use_month].month
+ ].dropna()
+ to_prob = _flow_and_probability_mapper(
+ monthly_simulated, to_probability=True, extrapolate=True
+ )
+ to_flow = _flow_and_probability_mapper(
+ monthly_observed, to_flow=True, extrapolate=True
+ )
+
+ # for each column of forecast data, make the interpolation function and update the dataframe
+ for column in forecast_copy.columns:
+ tmp = forecast_copy[column].dropna()
+ forecast_copy.update(
+ pd.DataFrame(
+ to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+ )
+ )
+
+ return forecast_copy
+
+
+ def _flow_and_probability_mapper(
+ monthly_data: pd.DataFrame,
+ to_probability: bool = False,
+ to_flow: bool = False,
+ extrapolate: bool = False,
+ ) -> interpolate.interp1d:
+ if not to_flow and not to_probability:
+ raise ValueError(
+ "You need to specify either to_probability or to_flow as True"
+ )
+
+ # get maximum value to bound histogram
+ max_val = math.ceil(np.max(monthly_data.max()))
+ min_val = math.floor(np.min(monthly_data.min()))
+
+ if max_val == min_val:
+ max_val += 0.1
+
+ # determine number of histograms bins needed
+ number_of_points = len(monthly_data.values)
+ number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+ # specify the bin width for histogram (in m3/s)
+ step_width = (max_val - min_val) / number_of_classes
+
+ # specify histogram bins
+ bins = np.arange(
+ -np.min(step_width),
+ max_val + 2 * np.min(step_width),
+ np.min(step_width),
+ )
+
+ if bins[0] == 0:
+ bins = np.concatenate((-bins[1], bins))
+ elif bins[0] > 0:
+ bins = np.concatenate((-bins[0], bins))
+
+ # make the histogram
+ counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+ # adjust the bins to be the center
+ bin_edges = bin_edges[1:]
+
+ # normalize the histograms
+ counts = counts.astype(float) / monthly_data.size
+
+ # calculate the cdfs
+ cdf = np.cumsum(counts)
+
+ # Identify indices where consecutive values are the same
+ duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+ # Adjust duplicate value to be an extrapolation of the previous value
+ for idx in duplicate_indices:
+ if idx > 0:
+ cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+ # interpolated function to convert simulated streamflow to prob
+ if to_probability:
+ if extrapolate:
+ func = interpolate.interp1d(
+ bin_edges, cdf, fill_value="extrapolate"
+ )
+ else:
+ func = interpolate.interp1d(bin_edges, cdf)
+ return lambda x: np.clip(func(x), 0, 1)
+ # interpolated function to convert simulated prob to observed streamflow
+ elif to_flow:
+ if extrapolate:
+ return interpolate.interp1d(
+ cdf, bin_edges, fill_value="extrapolate"
+ )
+ return interpolate.interp1d(cdf, bin_edges)
+
+
  if __name__ == "__main__":
  station_id = sys.argv[1]
  reach_id = sys.argv[2]
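
The helpers added above replace geoglows.bias.correct_forecast with a local quantile-mapping implementation: flows are mapped to cumulative probabilities on the simulated distribution, then back to flows on the observed distribution. A standalone, simplified sketch of that mapping on synthetic data (sorted empirical CDFs stand in for the monthly histograms used above; all names and values here are illustrative only, not part of the package):

    import numpy as np
    from scipy import interpolate

    rng = np.random.default_rng(42)
    simulated = np.sort(rng.gamma(2.0, 120.0, 1000))  # biased-high model flows
    observed = np.sort(rng.gamma(2.0, 100.0, 1000))   # gauge flows

    # Empirical CDFs: flow -> probability on the simulated series,
    # probability -> flow on the observed series.
    probs = np.linspace(0.0, 1.0, simulated.size)
    to_prob = interpolate.interp1d(simulated, probs, fill_value="extrapolate")
    to_flow = interpolate.interp1d(probs, observed, fill_value="extrapolate")

    raw_forecast = np.array([150.0, 300.0, 600.0])
    corrected = to_flow(np.clip(to_prob(raw_forecast), 0.0, 1.0))
    print(corrected)  # forecast values shifted onto the observed distribution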