loone-data-prep 0.1.7.tar.gz → 0.1.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/PKG-INFO +2 -1
  2. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +2 -1
  3. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/LOONE_DATA_PREP.py +115 -0
  4. loone_data_prep-0.1.8/loone_data_prep/flow_data/forecast_bias_correction.py +293 -0
  5. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/utils.py +286 -78
  6. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/PKG-INFO +2 -1
  7. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/requires.txt +1 -0
  8. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/pyproject.toml +10 -5
  9. loone_data_prep-0.1.7/loone_data_prep/flow_data/forecast_bias_correction.py +0 -108
  10. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/LICENSE +0 -0
  11. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/README.md +0 -0
  12. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/__init__.py +0 -0
  13. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/data_analyses_fns.py +0 -0
  14. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/S65E_total.py +0 -0
  15. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/__init__.py +0 -0
  16. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_forecast_flows.py +0 -0
  17. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_inflows.py +0 -0
  18. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_outflows.py +0 -0
  19. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/hydro.py +0 -0
  20. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/__init__.py +0 -0
  21. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/get_all.py +0 -0
  22. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/hydro.py +0 -0
  23. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/__init__.py +0 -0
  24. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/get_inflows.py +0 -0
  25. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/get_lake_wq.py +0 -0
  26. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/wq.py +0 -0
  27. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/__init__.py +0 -0
  28. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/get_all.py +0 -0
  29. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/weather.py +0 -0
  30. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/SOURCES.txt +0 -0
  31. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/dependency_links.txt +0 -0
  32. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/top_level.txt +0 -0
  33. {loone_data_prep-0.1.7 → loone_data_prep-0.1.8}/setup.cfg +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: loone_data_prep
-Version: 0.1.7
+Version: 0.1.8
 Summary: Prepare data to run the LOONE model.
 Author-email: Osama Tarabih <osamatarabih@usf.edu>
 Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: rpy2
 Requires-Dist: retry
+Requires-Dist: numpy<2
 Requires-Dist: pandas
 Requires-Dist: scipy
 Requires-Dist: geoglows==0.27.1

loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
     LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
     # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
     LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+    LO_Stage.index = LO_Stage["date"]
     # Calculate average
     if "Average_Stage" not in LO_Stage.columns:
         LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
-        LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+        LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
     LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
     LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
     LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
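
Note on this hunk: the changed lines keep the non-numeric date column out of the row-wise stage average. A minimal, self-contained sketch of the behavior, using made-up gauge columns and values rather than the package's real LO_Stage.csv:

import pandas as pd

# Hypothetical stage frame; the real LO_Stage.csv columns differ.
LO_Stage = pd.DataFrame({
    "date": ["2023-01-01", "2023-01-02"],
    "Gauge_A": [14.5, 15.0],  # made-up stage values
    "Gauge_B": [14.0, 14.5],
})

# mean(axis=1) over the full frame would pull the non-numeric "date"
# column into the average (a TypeError on pandas >= 2.0); dropping it
# first averages only the stage columns.
LO_Stage["Average_Stage"] = LO_Stage.drop(columns=["date"]).mean(axis=1)
print(LO_Stage["Average_Stage"].tolist())  # [14.25, 14.75]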

loone_data_prep/LOONE_DATA_PREP.py
@@ -580,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
     LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
     LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+    # Create File (N_OP) (L001, L005, L008)
+    n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+    n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+    n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+    n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+    # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+    s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+    s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+    s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+    s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)
 
     # Interpolated NH4 Observations in Lake
     # Create File (LO_Avg_NH4)
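
Both new blocks above follow one pattern: keep date plus a regional subset of station columns, take the row-wise mean, and convert mg/L to mg/m3 (1 mg/L = 1,000 mg/m3, since 1 m3 = 1,000 L). A standalone sketch of the north-basin case with invented concentrations:

import pandas as pd

# Hypothetical interpolated OP values in mg/L; the real frame holds more stations.
LO_OP_data_Inter = pd.DataFrame({
    "date": ["2023-01-01", "2023-01-02"],
    "Data_L001_OP_Inter": [0.02, 0.03],
    "Data_L005_OP_Inter": [0.04, 0.05],
    "Data_L008_OP_Inter": [0.06, 0.07],
})

# Row-average the northern stations (L001, L005, L008), then scale to mg/m3.
# The .copy() just avoids a chained-assignment warning in this sketch.
n_op = LO_OP_data_Inter[
    ["date", "Data_L001_OP_Inter", "Data_L005_OP_Inter", "Data_L008_OP_Inter"]
].copy()
n_op["OP"] = n_op.mean(axis=1, numeric_only=True) * 1000  # ~40 and ~50 mg/m3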
@@ -662,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
     LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
     LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+    # Create File (N_DIN) (L001, L005, L008)
+    n_din = pd.DataFrame(date_DIN, columns=['date'])
+    n_din.set_index('date', inplace=True)
+    n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+    n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+    n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+    n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+    # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+    s_din = pd.DataFrame(date_DIN, columns=['date'])
+    s_din.set_index('date', inplace=True)
+    s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+    s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+    s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+    s_din.to_csv(f'{output_dir}/S_DIN.csv')
 
     # Interpolated DO Observations in Lake
     # Create File (LO_Avg_DO)
@@ -821,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
     LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
     LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')
 
+    # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+    L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+    L001_Chla.drop(columns=['days'], inplace=True)
+    L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+    L004_Chla.drop(columns=['days'], inplace=True)
+    L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+    L005_Chla.drop(columns=['days'], inplace=True)
+    L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+    L006_Chla.drop(columns=['days'], inplace=True)
+    L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+    L007_Chla.drop(columns=['days'], inplace=True)
+    L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+    L008_Chla.drop(columns=['days'], inplace=True)
+    LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+    LZ40_Chla.drop(columns=['days'], inplace=True)
+    L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+    L001_Chla_LC.drop(columns=['days'], inplace=True)
+    L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+    L004_Chla_LC.drop(columns=['days'], inplace=True)
+    L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+    L005_Chla_LC.drop(columns=['days'], inplace=True)
+    L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+    L006_Chla_LC.drop(columns=['days'], inplace=True)
+    L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+    L007_Chla_LC.drop(columns=['days'], inplace=True)
+    L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+    L008_Chla_LC.drop(columns=['days'], inplace=True)
+    LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+    LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+    LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+    LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+    LO_Chla = LO_Chla.set_index('date')
+    LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+    LO_Chla = LO_Chla.reset_index()
+    LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+    LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L', 'LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+    LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+    LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+    LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+    LO_Chla_LC = LO_Chla_LC.set_index('date')
+    LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+    LO_Chla_LC = LO_Chla_LC.reset_index()
+    LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+    LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L', 'LZ40_CHLOROPHYLL-A(LC)_ug/L']
+    LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+    LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+    LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+    LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+    LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+    LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+    LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+    LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+    LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+    LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+    LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+    LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+    LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+    LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+    LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
     # Create Files S65E_Avg_Chla
     S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
     S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
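
The large block above splices two observation records: the "CHLOROPHYLL-A, CORRECTED" series through 2010-10-19 and the "CHLOROPHYLL-A(LC)" series from 2010-10-20 onward, each reduced to lake-wide, north (L001, L005, L008), and south (L004, L006, L007, L008, LZ40) means. A minimal sketch of the final splice step, with invented values:

import pandas as pd

# Hypothetical tail and head of the two records at the changeover date.
older = pd.DataFrame({"date": ["2010-10-18", "2010-10-19"], "Chla": [23.5, 24.0]})
newer = pd.DataFrame({"date": ["2010-10-20", "2010-10-21"], "Chla": [24.1, 24.6]})

# Mirrors the pd.concat(...).reset_index(drop=True) step: one continuous
# record with a fresh integer index across the boundary.
merged = pd.concat([older, newer]).reset_index(drop=True)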

loone_data_prep-0.1.8/loone_data_prep/flow_data/forecast_bias_correction.py
@@ -0,0 +1,293 @@
+import sys
+import os
+import math
+import numpy as np
+import pandas as pd
+import geoglows
+from scipy import interpolate
+
+
+SECONDS_IN_DAY = 86400
+
+
+def get_bias_corrected_data(
+    station_id: str,
+    reach_id: str,
+    observed_data_path: str,
+    station_ensembles: pd.DataFrame,
+    station_stats: pd.DataFrame,
+    cache_path: str = None,
+) -> dict:
+    # Load the observed data from a CSV file
+    observed_data = pd.read_csv(
+        observed_data_path,
+        index_col=0,
+        usecols=["date", f"{station_id}_FLOW_cmd"],
+    )
+    # Convert the index to datetime and localize it to UTC
+    observed_data.index = pd.to_datetime(observed_data.index).tz_localize(
+        "UTC"
+    )
+    # Transform the data by dividing it by the number of seconds in a day
+    observed_data = observed_data.transform(lambda x: x / SECONDS_IN_DAY)
+    # Rename the value column to "Streamflow (m3/s)"
+    observed_data.rename(
+        columns={f"{station_id}_FLOW_cmd": "Streamflow (m3/s)"}, inplace=True
+    )
+
+    # Prepare the observed data by filling NaN values with the 10yr average
+    prepared_od = prep_observed_data(observed_data)
+
+    # Get the historical simulation data for the given reach ID
+    historical_data = None
+
+    if cache_path is None:
+        historical_data = geoglows.streamflow.historic_simulation(reach_id)
+    else:
+        # Create the geoglows cache directory if it doesn't exist
+        geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+        if not os.path.exists(geoglows_cache_path):
+            os.makedirs(geoglows_cache_path)
+
+        # Check if the historical simulation data is already cached
+        if os.path.exists(
+            os.path.join(
+                geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+            )
+        ):
+            historical_data = pd.read_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                ),
+                index_col=0,
+            )
+            historical_data.index = pd.to_datetime(historical_data.index)
+        else:
+            historical_data = geoglows.streamflow.historic_simulation(reach_id)
+            historical_data.to_csv(
+                os.path.join(
+                    geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                )
+            )
+
+    # Correct the forecast bias in the station ensembles
+    station_ensembles = bias_correct_forecast(
+        station_ensembles, historical_data, prepared_od
+    )
+    # Correct the forecast bias in the station stats
+    station_stats = bias_correct_forecast(
+        station_stats, historical_data, prepared_od
+    )
+
+    # Return the bias-corrected station ensembles and station stats
+    return station_ensembles, station_stats
+
+
+def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
+    # Group the data by month and day
+    grouped_data = observed_data.groupby(
+        [observed_data.index.month, observed_data.index.day]
+    )
+
+    # Calculate the rolling average of 'Streamflow (m3/s)' for each group
+    daily_10yr_avg = (
+        grouped_data["Streamflow (m3/s)"]
+        .rolling(window=10, min_periods=1, center=True)
+        .mean()
+    )
+
+    # Reset the multi-index of daily_10yr_avg and sort it by index
+    fill_val = daily_10yr_avg.reset_index(level=[0, 1], drop=True).sort_index()
+
+    # Fill NaN in 'Streamflow (m3/s)' with corresponding values from fill_val
+    observed_data["Streamflow (m3/s)"] = observed_data[
+        "Streamflow (m3/s)"
+    ].fillna(fill_val)
+
+    # Return the modified observed_data DataFrame
+    return observed_data
+
+
+def bias_correct_historical(
+    simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+    simulation on a monthly basis.
+
+    Args:
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+    Returns:
+        pandas DataFrame with a datetime index and a single column of streamflow values
+    """
+    # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+    unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+    dates = []
+    values = []
+
+    for month in unique_simulation_months:
+        # filter historic data to only be current month
+        monthly_simulated = simulated_data[
+            simulated_data.index.month == int(month)
+        ].dropna()
+        to_prob = _flow_and_probability_mapper(
+            monthly_simulated, to_probability=True
+        )
+        # filter the observations to current month
+        monthly_observed = observed_data[
+            observed_data.index.month == int(month)
+        ].dropna()
+        to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+        dates += monthly_simulated.index.to_list()
+        value = to_flow(to_prob(monthly_simulated.values))
+        values += value.tolist()
+
+    corrected = pd.DataFrame(
+        data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+    )
+    corrected.sort_index(inplace=True)
+    return corrected
+
+
+def bias_correct_forecast(
+    forecast_data: pd.DataFrame,
+    simulated_data: pd.DataFrame,
+    observed_data: pd.DataFrame,
+    use_month: int = 0,
+) -> pd.DataFrame:
+    """
+    Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+    to correct biases in the forecasted data
+
+    Args:
+        forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+            forecast_stats, forecast_ensembles, forecast_records
+        simulated_data: A dataframe with a datetime index and a single column of streamflow values
+        observed_data: A dataframe with a datetime index and a single column of streamflow values
+        use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+            you want to correct based on the ending month of the forecast data
+
+    Returns:
+        pandas DataFrame with a copy of forecasted data with values updated in each column
+    """
+    # make a copy of the forecasts which we update and return so the original data is not changed
+    forecast_copy = forecast_data.copy()
+
+    # make the flow and probability interpolation functions
+    monthly_simulated = simulated_data[
+        simulated_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    monthly_observed = observed_data[
+        observed_data.index.month == forecast_copy.index[use_month].month
+    ].dropna()
+    to_prob = _flow_and_probability_mapper(
+        monthly_simulated, to_probability=True, extrapolate=True
+    )
+    to_flow = _flow_and_probability_mapper(
+        monthly_observed, to_flow=True, extrapolate=True
+    )
+
+    # for each column of forecast data, make the interpolation function and update the dataframe
+    for column in forecast_copy.columns:
+        tmp = forecast_copy[column].dropna()
+        forecast_copy.update(
+            pd.DataFrame(
+                to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+            )
+        )
+
+    return forecast_copy
+
+
+def _flow_and_probability_mapper(
+    monthly_data: pd.DataFrame,
+    to_probability: bool = False,
+    to_flow: bool = False,
+    extrapolate: bool = False,
+) -> interpolate.interp1d:
+    if not to_flow and not to_probability:
+        raise ValueError(
+            "You need to specify either to_probability or to_flow as True"
+        )
+
+    # get maximum value to bound histogram
+    max_val = math.ceil(np.max(monthly_data.max()))
+    min_val = math.floor(np.min(monthly_data.min()))
+
+    if max_val == min_val:
+        max_val += 0.1
+
+    # determine number of histograms bins needed
+    number_of_points = len(monthly_data.values)
+    number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+    # specify the bin width for histogram (in m3/s)
+    step_width = (max_val - min_val) / number_of_classes
+
+    # specify histogram bins
+    bins = np.arange(
+        -np.min(step_width),
+        max_val + 2 * np.min(step_width),
+        np.min(step_width),
+    )
+
+    if bins[0] == 0:
+        bins = np.concatenate((-bins[1], bins))
+    elif bins[0] > 0:
+        bins = np.concatenate((-bins[0], bins))
+
+    # make the histogram
+    counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+    # adjust the bins to be the center
+    bin_edges = bin_edges[1:]
+
+    # normalize the histograms
+    counts = counts.astype(float) / monthly_data.size
+
+    # calculate the cdfs
+    cdf = np.cumsum(counts)
+
+    # Identify indices where consecutive values are the same
+    duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+    # Adjust duplicate value to be an extrapolation of the previous value
+    for idx in duplicate_indices:
+        if idx > 0:
+            cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+    # interpolated function to convert simulated streamflow to prob
+    if to_probability:
+        if extrapolate:
+            func = interpolate.interp1d(
+                bin_edges, cdf, fill_value="extrapolate"
+            )
+        else:
+            func = interpolate.interp1d(bin_edges, cdf)
+        return lambda x: np.clip(func(x), 0, 1)
+    # interpolated function to convert simulated prob to observed streamflow
+    elif to_flow:
+        if extrapolate:
+            return interpolate.interp1d(
+                cdf, bin_edges, fill_value="extrapolate"
+            )
+        return interpolate.interp1d(cdf, bin_edges)
+
+
+if __name__ == "__main__":
+    station_id = sys.argv[1]
+    reach_id = sys.argv[2]
+    observed_data_path = sys.argv[3].rstrip("/")
+    station_ensembles = sys.argv[4]
+    station_stats = sys.argv[5]
+
+    get_bias_corrected_data(
+        station_id,
+        reach_id,
+        observed_data_path,
+        station_ensembles,
+        station_stats,
+    )
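
For orientation, a hedged sketch of how the rewritten module might be driven. The reach and station IDs and paths below are placeholders, and the ensemble/stats frames are assumed to come from the forecast endpoints of the pinned geoglows==0.27.1 client (the module itself only shows geoglows.streamflow.historic_simulation):

import geoglows
from loone_data_prep.flow_data.forecast_bias_correction import (
    get_bias_corrected_data,
)

reach_id = "13081500"  # placeholder GEOGLOWS reach ID
ensembles = geoglows.streamflow.forecast_ensembles(reach_id)  # assumed API
stats = geoglows.streamflow.forecast_stats(reach_id)          # assumed API

# The observed CSV must carry "date" and f"{station_id}_FLOW_cmd" columns
# (cubic meters per day); the function converts them to m3/s internally.
corrected_ensembles, corrected_stats = get_bias_corrected_data(
    station_id="S65E",                            # placeholder station ID
    reach_id=reach_id,
    observed_data_path="data/S65E_FLOW_cmd.csv",  # placeholder path
    station_ensembles=ensembles,
    station_stats=stats,
    cache_path="data",  # optional: caches the historic simulation on disk
)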