loone-data-prep 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
  LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
  # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
  LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+ LO_Stage.index = LO_Stage["date"]
  # Calculate average
  if "Average_Stage" not in LO_Stage.columns:
  LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
- LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+ LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
  LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
  LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
  LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
  LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
  LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
  LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
- LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
 
  # RFVol acft
  # Create File (RF_Volume)
@@ -581,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
  LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
  LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+ # Create File (N_OP) (L001, L005, L008)
+ n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+ n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+ n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+ n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+ # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+ s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+ s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000 # mg/L to mg/m3
+ s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+ s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)
 
  # Interpolated NH4 Observations in Lake
  # Create File (LO_Avg_NH4)
@@ -663,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
  LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
  LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+ # Create File (N_DIN) (L001, L005, L008)
+ n_din = pd.DataFrame(date_DIN, columns=['date'])
+ n_din.set_index('date', inplace=True)
+ n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+ n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+ n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+ n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+ # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+ s_din = pd.DataFrame(date_DIN, columns=['date'])
+ s_din.set_index('date', inplace=True)
+ s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+ s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000 # mg/L to mg/m3
+ s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000 # mg/L to mg/m3
+ s_din.to_csv(f'{output_dir}/S_DIN.csv')
 
  # Interpolated DO Observations in Lake
  # Create File (LO_Avg_DO)
@@ -822,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
  LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')
 
+ # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+ L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+ L001_Chla.drop(columns=['days'], inplace=True)
+ L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+ L004_Chla.drop(columns=['days'], inplace=True)
+ L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+ L005_Chla.drop(columns=['days'], inplace=True)
+ L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+ L006_Chla.drop(columns=['days'], inplace=True)
+ L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+ L007_Chla.drop(columns=['days'], inplace=True)
+ L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+ L008_Chla.drop(columns=['days'], inplace=True)
+ LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+ LZ40_Chla.drop(columns=['days'], inplace=True)
+ L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+ L001_Chla_LC.drop(columns=['days'], inplace=True)
+ L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+ L004_Chla_LC.drop(columns=['days'], inplace=True)
+ L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+ L005_Chla_LC.drop(columns=['days'], inplace=True)
+ L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+ L006_Chla_LC.drop(columns=['days'], inplace=True)
+ L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+ L007_Chla_LC.drop(columns=['days'], inplace=True)
+ L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+ L008_Chla_LC.drop(columns=['days'], inplace=True)
+ LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+ LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+ LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = LO_Chla.set_index('date')
+ LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+ LO_Chla = LO_Chla.reset_index()
+ LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+ LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L','L008_CHLOROPHYLL-A, CORRECTED_ug/L','LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+ LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = LO_Chla_LC.set_index('date')
+ LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+ LO_Chla_LC = LO_Chla_LC.reset_index()
+ LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+ LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L','L008_CHLOROPHYLL-A(LC)_ug/L','LZ40_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+ LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+ LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+ LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+ LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+ LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+ LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+ LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+ LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+ LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+ LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+ LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+ LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+ LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+ LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
  # Create Files S65E_Avg_Chla
  S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
  S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
@@ -897,7 +1011,6 @@ def main(input_dir: str, output_dir: str) -> None:
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
- LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
  # Write S65 TP concentrations (mg/L)
  S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
  # TP External Loads 3 Months Lag (mg)
@@ -2,22 +2,87 @@ import sys
  from retry import retry
  from rpy2.robjects import r
  from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
 
 
  @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
- def get(workspace):
+ def get(
+ workspace,
+ date_min: str = "1972-01-01",
+ date_max: str = "2023-06-30"
+ ) -> None:
  r(
  f"""
  # Load the required libraries
  library(dbhydroR)
-
- #S65E_Total
- S65E_total = get_hydro(dbkey = c("91656", "AL760"), date_min = "1972-01-01", date_max = "2023-06-30")
- S65E_total[, -1] <- S65E_total[, -1] * (0.0283168466 * 86400)
- write.csv(S65E_total,file ='{workspace}/S65E_total.csv')
+ library(dplyr)
+
+ # Helper Functions
+ retrieve_data <- function(dbkey, date_min, date_max)
+ {{
+ # Get the data from dbhydro
+ df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+ # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+ colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+ # Add a type and units column to data so it can be cleaned using the clean_hydro function
+ df$type <- "FLOW"
+ df$units <- "cfs"
+
+ # Clean the data.frame
+ df <- clean_hydro(df)
+
+ # Drop the " _FLOW_cfs" column
+ df <- df %>% select(-` _FLOW_cfs`)
+
+ # Convert Flow rate from cfs to m³/day
+ df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+ # Return resulting data.frame
+ return(df)
+ }}
+
+ # S65E_S
+ S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+ # Wait five seconds before next request to avoid "too many requests" error
+ Sys.sleep(5)
+
+ # S65EX1_S
+ S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+ # Merge the data from each dbkey
+ result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+ # Write the data to a file
+ write.csv(result, file = '{workspace}/S65E_total.csv')
  """
  )
+
+ _reformat_s65e_total_file(workspace)
 
+ def _reformat_s65e_total_file(workspace: str):
+ # Read in the data
+ df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+ # Drop unused columns
+ df.drop('Unnamed: 0', axis=1, inplace=True)
+
+ # Convert date column to datetime
+ df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+ # Sort the data by date
+ df.sort_values('date', inplace=True)
+
+ # Renumber the index
+ df.reset_index(drop=True, inplace=True)
+
+ # Drop rows that are missing all their values
+ df.dropna(how='all', inplace=True)
+
+ # Write the updated data back to the file
+ df.to_csv(f"{workspace}/S65E_total.csv")
 
  if __name__ == "__main__":
  workspace = sys.argv[1].rstrip("/")
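
For reference, a minimal usage sketch of the reworked retrieval entry point above. It assumes `get` (as defined in this diff) is already importable into the calling script; the workspace path and date window below are placeholders, not package defaults. The script's conversion factor of 0.0283168466 * 86400 ≈ 2446.58 reflects 1 cfs = 0.0283168466 m³/s times 86,400 s/day.

    import pandas as pd

    # Illustrative only: `get` is the function added above; its module path is
    # not shown in this diff, so the call assumes it is already in scope.
    workspace = "/tmp/loone_workspace"  # placeholder path
    get(workspace, date_min="2000-01-01", date_max="2020-12-31")

    # S65E_total.csv now holds one row per date, sorted, with flows in m3/day.
    s65e = pd.read_csv(f"{workspace}/S65E_total.csv", index_col=0, parse_dates=["date"])
    print(s65e.head())
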
@@ -1,7 +1,10 @@
  import sys
  import os
+ import math
+ import numpy as np
  import pandas as pd
  import geoglows
+ from scipy import interpolate
 
 
  SECONDS_IN_DAY = 86400
@@ -37,29 +40,42 @@ def get_bias_corrected_data(
 
  # Get the historical simulation data for the given reach ID
  historical_data = None
-
+
  if cache_path is None:
  historical_data = geoglows.streamflow.historic_simulation(reach_id)
  else:
  # Create the geoglows cache directory if it doesn't exist
- geoglows_cache_path = os.path.join(cache_path, 'geoglows_cache')
+ geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
  if not os.path.exists(geoglows_cache_path):
  os.makedirs(geoglows_cache_path)
-
+
  # Check if the historical simulation data is already cached
- if os.path.exists(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv')):
- historical_data = pd.read_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'), index_col=0)
+ if os.path.exists(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ )
+ ):
+ historical_data = pd.read_csv(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ ),
+ index_col=0,
+ )
  historical_data.index = pd.to_datetime(historical_data.index)
  else:
  historical_data = geoglows.streamflow.historic_simulation(reach_id)
- historical_data.to_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'))
+ historical_data.to_csv(
+ os.path.join(
+ geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ )
+ )
 
  # Correct the forecast bias in the station ensembles
- station_ensembles = geoglows.bias.correct_forecast(
+ station_ensembles = bias_correct_forecast(
  station_ensembles, historical_data, prepared_od
  )
  # Correct the forecast bias in the station stats
- station_stats = geoglows.bias.correct_forecast(
+ station_stats = bias_correct_forecast(
  station_stats, historical_data, prepared_od
  )
 
@@ -92,6 +108,175 @@ def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
  return observed_data
 
 
+ def bias_correct_historical(
+ simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+ ) -> pd.DataFrame:
+ """
+ Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+ simulation on a monthly basis.
+
+ Args:
+ simulated_data: A dataframe with a datetime index and a single column of streamflow values
+ observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+ Returns:
+ pandas DataFrame with a datetime index and a single column of streamflow values
+ """
+ # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+ unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+ dates = []
+ values = []
+
+ for month in unique_simulation_months:
+ # filter historic data to only be current month
+ monthly_simulated = simulated_data[
+ simulated_data.index.month == int(month)
+ ].dropna()
+ to_prob = _flow_and_probability_mapper(
+ monthly_simulated, to_probability=True
+ )
+ # filter the observations to current month
+ monthly_observed = observed_data[
+ observed_data.index.month == int(month)
+ ].dropna()
+ to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+ dates += monthly_simulated.index.to_list()
+ value = to_flow(to_prob(monthly_simulated.values))
+ values += value.tolist()
+
+ corrected = pd.DataFrame(
+ data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+ )
+ corrected.sort_index(inplace=True)
+ return corrected
+
+
+ def bias_correct_forecast(
+ forecast_data: pd.DataFrame,
+ simulated_data: pd.DataFrame,
+ observed_data: pd.DataFrame,
+ use_month: int = 0,
+ ) -> pd.DataFrame:
+ """
+ Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+ to correct biases in the forecasted data
+
+ Args:
+ forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+ forecast_stats, forecast_ensembles, forecast_records
+ simulated_data: A dataframe with a datetime index and a single column of streamflow values
+ observed_data: A dataframe with a datetime index and a single column of streamflow values
+ use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+ you want to correct based on the ending month of the forecast data
+
+ Returns:
+ pandas DataFrame with a copy of forecasted data with values updated in each column
+ """
+ # make a copy of the forecasts which we update and return so the original data is not changed
+ forecast_copy = forecast_data.copy()
+
+ # make the flow and probability interpolation functions
+ monthly_simulated = simulated_data[
+ simulated_data.index.month == forecast_copy.index[use_month].month
+ ].dropna()
+ monthly_observed = observed_data[
+ observed_data.index.month == forecast_copy.index[use_month].month
+ ].dropna()
+ to_prob = _flow_and_probability_mapper(
+ monthly_simulated, to_probability=True, extrapolate=True
+ )
+ to_flow = _flow_and_probability_mapper(
+ monthly_observed, to_flow=True, extrapolate=True
+ )
+
+ # for each column of forecast data, make the interpolation function and update the dataframe
+ for column in forecast_copy.columns:
+ tmp = forecast_copy[column].dropna()
+ forecast_copy.update(
+ pd.DataFrame(
+ to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+ )
+ )
+
+ return forecast_copy
+
+
+ def _flow_and_probability_mapper(
+ monthly_data: pd.DataFrame,
+ to_probability: bool = False,
+ to_flow: bool = False,
+ extrapolate: bool = False,
+ ) -> interpolate.interp1d:
+ if not to_flow and not to_probability:
+ raise ValueError(
+ "You need to specify either to_probability or to_flow as True"
+ )
+
+ # get maximum value to bound histogram
+ max_val = math.ceil(np.max(monthly_data.max()))
+ min_val = math.floor(np.min(monthly_data.min()))
+
+ if max_val == min_val:
+ max_val += 0.1
+
+ # determine number of histograms bins needed
+ number_of_points = len(monthly_data.values)
+ number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+ # specify the bin width for histogram (in m3/s)
+ step_width = (max_val - min_val) / number_of_classes
+
+ # specify histogram bins
+ bins = np.arange(
+ -np.min(step_width),
+ max_val + 2 * np.min(step_width),
+ np.min(step_width),
+ )
+
+ if bins[0] == 0:
+ bins = np.concatenate((-bins[1], bins))
+ elif bins[0] > 0:
+ bins = np.concatenate((-bins[0], bins))
+
+ # make the histogram
+ counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+ # adjust the bins to be the center
+ bin_edges = bin_edges[1:]
+
+ # normalize the histograms
+ counts = counts.astype(float) / monthly_data.size
+
+ # calculate the cdfs
+ cdf = np.cumsum(counts)
+
+ # Identify indices where consecutive values are the same
+ duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+ # Adjust duplicate value to be an extrapolation of the previous value
+ for idx in duplicate_indices:
+ if idx > 0:
+ cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+ # interpolated function to convert simulated streamflow to prob
+ if to_probability:
+ if extrapolate:
+ func = interpolate.interp1d(
+ bin_edges, cdf, fill_value="extrapolate"
+ )
+ else:
+ func = interpolate.interp1d(bin_edges, cdf)
+ return lambda x: np.clip(func(x), 0, 1)
+ # interpolated function to convert simulated prob to observed streamflow
+ elif to_flow:
+ if extrapolate:
+ return interpolate.interp1d(
+ cdf, bin_edges, fill_value="extrapolate"
+ )
+ return interpolate.interp1d(cdf, bin_edges)
+
+
  if __name__ == "__main__":
  station_id = sys.argv[1]
  reach_id = sys.argv[2]
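
The helpers added above replace geoglows.bias.correct_forecast with a local quantile-mapping implementation: flows are mapped to cumulative probabilities on the simulated distribution, then back to flows on the observed distribution. A standalone, simplified sketch of that mapping on synthetic data (sorted empirical CDFs stand in for the monthly histograms used above; all names and values here are illustrative only, not part of the package):

    import numpy as np
    from scipy import interpolate

    rng = np.random.default_rng(42)
    simulated = np.sort(rng.gamma(2.0, 120.0, 1000))  # biased-high model flows
    observed = np.sort(rng.gamma(2.0, 100.0, 1000))   # gauge flows

    # Empirical CDFs: flow -> probability on the simulated series,
    # probability -> flow on the observed series.
    probs = np.linspace(0.0, 1.0, simulated.size)
    to_prob = interpolate.interp1d(simulated, probs, fill_value="extrapolate")
    to_flow = interpolate.interp1d(probs, observed, fill_value="extrapolate")

    raw_forecast = np.array([150.0, 300.0, 600.0])
    corrected = to_flow(np.clip(to_prob(raw_forecast), 0.0, 1.0))
    print(corrected)  # forecast values shifted onto the observed distribution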