loone-data-prep 0.1.6__tar.gz → 0.1.8__tar.gz

This diff compares the contents of two publicly released versions of the package as published to their public registry; it is provided for informational purposes only.
Files changed (43)
  1. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/PKG-INFO +2 -1
  2. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +2 -1
  3. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/LOONE_DATA_PREP.py +115 -2
  4. loone_data_prep-0.1.8/loone_data_prep/flow_data/S65E_total.py +89 -0
  5. loone_data_prep-0.1.8/loone_data_prep/flow_data/forecast_bias_correction.py +293 -0
  6. loone_data_prep-0.1.8/loone_data_prep/flow_data/get_inflows.py +159 -0
  7. loone_data_prep-0.1.8/loone_data_prep/flow_data/get_outflows.py +164 -0
  8. loone_data_prep-0.1.8/loone_data_prep/flow_data/hydro.py +155 -0
  9. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/utils.py +339 -62
  10. loone_data_prep-0.1.8/loone_data_prep/water_level_data/get_all.py +232 -0
  11. loone_data_prep-0.1.8/loone_data_prep/water_level_data/hydro.py +114 -0
  12. loone_data_prep-0.1.8/loone_data_prep/water_quality_data/get_inflows.py +127 -0
  13. loone_data_prep-0.1.8/loone_data_prep/water_quality_data/get_lake_wq.py +129 -0
  14. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/wq.py +44 -0
  15. loone_data_prep-0.1.8/loone_data_prep/weather_data/get_all.py +157 -0
  16. loone_data_prep-0.1.8/loone_data_prep/weather_data/weather.py +280 -0
  17. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/PKG-INFO +2 -1
  18. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/requires.txt +1 -0
  19. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/pyproject.toml +10 -5
  20. loone_data_prep-0.1.6/loone_data_prep/flow_data/S65E_total.py +0 -24
  21. loone_data_prep-0.1.6/loone_data_prep/flow_data/forecast_bias_correction.py +0 -108
  22. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_inflows.py +0 -70
  23. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_outflows.py +0 -80
  24. loone_data_prep-0.1.6/loone_data_prep/flow_data/hydro.py +0 -61
  25. loone_data_prep-0.1.6/loone_data_prep/water_level_data/get_all.py +0 -35
  26. loone_data_prep-0.1.6/loone_data_prep/water_level_data/hydro.py +0 -46
  27. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_inflows.py +0 -42
  28. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_lake_wq.py +0 -47
  29. loone_data_prep-0.1.6/loone_data_prep/weather_data/get_all.py +0 -34
  30. loone_data_prep-0.1.6/loone_data_prep/weather_data/weather.py +0 -122
  31. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/LICENSE +0 -0
  32. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/README.md +0 -0
  33. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/__init__.py +0 -0
  34. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/data_analyses_fns.py +0 -0
  35. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/__init__.py +0 -0
  36. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/flow_data/get_forecast_flows.py +0 -0
  37. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_level_data/__init__.py +0 -0
  38. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/water_quality_data/__init__.py +0 -0
  39. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep/weather_data/__init__.py +0 -0
  40. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/SOURCES.txt +0 -0
  41. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/dependency_links.txt +0 -0
  42. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/loone_data_prep.egg-info/top_level.txt +0 -0
  43. {loone_data_prep-0.1.6 → loone_data_prep-0.1.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: loone_data_prep
- Version: 0.1.6
+ Version: 0.1.8
  Summary: Prepare data to run the LOONE model.
  Author-email: Osama Tarabih <osamatarabih@usf.edu>
  Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -20,6 +20,7 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: rpy2
  Requires-Dist: retry
+ Requires-Dist: numpy<2
  Requires-Dist: pandas
  Requires-Dist: scipy
  Requires-Dist: geoglows==0.27.1
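
Note on the new `numpy<2` requirement: it constrains installs to the NumPy 1.x line, presumably because this release has not been validated against NumPy 2.0's API changes. A quick, hypothetical sanity check for a resolved environment:

    import numpy

    # loone_data_prep 0.1.8 declares numpy<2, so a correctly resolved
    # environment should report a 1.x version here.
    assert numpy.__version__.startswith("1."), numpy.__version__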
@@ -45,10 +45,11 @@ def main(input_dir: str, output_dir: str, ensemble_number: str) -> None: # , hi
  LO_Stage = pd.read_csv(f"{input_dir}/LO_Stage.csv")
  # Create Column (EOD Stg(ft, NGVD)) in File (SFWMM_Daily_Outputs)
  LO_Stage = DF_Date_Range(LO_Stage, M3_Yr, M3_M, M3_D, En_Yr, En_M, En_D)
+ LO_Stage.index = LO_Stage["date"]
  # Calculate average
  if "Average_Stage" not in LO_Stage.columns:
      LO_Stage = LO_Stage.loc[:, ~LO_Stage.columns.str.contains("^Unnamed")]
-     LO_Stage["Average_Stage"] = LO_Stage.mean(axis=1)
+     LO_Stage["Average_Stage"] = LO_Stage.drop(columns=['date']).mean(axis=1)
  LO_Stage.to_csv(f"{input_dir}/LO_Stage.csv", index=False)
  LO_Storage = stg2sto(f"{input_dir}/StgSto_data.csv", LO_Stage["Average_Stage"], 0)
  LO_SA = stg2ar(f"{input_dir}/Stgar_data.csv", LO_Stage["Average_Stage"], 0)
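
Why the stage fix above matters: once `date` is kept as a column, a bare `DataFrame.mean(axis=1)` tries to average it too, which fails with a TypeError on pandas >= 2.0 (older pandas silently skipped non-numeric columns). A minimal sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({
        "date": ["2023-01-01", "2023-01-02"],  # non-numeric column
        "S1": [12.1, 12.3],
        "S2": [12.0, 12.4],
    })

    # df.mean(axis=1) would choke on the string 'date' column;
    # dropping it first keeps the row-wise mean purely numeric.
    avg = df.drop(columns=["date"]).mean(axis=1)  # [12.05, 12.35]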
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
  LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
  LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
  LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
- LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()

  # RFVol acft
  # Create File (RF_Volume)
@@ -581,6 +580,18 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_OP_data_Inter['Mean_OP'] = LO_OP_data_Inter.mean(axis=1, numeric_only=True)
  LO_OP_data_Inter = DF_Date_Range(LO_OP_data_Inter, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
  LO_OP_data_Inter.to_csv(f'{output_dir}/LO_OP.csv', index=False)
+
+ # Create File (N_OP) (L001, L005, L008)
+ n_op = LO_OP_data_Inter[['date', 'Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter']]
+ n_op['OP'] = n_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+ n_op.drop(['Data_L001_OP_Inter', 'Data_L005_OP_Inter', 'Data_L008_OP_Inter'], axis=1, inplace=True)
+ n_op.to_csv(f'{output_dir}/N_OP.csv', index=False)
+
+ # Create File (S_OP) (L004, L006, L007, L008, and LZ40)
+ s_op = LO_OP_data_Inter[['date', 'Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter']]
+ s_op['OP'] = s_op.mean(axis=1, numeric_only=True) * 1000  # mg/L to mg/m3
+ s_op.drop(['Data_L004_OP_Inter', 'Data_L006_OP_Inter', 'Data_L007_OP_Inter', 'Data_L008_OP_Inter', 'Data_LZ40_OP_Inter'], axis=1, inplace=True)
+ s_op.to_csv(f'{output_dir}/S_OP.csv', index=False)

  # Interpolated NH4 Observations in Lake
  # Create File (LO_Avg_NH4)
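
A unit note on the N_OP/S_OP additions above: the `* 1000` factor converts mg/L to mg/m3, since 1 m3 = 1000 L. Also, because `n_op` and `s_op` start as column slices of `LO_OP_data_Inter`, assigning new columns to them can trigger pandas' SettingWithCopyWarning; a `.copy()` on the slice avoids that. A hypothetical equivalent of the northern block:

    import pandas as pd

    LO_OP_data_Inter = pd.DataFrame({
        "date": ["2023-01-01"],
        "Data_L001_OP_Inter": [0.05],  # mg/L
        "Data_L005_OP_Inter": [0.07],
        "Data_L008_OP_Inter": [0.06],
    })

    # .copy() makes the slice independent before new columns are added.
    n_op = LO_OP_data_Inter[["date", "Data_L001_OP_Inter",
                             "Data_L005_OP_Inter", "Data_L008_OP_Inter"]].copy()
    n_op["OP"] = n_op.mean(axis=1, numeric_only=True) * 1000  # 0.06 mg/L -> 60.0 mg/m3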
@@ -663,6 +674,22 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_DIN['NO'] = LO_NO_Clean_Inter['Mean_NO'].values
  LO_DIN['DIN_mg/m3'] = LO_DIN[['NH4', 'NO']].sum(axis=1)*1000
  LO_DIN.to_csv(f'{output_dir}/LO_DIN.csv', index=False)
+
+ # Create File (N_DIN) (L001, L005, L008)
+ n_din = pd.DataFrame(date_DIN, columns=['date'])
+ n_din.set_index('date', inplace=True)
+ n_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L001_NH4_Inter', 'Data_L005_NH4_Inter', 'Data_L008_NH4_Inter']].mean(axis=1, numeric_only=True)
+ n_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L001_NO_Inter', 'Data_L005_NO_Inter', 'Data_L008_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+ n_din['DIN'] = n_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+ n_din.to_csv(f'{output_dir}/N_DIN.csv')
+
+ # Create File (S_DIN) (L004, L006, L007, L008, LZ40)
+ s_din = pd.DataFrame(date_DIN, columns=['date'])
+ s_din.set_index('date', inplace=True)
+ s_din['NH4'] = LO_NH4_Clean_Inter[['date', 'Data_L004_NH4_Inter', 'Data_L006_NH4_Inter', 'Data_L007_NH4_Inter', 'Data_L008_NH4_Inter', 'Data_LZ40_NH4_Inter']].mean(axis=1, numeric_only=True)
+ s_din['NO'] = LO_NO_Clean_Inter[['date', 'Data_L004_NO_Inter', 'Data_L006_NO_Inter', 'Data_L007_NO_Inter', 'Data_L008_NO_Inter', 'Data_LZ40_NO_Inter']].mean(axis=1, numeric_only=True)*1000  # mg/L to mg/m3
+ s_din['DIN'] = s_din[['NH4', 'NO']].sum(axis=1)*1000  # mg/L to mg/m3
+ s_din.to_csv(f'{output_dir}/S_DIN.csv')

  # Interpolated DO Observations in Lake
  # Create File (LO_Avg_DO)
@@ -822,6 +849,93 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_Chla_Merge_Monthly_Inter = LO_Chla_Merge.resample('M').mean()
  LO_Chla_Merge_Monthly_Inter.to_csv(f'{output_dir}/LO_Chla_Merge_Monthly_Inter.csv')

+ # Create files (LO_Chla_Obs.csv, N_Merged_Chla.csv, and S_Merged_Chla.csv)
+ L001_Chla = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A, CORRECTED.csv')
+ L001_Chla.drop(columns=['days'], inplace=True)
+ L004_Chla = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A, CORRECTED.csv')
+ L004_Chla.drop(columns=['days'], inplace=True)
+ L005_Chla = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A, CORRECTED.csv')
+ L005_Chla.drop(columns=['days'], inplace=True)
+ L006_Chla = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A, CORRECTED.csv')
+ L006_Chla.drop(columns=['days'], inplace=True)
+ L007_Chla = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A, CORRECTED.csv')
+ L007_Chla.drop(columns=['days'], inplace=True)
+ L008_Chla = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A, CORRECTED.csv')
+ L008_Chla.drop(columns=['days'], inplace=True)
+ LZ40_Chla = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A, CORRECTED.csv')
+ LZ40_Chla.drop(columns=['days'], inplace=True)
+ L001_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L001_CHLOROPHYLL-A(LC).csv')
+ L001_Chla_LC.drop(columns=['days'], inplace=True)
+ L004_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L004_CHLOROPHYLL-A(LC).csv')
+ L004_Chla_LC.drop(columns=['days'], inplace=True)
+ L005_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L005_CHLOROPHYLL-A(LC).csv')
+ L005_Chla_LC.drop(columns=['days'], inplace=True)
+ L006_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L006_CHLOROPHYLL-A(LC).csv')
+ L006_Chla_LC.drop(columns=['days'], inplace=True)
+ L007_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L007_CHLOROPHYLL-A(LC).csv')
+ L007_Chla_LC.drop(columns=['days'], inplace=True)
+ L008_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_L008_CHLOROPHYLL-A(LC).csv')
+ L008_Chla_LC.drop(columns=['days'], inplace=True)
+ LZ40_Chla_LC = pd.read_csv(f'{input_dir}/water_quality_LZ40_CHLOROPHYLL-A(LC).csv')
+ LZ40_Chla_LC.drop(columns=['days'], inplace=True)
+
+ LO_Chla = pd.merge(L001_Chla, L004_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L005_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L006_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L007_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, L008_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = pd.merge(LO_Chla, LZ40_Chla, how='left', on='date')
+ LO_Chla = LO_Chla.loc[:, ~LO_Chla.columns.str.startswith('Unnamed')]
+ LO_Chla = LO_Chla.set_index('date')
+ LO_Chla['Mean_Chla'] = LO_Chla.mean(axis=1)
+ LO_Chla = LO_Chla.reset_index()
+ LO_Chla_N_cols = ['L001_CHLOROPHYLL-A, CORRECTED_ug/L', 'L005_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_North'] = LO_Chla[LO_Chla_N_cols].mean(axis=1)
+ LO_Chla_S_cols = ['L004_CHLOROPHYLL-A, CORRECTED_ug/L', 'L006_CHLOROPHYLL-A, CORRECTED_ug/L', 'L007_CHLOROPHYLL-A, CORRECTED_ug/L', 'L008_CHLOROPHYLL-A, CORRECTED_ug/L', 'LZ40_CHLOROPHYLL-A, CORRECTED_ug/L']
+ LO_Chla['Chla_South'] = LO_Chla[LO_Chla_S_cols].mean(axis=1)
+
+ LO_Chla_LC = pd.merge(L001_Chla_LC, L004_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L005_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L006_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L007_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, L008_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = pd.merge(LO_Chla_LC, LZ40_Chla_LC, how='left', on='date')
+ LO_Chla_LC = LO_Chla_LC.loc[:, ~LO_Chla_LC.columns.str.startswith('Unnamed')]
+ LO_Chla_LC = LO_Chla_LC.set_index('date')
+ LO_Chla_LC['Mean_Chla'] = LO_Chla_LC.mean(axis=1)
+ LO_Chla_LC = LO_Chla_LC.reset_index()
+ LO_Chla_LC_N_cols = ['L001_CHLOROPHYLL-A(LC)_ug/L', 'L005_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_North'] = LO_Chla_LC[LO_Chla_LC_N_cols].mean(axis=1)
+ LO_Chla_LC_S_cols = ['L004_CHLOROPHYLL-A(LC)_ug/L', 'L006_CHLOROPHYLL-A(LC)_ug/L', 'L007_CHLOROPHYLL-A(LC)_ug/L', 'L008_CHLOROPHYLL-A(LC)_ug/L', 'LZ40_CHLOROPHYLL-A(LC)_ug/L']
+ LO_Chla_LC['Chla_South'] = LO_Chla_LC[LO_Chla_LC_S_cols].mean(axis=1)
+
+ LO_Chla = DF_Date_Range(LO_Chla, 2008, 1, 1, 2010, 10, 19)
+ LO_Chla_df = pd.DataFrame(LO_Chla['date'], columns=['date'])
+ LO_Chla_df['Chla'] = LO_Chla['Mean_Chla']
+ LO_Chla_df['Chla_N'] = LO_Chla['Chla_North']
+ LO_Chla_df['Chla_S'] = LO_Chla['Chla_South']
+
+ LO_Chla_LC = DF_Date_Range(LO_Chla_LC, 2010, 10, 20, 2023, 6, 30)
+ LO_Chla_LC_df = pd.DataFrame(LO_Chla_LC['date'], columns=['date'])
+ LO_Chla_LC_df['Chla'] = LO_Chla_LC['Mean_Chla']
+ LO_Chla_LC_df['Chla_N'] = LO_Chla_LC['Chla_North']
+ LO_Chla_LC_df['Chla_S'] = LO_Chla_LC['Chla_South']
+
+ LO_Chla_Merge = pd.concat([LO_Chla_df, LO_Chla_LC_df]).reset_index(drop=True)
+ LO_Chla_Merge.to_csv(f'{output_dir}/LO_Chla_Obs.csv')
+ LO_Chla_Merge[['date', 'Chla_N']].rename(columns={'Chla_N': 'Chla'}).to_csv(f'{output_dir}/N_Merged_Chla.csv', index=False)
+ LO_Chla_Merge[['date', 'Chla_S']].rename(columns={'Chla_S': 'Chla'}).to_csv(f'{output_dir}/S_Merged_Chla.csv', index=False)
+
  # Create Files S65E_Avg_Chla
  S65E_Chla_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A, CORRECTED_Interpolated.csv')
  S65E_Chla_LC_Inter = pd.read_csv(f'{input_dir}/water_quality_S65E_CHLOROPHYLL-A(LC)_Interpolated.csv')
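
The station-merge pattern above repeats merge-then-strip-'Unnamed' seven times per dataset; for readers following along, the same result can be expressed once with a fold (a hypothetical refactor, not what the package ships):

    from functools import reduce
    import pandas as pd

    def merge_stations(frames):
        # Left-merge a list of per-station DataFrames on 'date',
        # then drop any 'Unnamed' index-artifact columns.
        merged = reduce(lambda a, b: pd.merge(a, b, how='left', on='date'), frames)
        return merged.loc[:, ~merged.columns.str.startswith('Unnamed')]

    # e.g. merge_stations([L001_Chla, L004_Chla, L005_Chla, L006_Chla,
    #                      L007_Chla, L008_Chla, LZ40_Chla])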
@@ -897,7 +1011,6 @@ def main(input_dir: str, output_dir: str) -> None:
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
- LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()
  # Write S65 TP concentrations (mg/L)
  S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
  # TP External Loads 3 Months Lag (mg)
@@ -0,0 +1,89 @@
+ import sys
+ from retry import retry
+ from rpy2.robjects import r
+ from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
+
+
+ @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+ def get(
+     workspace,
+     date_min: str = "1972-01-01",
+     date_max: str = "2023-06-30"
+ ) -> None:
+     r(
+         f"""
+         # Load the required libraries
+         library(dbhydroR)
+         library(dplyr)
+
+         # Helper Functions
+         retrieve_data <- function(dbkey, date_min, date_max)
+         {{
+             # Get the data from dbhydro
+             df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+             # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+             colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+             # Add a type and units column to data so it can be cleaned using the clean_hydro function
+             df$type <- "FLOW"
+             df$units <- "cfs"
+
+             # Clean the data.frame
+             df <- clean_hydro(df)
+
+             # Drop the " _FLOW_cfs" column
+             df <- df %>% select(-` _FLOW_cfs`)
+
+             # Convert Flow rate from cfs to m³/day
+             df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+             # Return resulting data.frame
+             return(df)
+         }}
+
+         # S65E_S
+         S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Wait five seconds before next request to avoid "too many requests" error
+         Sys.sleep(5)
+
+         # S65EX1_S
+         S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Merge the data from each dbkey
+         result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+         # Write the data to a file
+         write.csv(result, file = '{workspace}/S65E_total.csv')
+         """
+     )
+
+     _reformat_s65e_total_file(workspace)
+
+ def _reformat_s65e_total_file(workspace: str):
+     # Read in the data
+     df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+     # Drop unused columns
+     df.drop('Unnamed: 0', axis=1, inplace=True)
+
+     # Convert date column to datetime
+     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+     # Sort the data by date
+     df.sort_values('date', inplace=True)
+
+     # Renumber the index
+     df.reset_index(drop=True, inplace=True)
+
+     # Drop rows that are missing all their values
+     df.dropna(how='all', inplace=True)
+
+     # Write the updated data back to the file
+     df.to_csv(f"{workspace}/S65E_total.csv")
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     get(workspace)
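
For context, the rewritten S65E_total module can be driven from Python as well as the command line; a hypothetical invocation (it requires an R installation with the dbhydroR and dplyr packages available to rpy2):

    from loone_data_prep.flow_data import S65E_total

    # Pulls the S65E_S (dbkey 91656) and S65EX1_S (dbkey AL760) flows from
    # DBHYDRO, converts cfs to m3/day, and writes <workspace>/S65E_total.csv.
    S65E_total.get("/tmp/workspace", date_min="2020-01-01", date_max="2020-12-31")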
@@ -0,0 +1,293 @@
+ import sys
+ import os
+ import math
+ import numpy as np
+ import pandas as pd
+ import geoglows
+ from scipy import interpolate
+
+
+ SECONDS_IN_DAY = 86400
+
+
+ def get_bias_corrected_data(
+     station_id: str,
+     reach_id: str,
+     observed_data_path: str,
+     station_ensembles: pd.DataFrame,
+     station_stats: pd.DataFrame,
+     cache_path: str = None,
+ ) -> dict:
+     # Load the observed data from a CSV file
+     observed_data = pd.read_csv(
+         observed_data_path,
+         index_col=0,
+         usecols=["date", f"{station_id}_FLOW_cmd"],
+     )
+     # Convert the index to datetime and localize it to UTC
+     observed_data.index = pd.to_datetime(observed_data.index).tz_localize(
+         "UTC"
+     )
+     # Transform the data by dividing it by the number of seconds in a day
+     observed_data = observed_data.transform(lambda x: x / SECONDS_IN_DAY)
+     # Rename the value column to "Streamflow (m3/s)"
+     observed_data.rename(
+         columns={f"{station_id}_FLOW_cmd": "Streamflow (m3/s)"}, inplace=True
+     )
+
+     # Prepare the observed data by filling NaN values with the 10yr average
+     prepared_od = prep_observed_data(observed_data)
+
+     # Get the historical simulation data for the given reach ID
+     historical_data = None
+
+     if cache_path is None:
+         historical_data = geoglows.streamflow.historic_simulation(reach_id)
+     else:
+         # Create the geoglows cache directory if it doesn't exist
+         geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+         if not os.path.exists(geoglows_cache_path):
+             os.makedirs(geoglows_cache_path)
+
+         # Check if the historical simulation data is already cached
+         if os.path.exists(
+             os.path.join(
+                 geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+             )
+         ):
+             historical_data = pd.read_csv(
+                 os.path.join(
+                     geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                 ),
+                 index_col=0,
+             )
+             historical_data.index = pd.to_datetime(historical_data.index)
+         else:
+             historical_data = geoglows.streamflow.historic_simulation(reach_id)
+             historical_data.to_csv(
+                 os.path.join(
+                     geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+                 )
+             )
+
+     # Correct the forecast bias in the station ensembles
+     station_ensembles = bias_correct_forecast(
+         station_ensembles, historical_data, prepared_od
+     )
+     # Correct the forecast bias in the station stats
+     station_stats = bias_correct_forecast(
+         station_stats, historical_data, prepared_od
+     )
+
+     # Return the bias-corrected station ensembles and station stats
+     return station_ensembles, station_stats
+
+
+ def prep_observed_data(observed_data: pd.DataFrame) -> pd.DataFrame:
+     # Group the data by month and day
+     grouped_data = observed_data.groupby(
+         [observed_data.index.month, observed_data.index.day]
+     )
+
+     # Calculate the rolling average of 'Streamflow (m3/s)' for each group
+     daily_10yr_avg = (
+         grouped_data["Streamflow (m3/s)"]
+         .rolling(window=10, min_periods=1, center=True)
+         .mean()
+     )
+
+     # Reset the multi-index of daily_10yr_avg and sort it by index
+     fill_val = daily_10yr_avg.reset_index(level=[0, 1], drop=True).sort_index()
+
+     # Fill NaN in 'Streamflow (m3/s)' with corresponding values from fill_val
+     observed_data["Streamflow (m3/s)"] = observed_data[
+         "Streamflow (m3/s)"
+     ].fillna(fill_val)
+
+     # Return the modified observed_data DataFrame
+     return observed_data
+
+
+ def bias_correct_historical(
+     simulated_data: pd.DataFrame, observed_data: pd.DataFrame
+ ) -> pd.DataFrame:
+     """
+     Accepts a historically simulated flow timeseries and observed flow timeseries and attempts to correct biases in the
+     simulation on a monthly basis.
+
+     Args:
+         simulated_data: A dataframe with a datetime index and a single column of streamflow values
+         observed_data: A dataframe with a datetime index and a single column of streamflow values
+
+     Returns:
+         pandas DataFrame with a datetime index and a single column of streamflow values
+     """
+     # list of the unique months in the historical simulation. should always be 1->12 but just in case...
+     unique_simulation_months = sorted(set(simulated_data.index.strftime("%m")))
+     dates = []
+     values = []
+
+     for month in unique_simulation_months:
+         # filter historic data to only be current month
+         monthly_simulated = simulated_data[
+             simulated_data.index.month == int(month)
+         ].dropna()
+         to_prob = _flow_and_probability_mapper(
+             monthly_simulated, to_probability=True
+         )
+         # filter the observations to current month
+         monthly_observed = observed_data[
+             observed_data.index.month == int(month)
+         ].dropna()
+         to_flow = _flow_and_probability_mapper(monthly_observed, to_flow=True)
+
+         dates += monthly_simulated.index.to_list()
+         value = to_flow(to_prob(monthly_simulated.values))
+         values += value.tolist()
+
+     corrected = pd.DataFrame(
+         data=values, index=dates, columns=["Corrected Simulated Streamflow"]
+     )
+     corrected.sort_index(inplace=True)
+     return corrected
+
+
+ def bias_correct_forecast(
+     forecast_data: pd.DataFrame,
+     simulated_data: pd.DataFrame,
+     observed_data: pd.DataFrame,
+     use_month: int = 0,
+ ) -> pd.DataFrame:
+     """
+     Accepts a short term forecast of streamflow, simulated historical flow, and observed flow timeseries and attempts
+     to correct biases in the forecasted data
+
+     Args:
+         forecast_data: A dataframe with a datetime index and any number of columns of forecasted flow. Compatible with
+             forecast_stats, forecast_ensembles, forecast_records
+         simulated_data: A dataframe with a datetime index and a single column of streamflow values
+         observed_data: A dataframe with a datetime index and a single column of streamflow values
+         use_month: Optional: either 0 for correct the forecast based on the first month of the forecast data or -1 if
+             you want to correct based on the ending month of the forecast data
+
+     Returns:
+         pandas DataFrame with a copy of forecasted data with values updated in each column
+     """
+     # make a copy of the forecasts which we update and return so the original data is not changed
+     forecast_copy = forecast_data.copy()
+
+     # make the flow and probability interpolation functions
+     monthly_simulated = simulated_data[
+         simulated_data.index.month == forecast_copy.index[use_month].month
+     ].dropna()
+     monthly_observed = observed_data[
+         observed_data.index.month == forecast_copy.index[use_month].month
+     ].dropna()
+     to_prob = _flow_and_probability_mapper(
+         monthly_simulated, to_probability=True, extrapolate=True
+     )
+     to_flow = _flow_and_probability_mapper(
+         monthly_observed, to_flow=True, extrapolate=True
+     )
+
+     # for each column of forecast data, make the interpolation function and update the dataframe
+     for column in forecast_copy.columns:
+         tmp = forecast_copy[column].dropna()
+         forecast_copy.update(
+             pd.DataFrame(
+                 to_flow(to_prob(tmp.values)), index=tmp.index, columns=[column]
+             )
+         )
+
+     return forecast_copy
+
+
+ def _flow_and_probability_mapper(
+     monthly_data: pd.DataFrame,
+     to_probability: bool = False,
+     to_flow: bool = False,
+     extrapolate: bool = False,
+ ) -> interpolate.interp1d:
+     if not to_flow and not to_probability:
+         raise ValueError(
+             "You need to specify either to_probability or to_flow as True"
+         )
+
+     # get maximum value to bound histogram
+     max_val = math.ceil(np.max(monthly_data.max()))
+     min_val = math.floor(np.min(monthly_data.min()))
+
+     if max_val == min_val:
+         max_val += 0.1
+
+     # determine number of histograms bins needed
+     number_of_points = len(monthly_data.values)
+     number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))
+
+     # specify the bin width for histogram (in m3/s)
+     step_width = (max_val - min_val) / number_of_classes
+
+     # specify histogram bins
+     bins = np.arange(
+         -np.min(step_width),
+         max_val + 2 * np.min(step_width),
+         np.min(step_width),
+     )
+
+     if bins[0] == 0:
+         bins = np.concatenate((-bins[1], bins))
+     elif bins[0] > 0:
+         bins = np.concatenate((-bins[0], bins))
+
+     # make the histogram
+     counts, bin_edges = np.histogram(monthly_data, bins=bins)
+
+     # adjust the bins to be the center
+     bin_edges = bin_edges[1:]
+
+     # normalize the histograms
+     counts = counts.astype(float) / monthly_data.size
+
+     # calculate the cdfs
+     cdf = np.cumsum(counts)
+
+     # Identify indices where consecutive values are the same
+     duplicate_indices = np.where(np.diff(cdf) == 0)[0]
+
+     # Adjust duplicate value to be an extrapolation of the previous value
+     for idx in duplicate_indices:
+         if idx > 0:
+             cdf[idx] = cdf[idx - 1] + (cdf[idx + 1] - cdf[idx - 1]) / 2
+
+     # interpolated function to convert simulated streamflow to prob
+     if to_probability:
+         if extrapolate:
+             func = interpolate.interp1d(
+                 bin_edges, cdf, fill_value="extrapolate"
+             )
+         else:
+             func = interpolate.interp1d(bin_edges, cdf)
+         return lambda x: np.clip(func(x), 0, 1)
+     # interpolated function to convert simulated prob to observed streamflow
+     elif to_flow:
+         if extrapolate:
+             return interpolate.interp1d(
+                 cdf, bin_edges, fill_value="extrapolate"
+             )
+         return interpolate.interp1d(cdf, bin_edges)
+
+
+ if __name__ == "__main__":
+     station_id = sys.argv[1]
+     reach_id = sys.argv[2]
+     observed_data_path = sys.argv[3].rstrip("/")
+     station_ensembles = sys.argv[4]
+     station_stats = sys.argv[5]
+
+     get_bias_corrected_data(
+         station_id,
+         reach_id,
+         observed_data_path,
+         station_ensembles,
+         station_stats,
+     )
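
Two notes on the new bias-correction module for readers skimming the diff. `_flow_and_probability_mapper` sizes its histogram with Sturges' rule, k = ceil(1 + 3.322 * log10(n)), so n = 1000 samples gives k = 11 bins, and the correction itself is standard quantile mapping: simulated flow is mapped to its simulated-CDF probability, then to the observed flow at that same probability. Also note that `get_bias_corrected_data` is annotated `-> dict` but actually returns a 2-tuple. A hypothetical end-to-end call (station, reach, and paths are placeholders; the observed CSV needs a `date` index and a `{station_id}_FLOW_cmd` column):

    import geoglows
    from loone_data_prep.flow_data import forecast_bias_correction as fbc

    # Forecast data fetched via the pinned geoglows 0.27.x streamflow API.
    ensembles = geoglows.streamflow.forecast_ensembles(750059975)
    stats = geoglows.streamflow.forecast_stats(750059975)

    corrected_ens, corrected_stats = fbc.get_bias_corrected_data(
        station_id="S65E",                 # column prefix in the observed CSV
        reach_id=750059975,
        observed_data_path="/data/S65E_flows.csv",
        station_ensembles=ensembles,
        station_stats=stats,
        cache_path="/tmp",                 # caches the historic simulation CSV
    )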