loone-data-prep 0.1.9__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +252 -228
- loone_data_prep/LOONE_DATA_PREP.py +34 -17
- loone_data_prep/flow_data/forecast_bias_correction.py +52 -34
- loone_data_prep/flow_data/get_forecast_flows.py +131 -88
- loone_data_prep/forecast_scripts/create_forecast_LOWs.py +127 -0
- loone_data_prep/forecast_scripts/forecast_stages.py +40 -0
- loone_data_prep/forecast_scripts/predict_PI.py +51 -0
- loone_data_prep/forecast_scripts/trib_cond.py +84 -0
- loone_data_prep/forecast_scripts/weather_forecast.py +155 -0
- loone_data_prep/utils.py +52 -19
- {loone_data_prep-0.1.9.dist-info → loone_data_prep-1.1.1.dist-info}/METADATA +9 -4
- {loone_data_prep-0.1.9.dist-info → loone_data_prep-1.1.1.dist-info}/RECORD +15 -10
- {loone_data_prep-0.1.9.dist-info → loone_data_prep-1.1.1.dist-info}/WHEEL +1 -1
- {loone_data_prep-0.1.9.dist-info → loone_data_prep-1.1.1.dist-info/licenses}/LICENSE +0 -0
- {loone_data_prep-0.1.9.dist-info → loone_data_prep-1.1.1.dist-info}/top_level.txt +0 -0
@@ -545,7 +545,7 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_TP_data_Inter['Mean_TP'] = LO_TP_data_Inter.mean(axis=1, numeric_only=True)
  LO_TP_data_Inter = LO_TP_data_Inter.set_index(['date'])
  LO_TP_data_Inter.index = pd.to_datetime(LO_TP_data_Inter.index, unit='ns')
- LO_TP_Monthly_Inter = LO_TP_data_Inter.resample('
+ LO_TP_Monthly_Inter = LO_TP_data_Inter.resample('ME').mean()
  Max = LO_TP_Monthly_Inter.max(axis=1)
  Min = LO_TP_Monthly_Inter.min(axis=1)
  LO_TP_Monthly_Inter['Max'] = Max.values
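The only change in this hunk (and the next one) is the resample alias: newer pandas releases deprecate the month-end alias 'M' in favor of 'ME'. A minimal sketch of the same monthly-mean pattern, assuming pandas >= 2.2 and an illustrative frame (the column name here is made up, not from the package):

```python
import pandas as pd

# Illustrative daily data; the column name is hypothetical.
daily = pd.DataFrame(
    {"TP": [0.10, 0.20, 0.30]},
    index=pd.to_datetime(["2023-01-30", "2023-01-31", "2023-02-01"]),
)

# 'ME' (month end) replaces the deprecated 'M' alias in pandas >= 2.2.
monthly_mean = daily.resample("ME").mean()
print(monthly_mean)
```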
@@ -624,7 +624,7 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_NH4_Clean_Inter.to_csv(f'{output_dir}/LO_NH4_Clean_daily.csv', index=False)
  LO_NH4_Clean_Inter = LO_NH4_Clean_Inter.set_index(['date'])
  LO_NH4_Clean_Inter.index = pd.to_datetime(LO_NH4_Clean_Inter.index, unit='ns')
- LO_NH4_Monthly_Inter = LO_NH4_Clean_Inter.resample('
+ LO_NH4_Monthly_Inter = LO_NH4_Clean_Inter.resample('ME').mean()
  LO_NH4_Monthly_Inter.to_csv(f'{output_dir}/LO_NH4_Monthly_Inter.csv')

  # Interpolated NO Observations in Lake
@@ -967,6 +967,7 @@ def main(input_dir: str, output_dir: str) -> None:
  NO_list = {'S65_NO': S65_NO, 'S71_NO': S71_NO, 'S72_NO': S72_NO, 'S84_NO': S84_NO, 'S127_NO': S127_NO,
  'S133_NO': S133_NO, 'S154_NO': S154_NO, 'S191_NO': S191_NO, 'S308_NO': S308_NO,
  'FISHP_NO': FISHP_NO, 'L8_NO': L8_NO, 'S4_NO': S4_NO}
+ #TODO: Why is this date hard coded into this part?
  date_NO = pd.date_range(start='1/1/2008', end='3/31/2023', freq='D')

  NO_df = pd.DataFrame(date_NO, columns=['date'])
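The added TODO flags that date_NO stays hard coded even though the surrounding code already filters with DF_Date_Range(..., St_Yr, St_M, St_D, En_Yr, En_M, En_D). A hedged sketch of deriving the range from those same variables instead; the variable names come from the diff, but treating them as plain integers here is an assumption:

```python
import pandas as pd

# Example values only; in the script these come from the configured start/end settings.
St_Yr, St_M, St_D = 2008, 1, 1
En_Yr, En_M, En_D = 2023, 3, 31

# Build the daily range from the configured bounds instead of hard-coded strings.
date_NO = pd.date_range(
    start=pd.Timestamp(St_Yr, St_M, St_D),
    end=pd.Timestamp(En_Yr, En_M, En_D),
    freq="D",
)
NO_df = pd.DataFrame(date_NO, columns=["date"])
```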
@@ -982,32 +983,48 @@ def main(input_dir: str, output_dir: str) -> None:
  Flow_df = DF_Date_Range(Flow_df, St_Yr, St_M, St_D, En_Yr, En_M, En_D)

  # Determine NO Loads
-
-
-
-
-
-
-
+ # Ensure 'date' is datetime
+ NO_df['date'] = pd.to_datetime(NO_df['date'])
+ Flow_df['date'] = pd.to_datetime(Flow_df['date'])
+
+ # Merge the two dataframes on date - this will ensure that the dates match
+ merged = pd.merge(NO_df, Flow_df, on='date', how='inner')
+
+ # Compute NO Loads
+ NO_Loads_In = merged[['date']].copy()
+ NO_Loads_In['S65_NO_Ld'] = merged['S65_Q'] * merged['S65_NO'] * 1000
+ NO_Loads_In['S71_NO_Ld'] = merged['S71_Q'] * merged['S71_NO'] * 1000
+ NO_Loads_In['S71_NO_Ld'] = merged['S71_Q'] * merged['S71_NO'] * 1000
+ NO_Loads_In['S72_NO_Ld'] = merged['S72_Q'] * merged['S72_NO'] * 1000
+ NO_Loads_In['S84_NO_Ld'] = merged['S84_Q'] * merged['S84_NO'] * 1000
+ NO_Loads_In['S127_NO_Ld'] = merged['S127_In'] * merged['S127_NO'] * 1000
+ NO_Loads_In['S133_NO_Ld'] = merged['S133_P_Q'] * merged['S133_NO'] * 1000
  # NO_Loads_In['S135_NO_Ld'] = Flow_df['S135_In'].values * NO_df['S135_NO'].values * 1000
- NO_Loads_In['S154_NO_Ld'] =
- NO_Loads_In['S191_NO_Ld'] =
- NO_Loads_In['S308_NO_Ld'] =
- NO_Loads_In['FISHP_NO_Ld'] =
- NO_Loads_In['L8_NO_Ld'] =
- NO_Loads_In['S4_NO_Ld'] =
+ NO_Loads_In['S154_NO_Ld'] = merged['S154_Q'] * merged['S154_NO'] * 1000
+ NO_Loads_In['S191_NO_Ld'] = merged['S191_Q'] * merged['S191_NO'] * 1000
+ NO_Loads_In['S308_NO_Ld'] = merged['S308_In'] * merged['S308_NO'] * 1000
+ NO_Loads_In['FISHP_NO_Ld'] = merged['FISHP_Q'] * merged['FISHP_NO'] * 1000
+ NO_Loads_In['L8_NO_Ld'] = merged['L8_In'] * merged['L8_NO'] * 1000
+ NO_Loads_In['S4_NO_Ld'] = merged['S4_P_Q'] * merged['S4_NO'] * 1000
  # Calculate the total External Loads to Lake Okeechobee
  NO_Loads_In['External_NO_Ld_mg'] = NO_Loads_In.sum(axis=1, numeric_only=True)
  NO_Loads_In.to_csv(f'{output_dir}/LO_External_Loadings_NO.csv', index=False)

  # Determine Chla Loads
  # Create File (Chla_Loads_In)
+ # Read and date-filter Chla data
  S65E_Chla = pd.read_csv(f'{output_dir}/S65E_Chla_Merged.csv')
+ S65E_Chla['date'] = pd.to_datetime(S65E_Chla['date']) # Ensure date column is datetime
  S65E_Chla = DF_Date_Range(S65E_Chla, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
-
-
+ # Merge on date
+ merged = pd.merge(Flow_df[['date', 'Inflows']], S65E_Chla[['date', 'Data']], on='date', how='inner')
+ # Calculate Chlorophyll-a loads
+ merged['Chla_Loads'] = merged['Inflows'] * merged['Data']
+ # Save results
+ Chla_Loads_In = merged[['date', 'Chla_Loads']]
  Chla_Loads_In.to_csv(f'{output_dir}/Chla_Loads_In.csv', index=False)

+
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
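For context on the load arithmetic above: if the flow columns (*_Q, *_In) are in m^3/day and the nutrient columns are concentrations in mg/L, the factor of 1000 is the litres-per-m^3 conversion and each product is a load in mg/day. That unit reading is inferred from the column names, not stated in the diff; a tiny sketch with made-up numbers:

```python
# Hypothetical one-day example of the merge-and-multiply load pattern.
flow_m3_per_day = 2_500_000.0    # structure discharge, assumed to be in m^3/day
concentration_mg_per_L = 0.08    # nutrient concentration, assumed to be in mg/L

# 1 m^3 = 1000 L, so the product comes out in mg/day.
load_mg_per_day = flow_m3_per_day * concentration_mg_per_L * 1000
print(f"{load_mg_per_day:.0f} mg/day")
```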
@@ -37,47 +37,65 @@ def get_bias_corrected_data(

  # Prepare the observed data by filling NaN values with the 10yr average
  prepared_od = prep_observed_data(observed_data)
-
- # Get the historical simulation data for the given reach ID
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ historical_data = geoglows.data.retro_daily(reach_id)
+ # Get the historical simulation data for the given reach ID - TODO: Do we for sure want to cache the historical data?
+ # I am reading the observed data that we queried earlier instead of caching it
+ # historical_data = None
+
+ # if cache_path is None:
+ # historical_data = geoglows.streamflow.historic_simulation(reach_id)
+ # else:
+ # # Create the geoglows cache directory if it doesn't exist
+ # geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+ # if not os.path.exists(geoglows_cache_path):
+ # os.makedirs(geoglows_cache_path)
+
+ # # Check if the historical simulation data is already cached
+ # if os.path.exists(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # )
+ # ):
+ # historical_data = pd.read_csv(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # ),
+ # index_col=0,
+ # )
+ # historical_data.index = pd.to_datetime(historical_data.index)
+ # else:
+ # historical_data = geoglows.streamflow.historic_simulation(reach_id)
+ # historical_data.to_csv(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # )
+ # )
+ # Drop 'ensemble_52' column if it exists - not necessary but we don't need it
+ station_ensembles.drop(columns=['ensemble_52'], inplace=True, errors='ignore')
+
+ # Drop all rows with any NaN values - again not necessary but we can drop them because we don't need it
+ station_ensembles.dropna(inplace=True)

  # Correct the forecast bias in the station ensembles
- station_ensembles =
+ station_ensembles = geoglows.bias.correct_forecast(
  station_ensembles, historical_data, prepared_od
  )
+
  # Correct the forecast bias in the station stats
- station_stats =
+ station_stats = geoglows.bias.correct_forecast(
  station_stats, historical_data, prepared_od
  )
+ #This is to clean out any infinite values that may have occurred during bias correction
+ station_ensembles = station_ensembles.replace([np.inf, -np.inf], np.nan)
+ station_ensembles = station_ensembles.interpolate(axis=0, limit_direction='both')
+
+ # Fill any remaining NaNs (e.g., at column ends)
+ station_ensembles = station_ensembles.ffill(axis=0).bfill(axis=0)
+ station_stats = station_stats.replace([np.inf, -np.inf], np.nan)
+ station_stats = station_stats.interpolate(axis=0, limit_direction='both')
+
+ # Fill any remaining NaNs (e.g., at column ends)
+ station_stats = station_stats.ffill(axis=0).bfill(axis=0)

  # Return the bias-corrected station ensembles and station stats
  return station_ensembles, station_stats
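The cleanup added at the end of this hunk (replace +/-inf, interpolate, then forward/back fill) is written out twice, once per DataFrame. A minimal sketch of the same steps factored into a helper; the function name is hypothetical, not part of the package:

```python
import numpy as np
import pandas as pd


def clean_bias_corrected(df: pd.DataFrame) -> pd.DataFrame:
    """Replace infinities left by bias correction and fill the resulting gaps."""
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.interpolate(axis=0, limit_direction="both")
    # Catch anything interpolation could not reach (e.g. all-NaN column ends).
    return df.ffill(axis=0).bfill(axis=0)
```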
@@ -1,6 +1,5 @@
  import os
  import sys
- import glob
  import pandas as pd
  import rpy2.robjects as ro
  from rpy2.robjects import pandas2ri
@@ -44,37 +43,16 @@ STATION_IDS = [
  "S49_S",
  ] # Added these stations. They seemed to be missing.

-
-
-
-
-
-
-
-
-
-
- "L8.441": 13082747,
- "S133_P": 13082709,
- "S127_C": 13082716,
- "S127_P": 13082716,
- "S129_C": 13082727,
- "S135_C": 13082725,
- "S2_P": 13082783,
- "S3_P": 13082809,
- "S4_P": 13082806,
- "S351_S": 13082804,
- "S352_S": 13082762,
- "S354_S": 13082809,
- "S129 PMP_P": 13082727,
- "S135 PMP_P": 13082725,
- "S77_S": 13082767,
- "INDUST": 13082806,
- "S79_S": 13082791,
- "S80_S": 13082718,
- "S40_S": 13082797,
- "S49_S": 13082696,
- }
+ INFLOW_IDS = [
+ 750059718, 750043742, 750035446, 750034865, 750055574, 750053211,
+ 750050248, 750065049, 750064453, 750049661, 750069195, 750051436,
+ 750068005, 750063868, 750069782, 750072741
+ ]
+ OUTFLOW_IDS = [750053809, 750057949]
+ MATCHED_IDS = [750052624, 750049656, 750057357,
+ 750038427, 750051428, 750068601, 750058536, 750038416,
+ 750050259, 750045514, 750053213, 750028935]
+

  SECONDS_IN_HOUR = 3600
  SECONDS_IN_DAY = 86400
@@ -140,7 +118,7 @@ def get_reach_id(latitude: float, longitude: float):
  Returns:
  (int): The reach id of the given latitude/longitude
  """
- reach_data = geoglows.
+ reach_data = geoglows.streams.latlon_to_reach(latitude, longitude)

  if "error" in reach_data:
  raise Exception(reach_data["error"])
@@ -159,8 +137,8 @@ def get_flow_forecast_ensembles(reach_id: str, forecast_date: str):
  Returns:
  (pandas.core.frame.DataFrame): The 52 ensemble flow forecasts.
  """
- return geoglows.
-
+ return geoglows.data.forecast_ensembles(
+ river_id=reach_id, date=forecast_date
  )

@@ -176,14 +154,15 @@ def get_flow_forecast_stats(reach_id: str, forecast_date: str):
  Returns:
  (pandas.core.frame.DataFrame): The forecast stats
  """
- return geoglows.
-
+ return geoglows.data.forecast_stats(
+ river_id=reach_id, date=forecast_date
  )


  def ensembles_to_csv(
  workspace: str,
-
+ flow_type: str,
+ reach_id: str,
  ensembles: pd.core.frame.DataFrame,
  stats: pd.core.frame.DataFrame,
  ):
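These two hunks move the forecast fetches to the geoglows.data interface and add flow_type/reach_id parameters to ensembles_to_csv. A hedged usage sketch of the new call shapes; the keyword names are taken from the diff, while the example river id and the date format are assumptions to verify against the geoglows documentation:

```python
import geoglows

river_id = 750072741        # example COMID taken from the diff's INFLOW_IDS
forecast_date = "20240101"  # assumed date format; confirm against the geoglows docs

ensembles = geoglows.data.forecast_ensembles(river_id=river_id, date=forecast_date)
stats = geoglows.data.forecast_stats(river_id=river_id, date=forecast_date)
```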
@@ -202,7 +181,7 @@ def ensembles_to_csv(
  data.
  """
  # Get the path to the file that will be written
- file_name = f"{
+ file_name = f"{reach_id}_{flow_type}_cmd_geoglows.csv"
  file_path = os.path.join(workspace, file_name)

  # Format DataFrames for LOONE
@@ -234,8 +213,8 @@ def _format_ensembles_DataFrame(dataframe: pd.core.frame.DataFrame):
  DataFrame.
  """
  # Remove high resolution columns (ensemble 52)
- if "
- dataframe.drop(columns="
+ if "ensemble_52" in dataframe.columns:
+ dataframe.drop(columns="ensemble_52", inplace=True)

  # Remove rows with null values
  dataframe.dropna(axis="index", inplace=True)
@@ -284,8 +263,8 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  DataFrame.
  """
  # Remove high resolution columns (ensemble 52, high_res_m^3/s)
- if "
- dataframe.drop(columns="
+ if "high_res" in dataframe.columns:
+ dataframe.drop(columns="high_res", inplace=True)

  # Remove rows with null values
  dataframe.dropna(axis="index", inplace=True)
@@ -300,28 +279,28 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  dataframe.clip(0, inplace=True)

  # Max Column (Max)
- column_max = dataframe[["
+ column_max = dataframe[["flow_max"]].copy()
  column_max = column_max.groupby([column_max.index]).max()

  # 75th Percentile Column (Average)
- column_75percentile = dataframe[["
+ column_75percentile = dataframe[["flow_75p"]].copy()
  column_75percentile = column_75percentile.groupby(
  [column_75percentile.index]
  ).mean()

  # Average Column (Weighted Average)
- column_average = dataframe[["
+ column_average = dataframe[["flow_avg"]].copy()
  column_average.transform(lambda x: x / 8)
  column_average = column_average.groupby([column_average.index]).sum()

  # 25th Percentile Column (Average)
- column_25percentile = dataframe[["
+ column_25percentile = dataframe[["flow_25p"]].copy()
  column_25percentile = column_25percentile.groupby(
  [column_25percentile.index]
  ).mean()

  # Min Column (Min)
- column_min = dataframe[["
+ column_min = dataframe[["flow_min"]].copy()
  column_min = column_min.groupby([column_min.index]).min()

  # Convert values in each column from m^3/h to m^3/d
@@ -338,17 +317,17 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  # Append modified columns into one pandas DataFrame
  dataframe_result = pd.DataFrame()
  dataframe_result.index = dataframe.groupby([dataframe.index]).mean().index
- dataframe_result["flow_max_m^3/d"] = column_max["
+ dataframe_result["flow_max_m^3/d"] = column_max["flow_max"].tolist()
  dataframe_result["flow_75%_m^3/d"] = column_75percentile[
- "
+ "flow_75p"
  ].tolist()
  dataframe_result["flow_avg_m^3/d"] = column_average[
- "
+ "flow_avg"
  ].tolist()
  dataframe_result["flow_25%_m^3/d"] = column_25percentile[
- "
+ "flow_25p"
  ].tolist()
- dataframe_result["flow_min_m^3/d"] = column_min["
+ dataframe_result["flow_min_m^3/d"] = column_min["flow_min"].tolist()

  # Format datetimes to just dates
  dataframe_result.index = dataframe_result.index.strftime("%Y-%m-%d")
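A note on the flow_avg handling above: the diff's comment calls it a weighted average, and dividing each value by 8 before a per-day sum is equivalent to a daily mean when a day contains eight 3-hourly forecast steps (that cadence is an inference, not stated in the diff). A small pandas sketch of the equivalent daily aggregation, with illustrative data:

```python
import numpy as np
import pandas as pd

# Illustrative 3-hourly stats for one day; the column name follows the diff.
idx = pd.date_range("2024-01-01", periods=8, freq="3h")
stats = pd.DataFrame({"flow_avg": np.linspace(100.0, 170.0, 8)}, index=idx)

# Equivalent to the divide-by-8-then-sum approach when each day has eight steps.
daily_avg = stats["flow_avg"].resample("D").mean()
print(daily_avg)
```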
@@ -383,48 +362,111 @@ def main(
  cache_path (str): The path to the cache directory for geoglows data.
  Should hold a directory named geoglows_cache that holds the cached files. Use None to not use a cache.
  """
- # Local Variables
- reach_ids = {}
-
- # Get the latitude/longitude for each station
- station_locations = get_stations_latitude_longitude(station_ids)
-
- # Check for any download failures
- for station_id in station_ids:
-
-
-
-
-
-
-
-
- # Get station reach ids
- if station_id not in REACH_IDS.keys():
-
-
-
-
-
-
-
-
-
+ # # Local Variables
+ # reach_ids = {}
+
+ # # Get the latitude/longitude for each station
+ # station_locations = get_stations_latitude_longitude(station_ids)
+
+ # # Check for any download failures
+ # for station_id in station_ids:
+ # if station_id in REACH_IDS.keys():
+ # reach_ids[station_id] = REACH_IDS[station_id]
+ # elif station_id not in station_locations.keys():
+ # raise Exception(
+ # "Error: The longitude and latitude could not be downloaded "
+ # f"for station {station_id}"
+ # )
+
+ # # Get station reach ids
+ # if station_id not in REACH_IDS.keys():
+ # for station_id in station_locations.keys():
+ # location = station_locations[station_id]
+ # try:
+ # reach_ids[station_id] = get_reach_id(location[0], location[1])
+ # except Exception as e:
+ # print(
+ # "Error: Failed to get reach id for station "
+ # f"{station_id} ({str(e)})"
+ # )

  # Get the flow data for each station
-
-
+ stations_inflow_by_comid = {
+ 750072741: "S65E_S", # TODO: Should this be S65E_total or S65E_S? - this is a station we definitely want
+ 750069782: "S84_S", #
+ # 750053211: "S129_C", # TODO: Should this be S129_C or S129_PMP_P? - Also right now it is all 0s
+ # 750035446: "S133_P", # TODO: Should this be S133_P or S133_C? - Also right now it is all 0s
+ 750064453: "S154_C", # This is primarily 0s
+ }
+
+
+ for reach_id in INFLOW_IDS:
+ station_ensembles = get_flow_forecast_ensembles(
+ reach_id, forecast_date
+ )
+ station_stats = get_flow_forecast_stats(reach_id, forecast_date)
+
+ if bias_corrected:
+ if reach_id in stations_inflow_by_comid:
+ station_id = stations_inflow_by_comid[reach_id]
+ observed_data_path = os.path.join(observed_data_dir, f"{station_id}_FLOW_cmd.csv")
+ # if observed_data_list:
+ # observed_data_path = observed_data_list[0]
+ station_ensembles, station_stats = get_bias_corrected_data(
+ station_id,
+ reach_id,
+ observed_data_path,
+ station_ensembles,
+ station_stats,
+ cache_path,
+ )
+
+ ensembles_to_csv(
+ workspace,
+ "INFLOW",
+ reach_id,
+ station_ensembles,
+ station_stats,
+ )
+ for reach_id in OUTFLOW_IDS:
  station_ensembles = get_flow_forecast_ensembles(
  reach_id, forecast_date
  )
  station_stats = get_flow_forecast_stats(reach_id, forecast_date)

+ ensembles_to_csv(
+ workspace,
+ "OUTFLOW",
+ reach_id,
+ station_ensembles,
+ station_stats,
+ )
+ for reach_id in MATCHED_IDS:
+ stations_matched_by_comid = {
+ 750068601: "S71_S",
+ 750052624: "S135_C", # TODO: Should this be S135_C or S135_P?
+ # 750052624: "S308", # NOTE: Same COMID as S135 — only one key allowed!
+ 750053213: "FISHP",
+ 750038416: "S77_S",
+ 750050259: "S79_TOT",
+ 750045514: "S80_S",
+ 750058536: "S72_S",
+ 750051428: "S49_S",
+ # 750038427: "S40",
+ 750057357: "S191_S",
+ 750028935: "S127_C", #TODO: Should this be S127_C or S127_P?
+ }
+
+ station_ensembles = get_flow_forecast_ensembles(
+ reach_id, forecast_date
+ )
+ station_stats = get_flow_forecast_stats(reach_id, forecast_date)
  if bias_corrected:
-
-
-
-
- observed_data_path = observed_data_list[0]
+ if reach_id in stations_matched_by_comid:
+ station_id = stations_matched_by_comid[reach_id]
+ observed_data_path = os.path.join(observed_data_dir, f"{station_id}_FLOW_cmd.csv")
+ # if observed_data_list:
+ # observed_data_path = observed_data_list[0]
  station_ensembles, station_stats = get_bias_corrected_data(
  station_id,
  reach_id,
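With the updated ensembles_to_csv signature from the earlier hunk, every reach processed in these loops lands in a file named {reach_id}_{flow_type}_cmd_geoglows.csv. A small sketch of the resulting paths, using a hypothetical workspace and one example COMID per group from the diff:

```python
import os

workspace = "/tmp/loone_workspace"  # hypothetical output location
groups = {"INFLOW": [750072741], "OUTFLOW": [750053809], "MATCHED": [750068601]}

for flow_type, reach_ids in groups.items():
    for reach_id in reach_ids:
        # Mirrors the file naming used by ensembles_to_csv in this diff.
        print(os.path.join(workspace, f"{reach_id}_{flow_type}_cmd_geoglows.csv"))
```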
@@ -0,0 +1,127 @@
+ import os
+ from herbie import FastHerbie
+ from datetime import datetime
+ import pandas as pd
+ from retry_requests import retry
+ import warnings
+
+
+ def generate_wind_forecasts(output_dir):
+ # Ensure output directory exists
+ warnings.filterwarnings("ignore", message="Will not remove GRIB file because it previously existed.")
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Define points of interest
+ points = pd.DataFrame({
+ "longitude": [-80.7934, -80.9724, -80.7828, -80.7890],
+ "latitude": [27.1389, 26.9567, 26.8226, 26.9018]
+ })
+
+ # Station-specific file and column names
+ file_map = {
+ "Point_1": ("L001_WNDS_MPH_predicted.csv", "L001_WNDS_MPH"),
+ "Point_2": ("L005_WNDS_MPH_predicted.csv", "L005_WNDS_MPH"),
+ "Point_3": ("L006_WNDS_MPH_predicted.csv", "L006_WNDS_MPH"),
+ "Point_4": ("LZ40_WNDS_MPH_predicted.csv", "LZ40_WNDS_MPH")
+ }
+
+ today_str = datetime.today().strftime('%Y-%m-%d 00:00')
+ FH = FastHerbie([today_str], model="ifs", fxx=range(0, 360, 3))
+ dfs = []
+
+ variables = {
+ "10u": "10u",
+ "10v": "10v",
+ "2t": "2t",
+
+ }
+
+ # Loop through points and extract data
+ for index, point in points.iterrows():
+ print(f"\nProcessing Point {index + 1}: ({point.latitude}, {point.longitude})")
+
+ point_df = pd.DataFrame({
+ "longitude": [point.longitude],
+ "latitude": [point.latitude]
+ })
+
+ for var_key, var_name in variables.items():
+ print(f" Variable: {var_key}")
+
+ # Download and load dataset
+ FH.download(f":{var_key}")
+ ds = FH.xarray(f":{var_key}", backend_kwargs={"decode_timedelta": True})
+
+ # Extract point data
+ dsi = ds.herbie.pick_points(point_df, method="nearest")
+
+ # Get actual variable name
+ if var_name == "10u":
+ var_name_actual = "u10" # Map 10u to u10
+ elif var_name == "10v":
+ var_name_actual = "v10" # Map 10v to v10
+ elif var_name == "2t":
+ var_name_actual = "t2m" #TODO: check that this is correct
+
+ # Convert to DataFrame
+ time_series = dsi[var_name_actual].squeeze()
+ df = time_series.to_dataframe().reset_index()
+
+ # Handle datetime columns
+ if "valid_time" in df.columns:
+ df = df.rename(columns={"valid_time": "datetime"})
+ elif "step" in df.columns and "time" in dsi.coords:
+ df["datetime"] = dsi.time.values[0] + df["step"]
+
+ # Retain necessary columns
+ df = df[["datetime", var_name_actual]].drop_duplicates()
+ dfs.append((index, var_name_actual, df))
+
+ # Merge and process data per point
+ results = {}
+ for point_index in range(len(points)):
+ u_df = [df for idx, name, df in dfs if idx == point_index and name == "u10"][0]
+ v_df = [df for idx, name, df in dfs if idx == point_index and name == "v10"][0]
+ merged = u_df.merge(v_df, on="datetime", how="outer")
+
+ # Compute wind speed and correction
+ merged["wind_speed"] = (merged["u10"] ** 2 + merged["v10"] ** 2) ** 0.5
+ merged["wind_speed_corrected"] = 0.4167 * merged["wind_speed"] + 4.1868
+ merged["wind_speed_corrected"] = merged["wind_speed_corrected"] * 2.23694 # m/s to mph
+
+ results[f"Point_{point_index + 1}"] = merged
+
+ # Save outputs with station-specific column names
+ for key, (filename, new_col_name) in file_map.items():
+ df = results[key].copy()
+ df = df[["datetime", "wind_speed_corrected"]].rename(columns={
+ "wind_speed_corrected": new_col_name,
+ "datetime": "date"
+ })
+ filepath = os.path.join(output_dir, filename)
+ df.to_csv(filepath, index=False)
+ # Save 2-meter air temperature data
+ airt_file_map = {
+ "Point_1": "L001_AIRT_Degrees Celsius_forecast.csv",
+ "Point_2": "L005_AIRT_Degrees Celsius_forecast.csv",
+ "Point_3": "L006_AIRT_Degrees Celsius_forecast.csv",
+ "Point_4": "LZ40_AIRT_Degrees Celsius_forecast.csv"
+ }
+ airt_column_map = {
+ "Point_1": "L001_AIRT_Degrees Celsius",
+ "Point_2": "L005_AIRT_Degrees Celsius",
+ "Point_3": "L006_AIRT_Degrees Celsius",
+ "Point_4": "LZ40_AIRT_Degrees Celsius"
+ }
+
+ for key in airt_file_map:
+ point_index = int(key.split("_")[1]) - 1
+ df_airt = [df for idx, name, df in dfs if idx == point_index and name == "t2m"][0].copy()
+ df_airt["t2m"] = df_airt["t2m"] - 273.15 # Convert from Kelvin to Celsius
+ df_airt = df_airt.rename(columns={
+ "datetime": "date",
+ "t2m": airt_column_map[key]
+ })
+ filepath = os.path.join(output_dir, airt_file_map[key])
+ df_airt.to_csv(filepath, index=False)
+
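The new script derives wind speed from the IFS u10/v10 components, applies a linear correction (0.4167 * speed + 4.1868; presumably a local calibration, the diff does not say), and converts m/s to mph. A standalone sketch of that arithmetic:

```python
import math

u10, v10 = 3.2, -4.1  # example 10 m wind components in m/s

wind_speed = math.hypot(u10, v10)         # magnitude in m/s
corrected = 0.4167 * wind_speed + 4.1868  # linear correction used in the diff
corrected_mph = corrected * 2.23694       # 1 m/s = 2.23694 mph
print(f"{wind_speed:.2f} m/s -> {corrected_mph:.2f} mph")
```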