loone-data-prep 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +47 -16
  2. loone_data_prep/LOONE_DATA_PREP.py +0 -1
  3. loone_data_prep/dbhydro_insights.py +195 -0
  4. loone_data_prep/flow_data/S65E_total.py +57 -57
  5. loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
  6. loone_data_prep/flow_data/get_forecast_flows.py +19 -105
  7. loone_data_prep/flow_data/get_inflows.py +18 -8
  8. loone_data_prep/flow_data/get_outflows.py +16 -7
  9. loone_data_prep/flow_data/hydro.py +62 -91
  10. loone_data_prep/forecast_scripts/get_Chla_predicted.py +1 -1
  11. loone_data_prep/forecast_scripts/get_NO_Loads_predicted.py +1 -1
  12. loone_data_prep/forecast_scripts/new_combined_weather_forecast.py +220 -0
  13. loone_data_prep/utils.py +262 -32
  14. loone_data_prep/water_level_data/get_all.py +52 -44
  15. loone_data_prep/water_level_data/hydro.py +49 -68
  16. loone_data_prep/water_quality_data/get_inflows.py +69 -27
  17. loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
  18. loone_data_prep/water_quality_data/wq.py +114 -88
  19. loone_data_prep/weather_data/get_all.py +5 -3
  20. loone_data_prep/weather_data/weather.py +117 -180
  21. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
  22. loone_data_prep-1.3.1.dist-info/RECORD +38 -0
  23. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
  24. loone_data_prep/forecast_scripts/create_forecast_LOWs.py +0 -170
  25. loone_data_prep/forecast_scripts/weather_forecast.py +0 -199
  26. loone_data_prep-1.2.4.dist-info/RECORD +0 -38
  27. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
  28. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,117 +1,143 @@
1
+ import csv
2
+ import os
1
3
  import sys
2
4
  from datetime import datetime
3
5
  from retry import retry
4
- from rpy2.robjects import r
5
- from rpy2.rinterface_lib.embedded import RRuntimeError
6
-
6
+ import pandas as pd
7
+ from loone_data_prep.utils import get_dbhydro_api
7
8
 
8
9
  DEFAULT_STATION_IDS = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]
9
10
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
10
11
 
11
12
 
12
- @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
13
+ @retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
13
14
  def get(
14
15
  workspace: str,
15
16
  name: str,
17
+ test_number: int,
16
18
  station_ids: list = DEFAULT_STATION_IDS,
17
19
  date_min: str = "1950-01-01",
18
20
  date_max: str = DATE_NOW,
19
21
  **kwargs: str | list
20
22
  ) -> None:
21
- station_ids_str = "\"" + "\", \"".join(station_ids) + "\""
22
- r(
23
- f"""
24
- # Load the required libraries
25
- library(rio)
26
- library(dbhydroR)
27
-
28
- # Specify the station IDs, date range, and test names
29
- station_ids <- c({station_ids_str})
30
- date_min <- "{date_min}"
31
- date_max <- "{date_max}"
32
- test_names <- c("{name}")
33
-
34
- # Loop over the station IDs
35
- for (station_id in station_ids) {{
36
- # Retrieve water quality data for the current station ID
37
- water_quality_data <- tryCatch(
38
- get_wq(
39
- station_id = station_id,
40
- date_min = date_min,
41
- date_max = date_max,
42
- test_name = test_names
43
- ),
44
- error = function(e) NULL
45
- )
46
-
47
- # Check if data is available for the current station ID and test name
48
- if (!is.null(water_quality_data) && nrow(water_quality_data) > 0) {{
49
- # Convert the vector to a data frame
50
- water_quality_data <- as.data.frame(water_quality_data)
51
-
52
- # Calculate the number of days from the minimum date plus 8
53
- water_quality_data$days <- as.integer(difftime(water_quality_data$date, min(water_quality_data$date), units = "days")) + as.integer(format(min(water_quality_data$date), "%d"))
54
-
55
- # Generate the filename based on the station ID
56
- filename <- paste0("{workspace}/water_quality_", station_id, "_", test_names, ".csv")
57
-
58
- # Save data to a CSV file
59
- write.csv(water_quality_data, file = filename)
60
-
61
- # Print a message indicating the file has been saved
62
- cat("CSV file", filename, "has been saved.\n")
63
- }} else {{
64
- # Print a message indicating no data was found for the current station ID and test name
65
- cat("No data found for station ID", station_id, "and test name", test_names, "\n")
66
- }}
67
- Sys.sleep(1) # Wait for 1 seconds before the next iteration
68
- }}
69
- """ # noqa: E501
70
- )
23
+ """Fetch water quality data from DBHydro API and save it as CSV files in the specified workspace.
24
+
25
+ Args:
26
+ workspace (str): The directory where the CSV files will be saved.
27
+ name (str): The name of the water quality parameter. Example: 'PHOSPHATE, TOTAL AS P'
28
+ test_number (int): The DBHydro test number for the water quality parameter.
29
+ station_ids (list, optional): List of station IDs to fetch data for. Defaults to DEFAULT_STATION_IDS.
30
+ date_min (str, optional): The start date for fetching data in YYYY-MM-DD format. Defaults to "1950-01-01".
31
+ date_max (str, optional): The end date for fetching data in YYYY-MM-DD format. Defaults to the current date.
32
+ **kwargs: Additional keyword arguments.
33
+
34
+ Returns:
35
+ None
36
+ """
37
+
38
+ # Initialize the DBHydro API
39
+ api = get_dbhydro_api()
40
+
41
+ # Fetch water quality data
42
+ response = api.get_water_quality(stations=station_ids, test_numbers=[test_number], date_start=date_min, date_end=date_max, exclude_flagged_results=False)
43
+ df = response.to_dataframe(include_metadata=True)
44
+
45
+ # Process and save data for each station
46
+ for station in station_ids:
47
+ # Get a copy of the data frame for this station
48
+ df_station = df[df['station'] == station].copy()
49
+
50
+ # Check if the data frame is empty
51
+ if df_station.empty:
52
+ print(f'No data found for station ID {station} and test number {test_number}.')
53
+ continue
54
+
55
+ # Get the units of the data
56
+ units = df_station['units'].iloc[0] if 'units' in df_station.columns else ''
57
+
58
+ # Drop unwanted columns
59
+ df_station = df_station[['date_collected_str', 'sig_fig_value']].copy()
60
+
61
+ # Convert string sig_fig_value to numeric
62
+ df_station['sig_fig_value'] = pd.to_numeric(df_station['sig_fig_value'], errors='coerce')
63
+
64
+ # Calculate daily average values
65
+ df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str'])
66
+ df_station["date_only"] = df_station["date_collected_str"].dt.date
67
+ df_station = df_station.groupby("date_only")["sig_fig_value"].mean().reset_index()
68
+ df_station.rename(columns={"date_only": "date_collected_str"}, inplace=True)
69
+
70
+ # Format dataframe to expected layout
71
+ df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str']) # Convert date_collected_str column to datetime
72
+ df_station.sort_values('date_collected_str', inplace=True) # Sort df by date_collected_str
73
+ df_station.rename(columns={'date_collected_str': 'date', 'sig_fig_value': f'{station}_{name}_{units}'}, inplace=True) # Rename columns
74
+
75
+ # Calculate the days column
76
+ df_station['days'] = (df_station['date'] - df_station['date'].min()).dt.days + df_station['date'].min().day
77
+
78
+ # Make sure the integer index is written out (for backwards compatibility)
79
+ df_station.reset_index(inplace=True, drop=True)
80
+
81
+ # Start index at 1 instead of 0 (for backwards compatibility)
82
+ df_station.index = df_station.index + 1
83
+
84
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
85
+ df_station.index = df_station.index.astype(str)
86
+
87
+ # Make sure the date column includes time information at midnight (for backwards compatibility)
88
+ df_station['date'] = df_station['date'].dt.strftime('%Y-%m-%d 00:00:00')
89
+
90
+ # Write out the data frame to a CSV file
91
+ df_station.to_csv(os.path.join(workspace, f'water_quality_{station}_{name}.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC)
92
+
93
+ # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
94
+ rewrite_water_quality_file_without_date_quotes(workspace, f'water_quality_{station}_{name}.csv')
71
95
 
72
96
 
73
- def _calculate_days_column(workspace: str, file_name: str, date_min: str):
97
+ def _calculate_days_column(workspace: str, df: pd.DataFrame, date_min: str):
74
98
  """
75
99
  Calculates the values that should be in the "days" column of the water quality data CSV file
76
100
  based on the given date_min and writes the updated data frame back to the CSV file.
77
101
 
78
102
  Args:
79
103
  workspace (str): The path to the workspace directory.
80
- file_name (str): The name of the water quality data CSV file.
104
+ df (pd.DataFrame): The water quality data dataframe.
81
105
  date_min (str): The minimum date that the "days" column values should be calculated from. Should be in format "YYYY-MM-DD".
82
106
  """
83
- r(
84
- f"""
85
- # Import necessary libraries
86
- library(lubridate)
87
-
88
- # Read the CSV file
89
- df <- read.csv("{workspace}/{file_name}", check.names = FALSE)
90
-
91
- # Drop the "X" column that R adds when reading CSV files
92
- df <- df[,-1]
93
-
94
- # Get date_min as an object with the correct timezone
95
- date_min_object <- as.POSIXct("{date_min}", tz = "UTC")
96
- date_min_tz <- format(with_tz(date_min_object, tzone = "America/New_York"), "%Z")
97
- date_min_object <- as.POSIXct("{date_min}", tz = date_min_tz)
98
-
99
- # Calculate each value in the days column based on the date_min
100
- for(i in 1:nrow(df))
101
- {{
102
- # Get the current row's date as an object with the correct timezone
103
- date <- as.POSIXct(df$date[i], tz = "UTC")
104
- date_tz <- format(with_tz(date, tzone = "America/New_York"), "%Z")
105
- date <- as.POSIXct(df$date[i], tz = date_tz)
106
-
107
- # Calculate the number of days from the minimum date to the row's date plus the number of days in date_min
108
- df$days[i] <- as.integer(difftime(date, date_min_object, units = "days")) + as.integer(format(date_min_object, "%d"))
109
- }}
110
-
111
- # Write the updated data frame back to the CSV file
112
- write.csv(df, file = "{workspace}/{file_name}", row.names = FALSE)
113
- """ # noqa: E501
114
- )
107
+ # Ensure df['date'] is a pandas datetime Series
108
+ df['date'] = pd.to_datetime(df['date'])
109
+ date_min_object = pd.to_datetime(date_min)
110
+
111
+ # Calculate days column for all rows
112
+ df['days'] = (df['date'] - date_min_object).dt.days + date_min_object.day
113
+
114
+ return df
115
+
116
+
117
+ def rewrite_water_quality_file_without_date_quotes(workspace: str, file_name: str) -> None:
118
+ """
119
+ Rewrites the given water quality CSV file so that the dates don't have double quotes around them (for backwards compatibility).
120
+
121
+ Args:
122
+ workspace (str): The path to the workspace directory.
123
+ file_name (str): The name of the water quality CSV file.
124
+ """
125
+ # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
126
+ file_path = os.path.join(workspace, file_name)
127
+ lines = []
128
+
129
+ with open(file_path, 'r') as file:
130
+ lines = file.readlines()
131
+
132
+ with open(file_path, 'w', newline='') as file:
133
+ line_number = 0
134
+ for line in lines:
135
+ if line_number != 0:
136
+ line_split = line.split(',')
137
+ line_split[1] = line_split[1].replace('"', '') # Remove quotes around dates (2nd column)
138
+ line = ','.join(line_split)
139
+ file.write(line)
140
+ line_number += 1
115
141
 
116
142
 
117
143
  if __name__ == "__main__":
@@ -88,7 +88,7 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
88
88
  continue
89
89
 
90
90
  # Check whether the latest data is already up to date.
91
- if dbhydro_data_is_latest(date_latest):
91
+ if dbhydro_data_is_latest(date_latest, dbkey):
92
92
  # Notify that the data is already up to date
93
93
  print(f'Downloading of new {name} data skipped for dbkey {dbkey}. Data is already up to date.')
94
94
  continue
@@ -99,8 +99,10 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
99
99
 
100
100
  try:
101
101
  # Download only the new data
102
- print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_latest}')
103
- weather.get(workspace, name, dbkeys=[dbkey], date_min=date_latest)
102
+ date_start = pd.to_datetime(date_latest) + pd.Timedelta(days=1)
103
+ date_start = date_start.strftime('%Y-%m-%d')
104
+ print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_start}')
105
+ weather.get(workspace, name, dbkeys=[dbkey], date_min=date_start)
104
106
 
105
107
  # Data failed to download - It's possible the data's end date has been reached
106
108
  if not os.path.exists(os.path.join(workspace, original_file_name)):
@@ -1,16 +1,17 @@
1
+ import os
1
2
  import sys
2
3
  from datetime import datetime
3
4
  from retry import retry
4
- from rpy2.robjects import r
5
- from rpy2.rinterface_lib.embedded import RRuntimeError
6
5
  import pandas as pd
6
+ from loone_data_prep.utils import df_replace_missing_with_nan, get_dbhydro_api
7
+ import csv
7
8
 
8
9
 
9
10
  DEFAULT_DBKEYS = ["16021", "12515", "12524", "13081"]
10
11
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
11
12
 
12
13
 
13
- @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
14
+ @retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
14
15
  def get(
15
16
  workspace: str,
16
17
  param: str,
@@ -19,8 +20,15 @@ def get(
19
20
  date_max: str = DATE_NOW,
20
21
  **kwargs: str | list
21
22
  ) -> None:
22
- dbkeys_str = "\"" + "\", \"".join(dbkeys) + "\""
23
-
23
+ """Fetches daily weather data from DBHYDRO for specified dbkeys and date range, and saves the data to CSV files in the specified workspace.
24
+
25
+ Args:
26
+ workspace (str): The directory where the CSV files will be saved.
27
+ param (str): The type of weather data to fetch (e.g., "RAIN", "ETPI").
28
+ dbkeys (list, optional): List of DBHYDRO dbkeys to fetch data for. Defaults to DEFAULT_DBKEYS.
29
+ date_min (str, optional): The start date for data retrieval in "YYYY-MM-DD" format. Defaults to "2000-01-01".
30
+ date_max (str, optional): The end date for data retrieval in "YYYY-MM-DD" format. Defaults to the current date.
31
+ """
24
32
  data_type = param
25
33
  data_units_file = None
26
34
  data_units_header = None
@@ -28,92 +36,49 @@ def get(
28
36
  # Get the units for the file name and column header based on the type of data
29
37
  data_units_file, data_units_header = _get_file_header_data_units(data_type)
30
38
 
31
- r_str = f"""
32
- download_weather_data <- function()#workspace, dbkeys, date_min, date_max, data_type, data_units_file, data_units_header)
33
- {{
34
- library(dbhydroR)
35
- library(dplyr)
36
-
37
- dbkeys <- c({dbkeys_str})
38
- successful_stations <- list()
39
-
40
- for (i in dbkeys)
41
- {{
42
- # Retrieve data for the dbkey
43
- data <- get_hydro(dbkey = i, date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
44
-
45
- # Give data.frame correct column names so it can be cleaned using the clean_hydro function
46
- column_names <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
47
- colnames(data) <- column_names
48
-
49
- # Check if the data.frame has any rows
50
- if (nrow(data) > 0)
51
- {{
52
- # Get the station
53
- station <- data$station[1]
54
-
55
- # Add a type and units column to data so it can be cleaned using the clean_hydro function
56
- data$type <- "{data_type}"
57
- data$units <- "{data_units_header}"
58
-
59
- # Clean the data.frame
60
- data <- clean_hydro(data)
61
-
62
- # Get the filename of the output file
63
- filename <- ""
64
-
65
- if ("{param}" %in% c("RADP", "RADT"))
66
- {{
67
- filename <- paste(station, "{data_type}", sep = "_")
68
- }}
69
- else
70
- {{
71
- filename <- paste(station, "{data_type}", "{data_units_file}", sep = "_")
72
- }}
73
-
74
- filename <- paste0(filename, ".csv")
75
- filename <- paste0("{workspace}/", filename)
76
-
77
- # Save data to a CSV file
78
- write.csv(data, file = filename)
79
-
80
- # Print a message indicating the file has been saved
81
- cat("CSV file", filename, "has been saved.\n")
82
-
83
- # Append the station to the list of successful stations
84
- successful_stations <- c(successful_stations, station)
85
- }}
86
- else
87
- {{
88
- # No data given back, It's possible that the dbkey has reached its end date.
89
- print(paste("Empty data.frame returned for dbkey", i, "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
90
- }}
91
-
92
- # Add a delay between requests
93
- Sys.sleep(2) # Wait for 2 seconds before the next iteration
94
- }}
95
-
96
- # Return the station and dbkey to the python code
97
- return(successful_stations)
98
- }}
99
- """ # noqa: E501
39
+ # Retrieve the data
40
+ api = get_dbhydro_api()
41
+ response = api.get_daily_data(dbkeys, 'id', date_min, date_max, 'NGVD29', False)
42
+
43
+ # Get the data as a dataframe
44
+ df = response.to_dataframe(True)
45
+
46
+ # Replace 0 values with NaN when their qualifier is either 'M' or 'N'
47
+ df = df_replace_missing_with_nan(df)
100
48
 
101
- # Download the weather data
102
- r(r_str)
103
- result = r.download_weather_data()
49
+ # Map each station to its own dataframe
50
+ station_dfs = {}
104
51
 
105
- # Get the stations of the dbkeys who's data were successfully downloaded
106
- stations = []
107
- for value in result:
108
- stations.append(value[0])
52
+ for site_code in response.get_site_codes():
53
+ station_dfs[site_code] = df[df['site_code'] == site_code].copy()
109
54
 
110
- # Format files to expected layout
111
- for station in stations:
112
- if station in ["L001", "L005", "L006", "LZ40"]:
113
- _reformat_weather_file(workspace, station, data_type, data_units_file, data_units_header)
114
-
115
- # Print a message indicating the file has been saved
116
- print(f"CSV file {workspace}/{station}_{data_type}_{data_units_file}.csv has been reformatted.")
55
+ # Write out each station's data to its own file
56
+ for station, station_df in station_dfs.items():
57
+ # Get metadata for the station
58
+ parameter_code = station_df['parameter_code'].iloc[0]
59
+ unit_code = station_df['unit_code'].iloc[0]
60
+
61
+ # Select only the desired columns
62
+ station_df = station_df[['value']].copy()
63
+
64
+ # Rename datetime index
65
+ station_df.index.rename('date', inplace=True)
66
+
67
+ # Rename the columns to the expected format
68
+ station_df.rename(columns={'value': f'{station}_{data_type}_{data_units_header}'}, inplace=True)
69
+
70
+ # Make the date index a column and use an integer index (for backwards compatibility)
71
+ station_df = station_df.reset_index()
72
+
73
+ # Get the name of the output file
74
+ file_name = ''
75
+ if data_type in ['RADP', 'RADT']:
76
+ file_name = f'{station}_{data_type}.csv'
77
+ else:
78
+ file_name = f'{station}_{data_type}_{data_units_file}.csv'
79
+
80
+ # Write out the station's data to a csv file
81
+ station_df.to_csv(os.path.join(workspace, file_name), index=True)
117
82
 
118
83
 
119
84
  def merge_data(workspace: str, data_type: str):
@@ -127,103 +92,75 @@ def merge_data(workspace: str, data_type: str):
127
92
 
128
93
  # Merge the data files for the different stations (LAKE_RAINFALL_DATA.csv)
129
94
  if data_type == "RAIN":
130
- r(
131
- f"""
132
- L001_RAIN_Inches <- read.csv("{workspace}/L001_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
133
- L005_RAIN_Inches = read.csv("{workspace}/L005_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
134
- L006_RAIN_Inches = read.csv("{workspace}/L006_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
135
- LZ40_RAIN_Inches = read.csv("{workspace}/LZ40_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
136
- #Replace NA values with zero
137
- L001_RAIN_Inches[is.na(L001_RAIN_Inches)] <- 0
138
- L005_RAIN_Inches[is.na(L005_RAIN_Inches)] <- 0
139
- L006_RAIN_Inches[is.na(L006_RAIN_Inches)] <- 0
140
- LZ40_RAIN_Inches[is.na(LZ40_RAIN_Inches)] <- 0
141
- # Merge the files by the "date" column
142
- merged_data <- merge(L001_RAIN_Inches, L005_RAIN_Inches, by = "date",all = TRUE)
143
- merged_data <- merge(merged_data, L006_RAIN_Inches, by = "date",all = TRUE)
144
- merged_data <- merge(merged_data, LZ40_RAIN_Inches, by = "date",all = TRUE)
145
- # Calculate the average rainfall per day
146
- merged_data$average_rainfall <- rowMeans(merged_data[, -1],na.rm = TRUE)
147
-
148
- # View the updated merged data
149
- head(merged_data)
150
- # Save merged data as a CSV file
151
- write.csv(merged_data, "{workspace}/LAKE_RAINFALL_DATA.csv", row.names = TRUE)
152
- """ # noqa: E501
153
- )
95
+ # Read in rain data
96
+ l001_rain_inches = pd.read_csv(os.path.join(workspace, 'L001_RAIN_Inches.csv'), index_col=0)
97
+ l005_rain_inches = pd.read_csv(os.path.join(workspace, 'L005_RAIN_Inches.csv'), index_col=0)
98
+ l006_rain_inches = pd.read_csv(os.path.join(workspace, 'L006_RAIN_Inches.csv'), index_col=0)
99
+ lz40_rain_inches = pd.read_csv(os.path.join(workspace, 'LZ40_RAIN_Inches.csv'), index_col=0)
100
+
101
+ # Replace NaN values with 0
102
+ l001_rain_inches.fillna(0, inplace=True)
103
+ l005_rain_inches.fillna(0, inplace=True)
104
+ l006_rain_inches.fillna(0, inplace=True)
105
+ lz40_rain_inches.fillna(0, inplace=True)
106
+
107
+ # Merge the data by the "date" column
108
+ merged_data = pd.merge(l001_rain_inches, l005_rain_inches, on="date", how="outer")
109
+ merged_data = pd.merge(merged_data, l006_rain_inches, on="date", how="outer")
110
+ merged_data = pd.merge(merged_data, lz40_rain_inches, on="date", how="outer")
111
+
112
+ # Calculate the average rainfall per day
113
+ merged_data['average_rainfall'] = merged_data.iloc[:, 1:].mean(axis=1)
114
+
115
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
116
+ merged_data.index = merged_data.index.astype(str)
117
+
118
+ # Save merged data as a CSV file
119
+ merged_data.applymap(lambda x: round(x, 4) if isinstance(x, (float, int)) else x)
120
+ merged_data.to_csv(os.path.join(workspace, 'LAKE_RAINFALL_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC)
154
121
 
155
122
  # Merge the data files for the different stations (LOONE_AVERAGE_ETPI_DATA.csv)
156
123
  if data_type == "ETPI":
157
- r(
158
- f"""
159
- L001_ETPI_Inches <- read.csv("{workspace}/L001_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
160
- L005_ETPI_Inches = read.csv("{workspace}/L005_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
161
- L006_ETPI_Inches = read.csv("{workspace}/L006_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
162
- LZ40_ETPI_Inches = read.csv("{workspace}/LZ40_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
163
-
164
- # Replace NA values with zero
165
- L001_ETPI_Inches[is.na(L001_ETPI_Inches)] <- 0
166
- L005_ETPI_Inches[is.na(L005_ETPI_Inches)] <- 0
167
- L006_ETPI_Inches[is.na(L006_ETPI_Inches)] <- 0
168
- LZ40_ETPI_Inches[is.na(LZ40_ETPI_Inches)] <- 0
169
- # Merge the files by the "date" column
170
- merged_data <- merge(L001_ETPI_Inches, L005_ETPI_Inches, by = "date",all = TRUE)
171
- merged_data <- merge(merged_data, L006_ETPI_Inches, by = "date",all = TRUE)
172
- merged_data <- merge(merged_data, LZ40_ETPI_Inches, by = "date",all = TRUE)
173
- # Calculate the average rainfall per day
174
- merged_data$average_ETPI <- rowMeans(merged_data[, -1],na.rm = TRUE)
175
-
176
- # View the updated merged data
177
- head(merged_data)
178
- # Save merged data as a CSV file
179
- write.csv(merged_data, "{workspace}/LOONE_AVERAGE_ETPI_DATA.csv", row.names = TRUE)
180
- """ # noqa: E501
181
- )
182
-
124
+ # Read in ETPI data
125
+ l001_etpi_inches = pd.read_csv(os.path.join(workspace, 'L001_ETPI_Inches.csv'), index_col=0)
126
+ l005_etpi_inches = pd.read_csv(os.path.join(workspace, 'L005_ETPI_Inches.csv'), index_col=0)
127
+ l006_etpi_inches = pd.read_csv(os.path.join(workspace, 'L006_ETPI_Inches.csv'), index_col=0)
128
+ lz40_etpi_inches = pd.read_csv(os.path.join(workspace, 'LZ40_ETPI_Inches.csv'), index_col=0)
129
+
130
+ # Replace NaN values with 0
131
+ l001_etpi_inches.fillna(0, inplace=True)
132
+ l005_etpi_inches.fillna(0, inplace=True)
133
+ l006_etpi_inches.fillna(0, inplace=True)
134
+ lz40_etpi_inches.fillna(0, inplace=True)
135
+
136
+ # Merge the data by the "date" column
137
+ merged_data = pd.merge(l001_etpi_inches, l005_etpi_inches, on="date", how="outer")
138
+ merged_data = pd.merge(merged_data, l006_etpi_inches, on="date", how="outer")
139
+ merged_data = pd.merge(merged_data, lz40_etpi_inches, on="date", how="outer")
140
+
141
+ # Calculate the average ETPI per day
142
+ merged_data['average_ETPI'] = merged_data.iloc[:, 1:].mean(axis=1)
143
+
144
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
145
+ merged_data.index = merged_data.index.astype(str)
146
+
147
+ # Save merged data as a CSV file
148
+ merged_data.to_csv(os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC, na_rep='NA')
149
+
150
+ # Rewrite the file so NA values aren't quoted (for backwards compatibility)
151
+ file_path = os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv')
152
+ lines = []
183
153
 
184
- def _reformat_weather_file(workspace: str, station: str, data_type: str, data_units_file: str, data_units_header: str) -> None:
185
- '''
186
- Reformats the dbhydro weather file to the layout expected by the rest of the LOONE scripts.
187
- This function reads in and writes out a .csv file.
188
-
189
- Args:
190
- workspace (str): The path to the workspace directory.
191
- station (str): The station name. Ex: L001, L005, L006, LZ40.
192
- data_type (str): The type of data. Ex: RAIN, ETPI, H2OT, RADP, RADT, AIRT, WNDS.
193
- data_units_file (str): The units for the file name. Ex: Inches, Degrees Celsius, etc.
194
- data_units_header (str): The units for the column header. Ex: Inches, Degrees Celsius, etc. Can differ from data_units_file when data_type is either RADP or RADT.
154
+ with open(file_path, 'r') as file:
155
+ lines = file.readlines()
195
156
 
196
- Returns:
197
- None
198
- '''
199
- # Read in the data
200
- df = None
201
- if data_type in ['RADP', 'RADT']:
202
- df = pd.read_csv(f"{workspace}/{station}_{data_type}.csv")
203
- else:
204
- df = pd.read_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
205
-
206
- # Remove unneeded column columns
207
- df.drop(f' _{data_type}_{data_units_header}', axis=1, inplace=True)
208
- df.drop('Unnamed: 0', axis=1, inplace=True)
209
-
210
- # Convert date column to datetime
211
- df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
212
-
213
- # Sort the data by date
214
- df.sort_values('date', inplace=True)
215
-
216
- # Renumber the index
217
- df.reset_index(drop=True, inplace=True)
218
-
219
- # Drop rows that are missing all their values
220
- df.dropna(how='all', inplace=True)
221
-
222
- # Write the updated data back to the file
223
- if data_type in ['RADP', 'RADT']:
224
- df.to_csv(f"{workspace}/{station}_{data_type}.csv")
225
- else:
226
- df.to_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
157
+ with open(file_path, 'w', newline='') as file:
158
+ for line in lines:
159
+ line = line.replace(',"NA"', ',NA')
160
+ line = line.replace('"NA",', 'NA,')
161
+ line = line.replace(',"NaN"', ',NA')
162
+ line = line.replace('"NaN",', 'NA,')
163
+ file.write(line)
227
164
 
228
165
 
229
166
  def _get_file_header_data_units(data_type: str) -> tuple[str, str]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: loone_data_prep
3
- Version: 1.2.4
3
+ Version: 1.3.1
4
4
  Summary: Prepare data to run the LOONE model.
5
5
  Author-email: Osama Tarabih <osamatarabih@usf.edu>
6
6
  Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -18,7 +18,6 @@ License: BSD-3-Clause License
18
18
 
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: rpy2
22
21
  Requires-Dist: retry
23
22
  Requires-Dist: numpy<2
24
23
  Requires-Dist: pandas
@@ -30,6 +29,7 @@ Requires-Dist: requests_cache
30
29
  Requires-Dist: retry-requests
31
30
  Requires-Dist: eccodes==2.41.0
32
31
  Requires-Dist: xarray==2025.4.0
32
+ Requires-Dist: dbhydro-py
33
33
  Dynamic: license-file
34
34
 
35
35
  LOONE_DATA_PREP
@@ -40,11 +40,6 @@ Prepare data for the LOONE water quality model.
40
40
  Line to the LOONE model: [https://pypi.org/project/loone](https://pypi.org/project/loone)
41
41
  Link to LOONE model repository: [https://github.com/Aquaveo/LOONE](https://github.com/Aquaveo/LOONE)
42
42
 
43
- ## Prerequisites:
44
-
45
- * R ([https://www.r-project.org/](https://www.r-project.org/))
46
- * R packages: dbhydroR, rio, dplyr
47
-
48
43
  ## Installation:
49
44
 
50
45
  ```bash
@@ -103,7 +98,6 @@ dbkeys = get_dbkeys(
103
98
  stat="MEAN",
104
99
  recorder="CR10",
105
100
  freq="DA",
106
- detail_level="dbkey"
107
101
  )
108
102
 
109
103
  # Get water level data