PyPI - loone-data-prep - Versions diffs - 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

loone-data-prep 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

loone_data_prep/dbhydro_insights.py +195 -0
loone_data_prep/flow_data/S65E_total.py +57 -57
loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
loone_data_prep/flow_data/get_forecast_flows.py +19 -105
loone_data_prep/flow_data/get_inflows.py +18 -8
loone_data_prep/flow_data/get_outflows.py +16 -7
loone_data_prep/flow_data/hydro.py +62 -91
loone_data_prep/utils.py +243 -30
loone_data_prep/water_level_data/get_all.py +52 -44
loone_data_prep/water_level_data/hydro.py +49 -68
loone_data_prep/water_quality_data/get_inflows.py +69 -27
loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
loone_data_prep/water_quality_data/wq.py +114 -88
loone_data_prep/weather_data/get_all.py +5 -3
loone_data_prep/weather_data/weather.py +117 -180
{loone_data_prep-1.3.0.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
{loone_data_prep-1.3.0.dist-info → loone_data_prep-1.3.1.dist-info}/RECORD +20 -19
{loone_data_prep-1.3.0.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
{loone_data_prep-1.3.0.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
{loone_data_prep-1.3.0.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0

loone_data_prep/flow_data/hydro.py CHANGED Viewed

@@ -1,116 +1,68 @@
 import sys
 from datetime import datetime
-from glob import glob
 from retry import retry
-import os
 import pandas as pd
-from rpy2.robjects import r
-from rpy2.rinterface_lib.embedded import RRuntimeError
+from loone_data_prep.utils import df_replace_missing_with_nan, get_dbhydro_api
 DATE_NOW = datetime.now().strftime("%Y-%m-%d")
-@retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
 def get(
     workspace: str,
     dbkey: str,
     date_min: str = "1990-01-01",
-    date_max: str = DATE_NOW
+    date_max: str = DATE_NOW,
+    station: str | None = None
 ) -> None:
-    r_str = f"""
-    download_flow_data <- function(workspace, dbkey, date_min, date_max)
-    {{
-        # Load the required libraries
-        library(dbhydroR)
-        library(dplyr)
-        # Retrieve data for the dbkey
-        data <- get_hydro(dbkey = "{dbkey}", date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
-        # Check if data is empty or contains only the "date" column
-        if (ncol(data) <= 1) {{
-            cat("No data found for dbkey", "{dbkey}", "Skipping to the next dbkey.\n")
-        }}
-        # Give data.frame correct column names so it can be cleaned using the clean_hydro function
-        colnames(data) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
-        # Check if the data.frame has any rows
-        if (nrow(data) == 0)
-        {{
-            # No data given back, It's possible that the dbkey has reached its end date.
-            print(paste("Empty data.frame returned for dbkey", "{dbkey}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
-            return(list(success = FALSE, dbkey = "{dbkey}"))
-        }}
-        # Add a type and units column to data so it can be cleaned using the clean_hydro function
-        data$type <- "FLOW"
-        data$units <- "cfs"
-        # Get the station
-        station <- data$station[1]
-        # Clean the data.frame
-        data <- clean_hydro(data)
-        # Multiply all columns except "date" column by 0.0283168466 * 86400 to convert Flow rate from cfs to m³/day
-        data[, -1] <- data[, -1] * (0.0283168466 * 86400)
-        # Drop the " _FLOW_cfs" column
-        data <- data %>% select(-` _FLOW_cfs`)
-        # Sort the data by date
-        data <- data[order(data$date), ]
-        # Get the filename for the output CSV file
-        filename <- paste0(station, "_FLOW", "_{dbkey}_cmd.csv")
-        # Save data to a CSV file
-        write.csv(data, file = paste0("{workspace}/", filename))
-        # Print a message indicating the file has been saved
-        cat("CSV file", filename, "has been saved.\n")
-        # Add a delay between requests
-        Sys.sleep(1)  # Wait for 1 second before the next iteration
-        # Return the station and dbkey to the python code
-        list(success = TRUE, station = station, dbkey = "{dbkey}")
-    }}
+    """Fetches daily flow data from DBHYDRO and saves it to a CSV file.
+    Args:
+        workspace (str): Path to the workspace directory where data will be saved.
+        dbkey (str): The DBHYDRO database key for the station.
+        date_min (str): Minimum date for data retrieval in 'YYYY-MM-DD' format.
+        date_max (str): Maximum date for data retrieval in 'YYYY-MM-DD' format.
+        station (str | None): The station name. If None, the station name will be fetched from DBHYDRO.
     """
-    r(r_str)
+    # Get a DbHydroApi instance
+    api = get_dbhydro_api()
-    # Call the R function to download the flow data
-    result = r.download_flow_data(workspace, dbkey, date_min, date_max)
+    # Get the daily data from DbHydro
+    response = api.get_daily_data([dbkey], 'id', date_min, date_max, 'NGVD29', False)
     # Check for failure
-    success = result.rx2("success")[0]
-    if not success:
+    if not response.has_data():
         return
-    # Get the station name for _reformat_flow_file()
-    station = result.rx2("station")[0]
+    # Get the station name for _reformat_flow_df()
+    if station is None:
+        station = response.get_site_codes()[0]
+    # Get the data as a dataframe
+    df = response.to_dataframe(True)
-    # Reformat the flow data file to the expected layout
-    _reformat_flow_file(workspace, station, dbkey)
+    # Replace flagged 0 values and -99999.0 with NaN
+    df = df_replace_missing_with_nan(df)
+    # Convert flow from cfs to cmd
+    df['value'] = df['value'] * (0.0283168466 * 86400)
+    # Prepare the dataframe to be reformatted into the expected layout
+    df.reset_index(inplace=True)
+    df.rename(columns={'datetime': 'date', 'value': f'{station}_FLOW_cmd'}, inplace=True)
+    # Reformat the flow df to the expected layout
+    df = _reformat_flow_df(df, station)
     # Check if the station name contains a space
-    if " " in station:
+    if ' ' in station:
         # Replace space with underscore in the station name
         station_previous = station
-        station = station.replace(" ", "_")
-        # Rename the file
-        os.rename(f"{workspace}/{station_previous}_FLOW_{dbkey}_cmd.csv", f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+        station = station.replace(' ', '_')
-    # column values are converted to cmd in R. This snippet makes sure column names are updated accordingly.
-    file = glob(f'{workspace}/*FLOW*{dbkey}_cmd.csv')[0]
-    df = pd.read_csv(file, index_col=False)
-    df.columns = df.columns.astype(str).str.replace("_cfs", "_cmd")
-    df.to_csv(file, index=False)
+    # Write the data to a CSV file
+    df.to_csv(f'{workspace}/{station}_FLOW_{dbkey}_cmd.csv', index=True)
 def _reformat_flow_file(workspace:str, station: str, dbkey: str):
@@ -130,8 +82,27 @@ def _reformat_flow_file(workspace:str, station: str, dbkey: str):
     # Read in the data
     df = pd.read_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+    # Reformat the data
+    df = _reformat_flow_df(df, station)
+    # Write the updated data back to the file
+    df.to_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+def _reformat_flow_df(df: pd.DataFrame, station: str) -> pd.DataFrame:
+    '''
+    Reformat the flow data file to the expected layout.
+    Converts the format of the dates in the file to 'YYYY-MM-DD' then sorts the data by date.
+    Args:
+        df (pd.DataFrame): The dataframe containing the flow data.
+        station (str): The station name.
+    Returns:
+        pd.DataFrame: The reformatted dataframe.
+    '''
     # Grab only the columns we need
-    df = df[['date', f'{station}_FLOW_cfs']]
+    df = df[['date', f'{station}_FLOW_cmd']].copy()
     # Convert date column to datetime
     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
@@ -143,10 +114,10 @@ def _reformat_flow_file(workspace:str, station: str, dbkey: str):
     df.reset_index(drop=True, inplace=True)
     # Drop rows that are missing values for both the date and value columns
-    df = df.drop(df[(df['date'].isna()) & (df[f'{station}_FLOW_cfs'].isna())].index)
+    df = df.drop(df[(df['date'].isna()) & (df[f'{station}_FLOW_cmd'].isna())].index)
-    # Write the updated data back to the file
-    df.to_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+    # Return the updated dataframe
+    return df
 if __name__ == "__main__":

loone_data_prep/utils.py CHANGED Viewed

@@ -5,17 +5,14 @@ import math
 from glob import glob
 from calendar import monthrange
 import traceback
+from typing import Literal, Tuple
 import numpy as np
 import pandas as pd
 from retry import retry
 from scipy.optimize import fsolve
 from scipy import interpolate
-from rpy2.robjects import r
-from rpy2.robjects.vectors import (
-    StrVector as rpy2StrVector,
-    DataFrame as rpy2DataFrame,
-)
-from rpy2.rinterface_lib.embedded import RRuntimeError
+from dbhydro_py import DbHydroApi
+from loone_data_prep.dbhydro_insights import get_dbhydro_station_metadata, get_dbhydro_continuous_timeseries_metadata, get_dbhydro_water_quality_metadata
 DEFAULT_STATION_IDS = ["L001", "L005", "L006", "LZ40"]
@@ -224,7 +221,7 @@ DEFAULT_EXPFUNC_NITROGEN_CONSTANTS = {
     "S135_P": {"a": 3.09890183766129, "b": 0.657896838486496},
 }
-@retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
 def get_dbkeys(
     station_ids: list,
     category: str,
@@ -232,9 +229,8 @@ def get_dbkeys(
     stat: str,
     recorder: str,
     freq: str = "DA",
-    detail_level: str = "dbkey",
     *args: str,
-) -> rpy2StrVector | rpy2DataFrame:
+) -> list[str]:
     """Get dbkeys. See DBHydroR documentation for more information:
     https://cran.r-project.org/web/packages/dbhydroR/dbhydroR.pdf
@@ -245,27 +241,68 @@ def get_dbkeys(
         stat (str): Statistic of data to retrieve.
         recorder (str): Recorder of data to retrieve.
         freq (str, optional): Frequency of data to retrieve. Defaults to "DA".
-        detail_level (str, optional): Detail level of data to retrieve. Defaults to "dbkey". Options are "dbkey",
-            "summary", or "full".
     Returns:
-        rpy2StrVector | rpy2DataFrame: dbkeys info at the specified detail level.
+        list[str]: dbkeys info for the specified parameters.
     """
+    # Retrieve the metadata for the specified parameters
+    metadata = get_dbhydro_continuous_timeseries_metadata(station_ids, [category], [param], [stat], [recorder], [freq])
+    # A set to hold the dbkeys to avoid duplicates
+    dbkeys = set()
+    # No data returned from API
+    if metadata is None:
+        return list(dbkeys)
+    # Get the dbkeys from the metadata
+    for result in metadata['results']:
+        dbkeys.add(result['timeseriesId'])
+    # Return the dbkeys as a list
+    return list(dbkeys)
-    station_ids_str = '"' + '", "'.join(station_ids) + '"'
-    dbkeys = r(
-        f"""
-        library(dbhydroR)
+def get_stations_latitude_longitude(station_ids: list[str]):
+    """Gets the latitudes and longitudes of the given stations.
-        station_ids <- c({station_ids_str})
-        dbkeys <- get_dbkey(stationid = station_ids,  category = "{category}", param = "{param}", stat = "{stat}", recorder="{recorder}", freq = "{freq}", detail.level = "{detail_level}")
-        print(dbkeys)
-        return(dbkeys)
-        """  # noqa: E501
-    )
+    Args:
+        station_ids (list[str]): The ids of the stations to get the
+            latitudes/longitudes of. Example: ['L OKEE', 'FISHP']
+    Returns:
+        (dict[str, tuple[numpy.float64, numpy.float64]]): A dictionary of
+            format dict<station_id:(latitude,longitude)>
-    return dbkeys
+    If a station's latitude/longitude fails to download then its station_id
+        won't be a key in the returned dictionary.
+    """
+    # Dictionary to hold the latitude/longitude of each station
+    station_data = {}
+    # Get the latitude and longitude for each station
+    for station_id in station_ids:
+        # Retrieve the current station's metadata
+        station_metadata = get_dbhydro_station_metadata(station_id)
+        # Check if the metadata was successfully retrieved
+        if station_metadata is None:
+            print(f'Failed to get latitude/longitude for station {station_id} - No data given back from API')
+            continue
+        # Extract the latitude and longitude from the metadata
+        try:
+            latitude = station_metadata['features'][0]['attributes']['LAT']
+            longitude = station_metadata['features'][0]['attributes']['LONG']
+        except KeyError:
+            print(f'Failed to get latitude/longitude for station {station_id} - Unexpected response structure from API')
+            continue
+        # Add the latitude and longitude to the dictionary
+        station_data[station_id] = latitude, longitude
+    # Return the dictionary of station latitudes and longitudes
+    return station_data
 def data_interpolations(
@@ -916,9 +953,17 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
     # Helper Functions
     def is_valid_date(date_string):
+        # Check for date without time part
         try:
             datetime.datetime.strptime(date_string, "%Y-%m-%d")
             return True
+        except ValueError:
+            pass
+        # Check for date with time part
+        try:
+            datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
+            return True
         except ValueError:
             return False
@@ -955,23 +1000,69 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
         return None
-def dbhydro_data_is_latest(date_latest: str):
+def dbhydro_data_is_latest(date_latest: str, dbkey: str | None = None) -> bool:
     """
     Checks whether the given date is the most recent date possible to get data from dbhydro.
     Can be used to check whether dbhydro data is up-to-date.
     Args:
         date_latest (str): The date of the most recent data of the dbhydro data you have
+        dbkey (str | None, optional): The dbkey of the data you are checking. Defaults to None.
     Returns:
         bool: True if the date_latest is the most recent date possible to get data from dbhydro, False otherwise
     """
-    date_latest_object = datetime.datetime.strptime(
-        date_latest, "%Y-%m-%d"
-    ).date()
-    return date_latest_object == (
-        datetime.datetime.now().date() - datetime.timedelta(days=1)
-    )
+    # Convert date_latest to a date object
+    date_latest_object = pd.to_datetime(date_latest).date()
+    # No dbkey provided
+    if dbkey is None:
+        # Assume latest data available is yesterday
+        return date_latest_object == (datetime.datetime.now().date() - datetime.timedelta(days=1))
+    # Get dbhydro api
+    dbhydro_api = get_dbhydro_api()
+    # Retrieve the last date available from dbhydro for the given dbkey
+    data = dbhydro_api.get_daily_data([dbkey], 'id', '1900-01-01', '1900-01-02', 'NGVD29', False)
+    last_date = data.time_series[0].period_of_record.por_last_date
+    # Use date part only (exclude time)
+    last_date = last_date.split("T")[0]
+    # Convert last_date to a date object
+    last_date_object = datetime.datetime.strptime(last_date, "%Y-%m-%d").date()
+    # Compare given date to last date from dbhydro
+    return date_latest_object >= last_date_object
+def dbhydro_water_quality_data_is_latest(date_latest: str, station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> bool:
+    """
+    Checks whether the given date is the most recent date possible to get water quality data from dbhydro.
+    Can be used to check whether dbhydro water quality data is up-to-date.
+    Args:
+        date_latest (str): The date of the most recent data of the dbhydro water quality data you have.
+        station (str): The station ID of the water quality data you are checking.
+        test_number (int): The test number of the water quality data you are checking. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
+    Returns:
+        bool: True if the date_latest is the most recent date possible to get water quality data from dbhydro, False otherwise
+    """
+    # Get the date range from dbhydro water quality data
+    date_start, date_end = get_dbhydro_water_quality_date_range(station, station_type, test_number)
+    # No end date available
+    if date_end is None:
+        # Assume data is not up-to-date
+        return False
+    # Convert date_latest to a datetime object
+    date_latest_object = pd.to_datetime(date_latest)
+    # Compare given date to last date from dbhydro
+    return date_latest_object >= date_end
 def get_synthetic_data(date_start: str, df: pd.DataFrame):
@@ -1038,6 +1129,128 @@ def get_synthetic_data(date_start: str, df: pd.DataFrame):
     return df
+def df_replace_missing_with_nan(df: pd.DataFrame, qualifier_codes: set = {'M', 'N'}, no_data_value: float = -99999.0) -> pd.DataFrame:
+    """
+    Replace values in the 'value' column of the DataFrame with NaN where the 'qualifier' column contains specified qualifier codes.
+    This was designed to work with dataframes created from dbhydro_py response data.
+    The dataframe must have 'value' and 'qualifier' columns.
+    Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
+    Args:
+        df (pd.DataFrame): DataFrame that was created from a dbhydro_py response. Must have value and qualifier columns.
+        qualifier_codes (set, optional): Set of qualifier codes indicating missing data. Defaults to {'M', 'N'}.
+        no_data_value (float, optional): Value representing no data. Defaults to -99999.0. Values equal to this will also be replaced with NaN.
+    Returns:
+        pd.DataFrame: DataFrame with specified values replaced with NaN.
+    """
+    # Replace 0 values with NaN when their qualifier is in qualifier_codes
+    # 'M' = Missing, 'N' = Not Yet Available
+    # Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
+    df.loc[df['qualifier'].isin(qualifier_codes), 'value'] = np.nan
+    # Also replace no_data_value with NaN
+    df.loc[np.isclose(df['value'], no_data_value), 'value'] = np.nan
+    # Return modified dataframe
+    return df
+def get_dbhydro_water_quality_date_range(station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> Tuple[pd.Timestamp | None, pd.Timestamp | None]:
+    """Get the start date and end date for the given station and test number from DBHYDRO water quality data.
+    Args:
+        station (str): The station names.
+        station_type (Literal['SITE', 'STATION']): The type of the station.
+        test_number (int): The test number of the data. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
+    Returns:
+        Tuple[pd.Timestamp | None, pd.Timestamp | None]: A tuple containing the start date and end date in 'MM/DD/YYYY' format.
+    """
+    response = get_dbhydro_water_quality_metadata([(station, station_type)], [test_number])
+    # No data given back by api
+    if response is None:
+        return (None, None)
+    # Get the date range from the response
+    if 'results' in response:
+        results = response['results']
+        if len(results) > 0:
+            # Find the first non-None start and end dates
+            date_start = None
+            date_end = None
+            for result in results:
+                date_start = result.get('startDate', None)
+                date_end = result.get('endDate', None)
+                # Dates found
+                if date_start is not None and date_end is not None:
+                    break
+            # If no valid dates were found, return early
+            if date_start is None or date_end is None:
+                return (date_start, date_end)
+            # Find the earliest start date and latest end date
+            for result in results:
+                date_start_current = result.get('startDate', None)
+                date_end_current = result.get('endDate', None)
+                if date_start_current is not None and pd.to_datetime(date_start_current) < pd.to_datetime(date_start):
+                    date_start = date_start_current
+                if date_end_current is not None and pd.to_datetime(date_end_current) > pd.to_datetime(date_end):
+                    date_end = date_end_current
+            # Convert dates to datetime objects
+            if date_start is not None:
+                date_start = pd.to_datetime(date_start)
+            if date_end is not None:
+                date_end = pd.to_datetime(date_end)
+            # Return the earliest start date and latest end date
+            return (date_start, date_end)
+    # No results found
+    return (None, None)
+def get_dbhydro_api_keys_from_environment() -> dict[str, str]:
+    """Get DBHYDRO API keys from environment variables.
+    Returns:
+        Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
+    """
+    # Get API keys from environment variables
+    api_keys = {
+        "client_id": os.environ.get("DBHYDRO_API_CLIENT_ID", ""),
+        "client_secret": os.environ.get("DBHYDRO_API_CLIENT_SECRET", ""),
+    }
+    # Return the API keys
+    return api_keys
+def get_dbhydro_api_keys() -> dict[str, str]:
+    """Get DBHYDRO API keys.
+    Returns:
+        Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
+    """
+    return get_dbhydro_api_keys_from_environment()
+def get_dbhydro_api() -> DbHydroApi:
+    """Get a configured DbHydroApi instance.
+    Returns:
+        DbHydroApi: An instance of the DbHydroApi class.
+    """
+    api_keys = get_dbhydro_api_keys()
+    dbhydro_api = DbHydroApi.with_default_adapter(client_id=api_keys["client_id"], client_secret=api_keys["client_secret"])
+    return dbhydro_api
 if __name__ == "__main__":
     if sys.argv[1] == "get_dbkeys":
         get_dbkeys(

loone-data-prep 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

loone-data-prep 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl