loone-data-prep 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +47 -16
  2. loone_data_prep/LOONE_DATA_PREP.py +0 -1
  3. loone_data_prep/dbhydro_insights.py +195 -0
  4. loone_data_prep/flow_data/S65E_total.py +57 -57
  5. loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
  6. loone_data_prep/flow_data/get_forecast_flows.py +19 -105
  7. loone_data_prep/flow_data/get_inflows.py +18 -8
  8. loone_data_prep/flow_data/get_outflows.py +16 -7
  9. loone_data_prep/flow_data/hydro.py +62 -91
  10. loone_data_prep/forecast_scripts/get_Chla_predicted.py +1 -1
  11. loone_data_prep/forecast_scripts/get_NO_Loads_predicted.py +1 -1
  12. loone_data_prep/forecast_scripts/new_combined_weather_forecast.py +220 -0
  13. loone_data_prep/utils.py +262 -32
  14. loone_data_prep/water_level_data/get_all.py +52 -44
  15. loone_data_prep/water_level_data/hydro.py +49 -68
  16. loone_data_prep/water_quality_data/get_inflows.py +69 -27
  17. loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
  18. loone_data_prep/water_quality_data/wq.py +114 -88
  19. loone_data_prep/weather_data/get_all.py +5 -3
  20. loone_data_prep/weather_data/weather.py +117 -180
  21. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
  22. loone_data_prep-1.3.1.dist-info/RECORD +38 -0
  23. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
  24. loone_data_prep/forecast_scripts/create_forecast_LOWs.py +0 -170
  25. loone_data_prep/forecast_scripts/weather_forecast.py +0 -199
  26. loone_data_prep-1.2.4.dist-info/RECORD +0 -38
  27. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
  28. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,117 +1,143 @@
1
+ import csv
2
+ import os
1
3
  import sys
2
4
  from datetime import datetime
3
5
  from retry import retry
4
- from rpy2.robjects import r
5
- from rpy2.rinterface_lib.embedded import RRuntimeError
6
-
6
+ import pandas as pd
7
+ from loone_data_prep.utils import get_dbhydro_api
7
8
 
8
9
  DEFAULT_STATION_IDS = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]
9
10
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
10
11
 
11
12
 
12
- @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
13
+ @retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
13
14
  def get(
14
15
  workspace: str,
15
16
  name: str,
17
+ test_number: int,
16
18
  station_ids: list = DEFAULT_STATION_IDS,
17
19
  date_min: str = "1950-01-01",
18
20
  date_max: str = DATE_NOW,
19
21
  **kwargs: str | list
20
22
  ) -> None:
21
- station_ids_str = "\"" + "\", \"".join(station_ids) + "\""
22
- r(
23
- f"""
24
- # Load the required libraries
25
- library(rio)
26
- library(dbhydroR)
27
-
28
- # Specify the station IDs, date range, and test names
29
- station_ids <- c({station_ids_str})
30
- date_min <- "{date_min}"
31
- date_max <- "{date_max}"
32
- test_names <- c("{name}")
33
-
34
- # Loop over the station IDs
35
- for (station_id in station_ids) {{
36
- # Retrieve water quality data for the current station ID
37
- water_quality_data <- tryCatch(
38
- get_wq(
39
- station_id = station_id,
40
- date_min = date_min,
41
- date_max = date_max,
42
- test_name = test_names
43
- ),
44
- error = function(e) NULL
45
- )
46
-
47
- # Check if data is available for the current station ID and test name
48
- if (!is.null(water_quality_data) && nrow(water_quality_data) > 0) {{
49
- # Convert the vector to a data frame
50
- water_quality_data <- as.data.frame(water_quality_data)
51
-
52
- # Calculate the number of days from the minimum date plus 8
53
- water_quality_data$days <- as.integer(difftime(water_quality_data$date, min(water_quality_data$date), units = "days")) + as.integer(format(min(water_quality_data$date), "%d"))
54
-
55
- # Generate the filename based on the station ID
56
- filename <- paste0("{workspace}/water_quality_", station_id, "_", test_names, ".csv")
57
-
58
- # Save data to a CSV file
59
- write.csv(water_quality_data, file = filename)
60
-
61
- # Print a message indicating the file has been saved
62
- cat("CSV file", filename, "has been saved.\n")
63
- }} else {{
64
- # Print a message indicating no data was found for the current station ID and test name
65
- cat("No data found for station ID", station_id, "and test name", test_names, "\n")
66
- }}
67
- Sys.sleep(1) # Wait for 1 seconds before the next iteration
68
- }}
69
- """ # noqa: E501
70
- )
23
+ """Fetch water quality data from DBHydro API and save it as CSV files in the specified workspace.
24
+
25
+ Args:
26
+ workspace (str): The directory where the CSV files will be saved.
27
+ name (str): The name of the water quality parameter. Example: 'PHOSPHATE, TOTAL AS P'
28
+ test_number (int): The DBHydro test number for the water quality parameter.
29
+ station_ids (list, optional): List of station IDs to fetch data for. Defaults to DEFAULT_STATION_IDS.
30
+ date_min (str, optional): The start date for fetching data in YYYY-MM-DD format. Defaults to "1950-01-01".
31
+ date_max (str, optional): The end date for fetching data in YYYY-MM-DD format. Defaults to the current date.
32
+ **kwargs: Additional keyword arguments.
33
+
34
+ Returns:
35
+ None
36
+ """
37
+
38
+ # Initialize the DBHydro API
39
+ api = get_dbhydro_api()
40
+
41
+ # Fetch water quality data
42
+ response = api.get_water_quality(stations=station_ids, test_numbers=[test_number], date_start=date_min, date_end=date_max, exclude_flagged_results=False)
43
+ df = response.to_dataframe(include_metadata=True)
44
+
45
+ # Process and save data for each station
46
+ for station in station_ids:
47
+ # Get a copy of the data frame for this station
48
+ df_station = df[df['station'] == station].copy()
49
+
50
+ # Check if the data frame is empty
51
+ if df_station.empty:
52
+ print(f'No data found for station ID {station} and test number {test_number}.')
53
+ continue
54
+
55
+ # Get the units of the data
56
+ units = df_station['units'].iloc[0] if 'units' in df_station.columns else ''
57
+
58
+ # Drop unwanted columns
59
+ df_station = df_station[['date_collected_str', 'sig_fig_value']].copy()
60
+
61
+ # Convert string sig_fig_value to numeric
62
+ df_station['sig_fig_value'] = pd.to_numeric(df_station['sig_fig_value'], errors='coerce')
63
+
64
+ # Calculate daily average values
65
+ df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str'])
66
+ df_station["date_only"] = df_station["date_collected_str"].dt.date
67
+ df_station = df_station.groupby("date_only")["sig_fig_value"].mean().reset_index()
68
+ df_station.rename(columns={"date_only": "date_collected_str"}, inplace=True)
69
+
70
+ # Format dataframe to expected layout
71
+ df_station['date_collected_str'] = pd.to_datetime(df_station['date_collected_str']) # Convert date_collected_str column to datetime
72
+ df_station.sort_values('date_collected_str', inplace=True) # Sort df by date_collected_str
73
+ df_station.rename(columns={'date_collected_str': 'date', 'sig_fig_value': f'{station}_{name}_{units}'}, inplace=True) # Rename columns
74
+
75
+ # Calculate the days column
76
+ df_station['days'] = (df_station['date'] - df_station['date'].min()).dt.days + df_station['date'].min().day
77
+
78
+ # Make sure the integer index is written out (for backwards compatibility)
79
+ df_station.reset_index(inplace=True, drop=True)
80
+
81
+ # Start index at 1 instead of 0 (for backwards compatibility)
82
+ df_station.index = df_station.index + 1
83
+
84
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
85
+ df_station.index = df_station.index.astype(str)
86
+
87
+ # Make sure the date column includes time information at midnight (for backwards compatibility)
88
+ df_station['date'] = df_station['date'].dt.strftime('%Y-%m-%d 00:00:00')
89
+
90
+ # Write out the data frame to a CSV file
91
+ df_station.to_csv(os.path.join(workspace, f'water_quality_{station}_{name}.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC)
92
+
93
+ # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
94
+ rewrite_water_quality_file_without_date_quotes(workspace, f'water_quality_{station}_{name}.csv')
71
95
 
72
96
 
73
- def _calculate_days_column(workspace: str, file_name: str, date_min: str):
97
+ def _calculate_days_column(workspace: str, df: pd.DataFrame, date_min: str):
74
98
  """
75
99
  Calculates the values that should be in the "days" column of the water quality data CSV file
76
100
  based on the given date_min and writes the updated data frame back to the CSV file.
77
101
 
78
102
  Args:
79
103
  workspace (str): The path to the workspace directory.
80
- file_name (str): The name of the water quality data CSV file.
104
+ df (pd.DataFrame): The water quality data dataframe.
81
105
  date_min (str): The minimum date that the "days" column values should be calculated from. Should be in format "YYYY-MM-DD".
82
106
  """
83
- r(
84
- f"""
85
- # Import necessary libraries
86
- library(lubridate)
87
-
88
- # Read the CSV file
89
- df <- read.csv("{workspace}/{file_name}", check.names = FALSE)
90
-
91
- # Drop the "X" column that R adds when reading CSV files
92
- df <- df[,-1]
93
-
94
- # Get date_min as an object with the correct timezone
95
- date_min_object <- as.POSIXct("{date_min}", tz = "UTC")
96
- date_min_tz <- format(with_tz(date_min_object, tzone = "America/New_York"), "%Z")
97
- date_min_object <- as.POSIXct("{date_min}", tz = date_min_tz)
98
-
99
- # Calculate each value in the days column based on the date_min
100
- for(i in 1:nrow(df))
101
- {{
102
- # Get the current row's date as an object with the correct timezone
103
- date <- as.POSIXct(df$date[i], tz = "UTC")
104
- date_tz <- format(with_tz(date, tzone = "America/New_York"), "%Z")
105
- date <- as.POSIXct(df$date[i], tz = date_tz)
106
-
107
- # Calculate the number of days from the minimum date to the row's date plus the number of days in date_min
108
- df$days[i] <- as.integer(difftime(date, date_min_object, units = "days")) + as.integer(format(date_min_object, "%d"))
109
- }}
110
-
111
- # Write the updated data frame back to the CSV file
112
- write.csv(df, file = "{workspace}/{file_name}", row.names = FALSE)
113
- """ # noqa: E501
114
- )
107
+ # Ensure df['date'] is a pandas datetime Series
108
+ df['date'] = pd.to_datetime(df['date'])
109
+ date_min_object = pd.to_datetime(date_min)
110
+
111
+ # Calculate days column for all rows
112
+ df['days'] = (df['date'] - date_min_object).dt.days + date_min_object.day
113
+
114
+ return df
115
+
116
+
117
+ def rewrite_water_quality_file_without_date_quotes(workspace: str, file_name: str) -> None:
118
+ """
119
+ Rewrites the given water quality CSV file so that the dates don't have double quotes around them (for backwards compatibility).
120
+
121
+ Args:
122
+ workspace (str): The path to the workspace directory.
123
+ file_name (str): The name of the water quality CSV file.
124
+ """
125
+ # Rewrite the file so dates don't have double quotes around them (for backwards compatibility)
126
+ file_path = os.path.join(workspace, file_name)
127
+ lines = []
128
+
129
+ with open(file_path, 'r') as file:
130
+ lines = file.readlines()
131
+
132
+ with open(file_path, 'w', newline='') as file:
133
+ line_number = 0
134
+ for line in lines:
135
+ if line_number != 0:
136
+ line_split = line.split(',')
137
+ line_split[1] = line_split[1].replace('"', '') # Remove quotes around dates (2nd column)
138
+ line = ','.join(line_split)
139
+ file.write(line)
140
+ line_number += 1
115
141
 
116
142
 
117
143
  if __name__ == "__main__":
@@ -88,7 +88,7 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
88
88
  continue
89
89
 
90
90
  # Check whether the latest data is already up to date.
91
- if dbhydro_data_is_latest(date_latest):
91
+ if dbhydro_data_is_latest(date_latest, dbkey):
92
92
  # Notify that the data is already up to date
93
93
  print(f'Downloading of new {name} data skipped for dbkey {dbkey}. Data is already up to date.')
94
94
  continue
@@ -99,8 +99,10 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
99
99
 
100
100
  try:
101
101
  # Download only the new data
102
- print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_latest}')
103
- weather.get(workspace, name, dbkeys=[dbkey], date_min=date_latest)
102
+ date_start = pd.to_datetime(date_latest) + pd.Timedelta(days=1)
103
+ date_start = date_start.strftime('%Y-%m-%d')
104
+ print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_start}')
105
+ weather.get(workspace, name, dbkeys=[dbkey], date_min=date_start)
104
106
 
105
107
  # Data failed to download - It's possible the data's end date has been reached
106
108
  if not os.path.exists(os.path.join(workspace, original_file_name)):
@@ -1,16 +1,17 @@
1
+ import os
1
2
  import sys
2
3
  from datetime import datetime
3
4
  from retry import retry
4
- from rpy2.robjects import r
5
- from rpy2.rinterface_lib.embedded import RRuntimeError
6
5
  import pandas as pd
6
+ from loone_data_prep.utils import df_replace_missing_with_nan, get_dbhydro_api
7
+ import csv
7
8
 
8
9
 
9
10
  DEFAULT_DBKEYS = ["16021", "12515", "12524", "13081"]
10
11
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
11
12
 
12
13
 
13
- @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
14
+ @retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
14
15
  def get(
15
16
  workspace: str,
16
17
  param: str,
@@ -19,8 +20,15 @@ def get(
19
20
  date_max: str = DATE_NOW,
20
21
  **kwargs: str | list
21
22
  ) -> None:
22
- dbkeys_str = "\"" + "\", \"".join(dbkeys) + "\""
23
-
23
+ """Fetches daily weather data from DBHYDRO for specified dbkeys and date range, and saves the data to CSV files in the specified workspace.
24
+
25
+ Args:
26
+ workspace (str): The directory where the CSV files will be saved.
27
+ param (str): The type of weather data to fetch (e.g., "RAIN", "ETPI").
28
+ dbkeys (list, optional): List of DBHYDRO dbkeys to fetch data for. Defaults to DEFAULT_DBKEYS.
29
+ date_min (str, optional): The start date for data retrieval in "YYYY-MM-DD" format. Defaults to "2000-01-01".
30
+ date_max (str, optional): The end date for data retrieval in "YYYY-MM-DD" format. Defaults to the current date.
31
+ """
24
32
  data_type = param
25
33
  data_units_file = None
26
34
  data_units_header = None
@@ -28,92 +36,49 @@ def get(
28
36
  # Get the units for the file name and column header based on the type of data
29
37
  data_units_file, data_units_header = _get_file_header_data_units(data_type)
30
38
 
31
- r_str = f"""
32
- download_weather_data <- function()#workspace, dbkeys, date_min, date_max, data_type, data_units_file, data_units_header)
33
- {{
34
- library(dbhydroR)
35
- library(dplyr)
36
-
37
- dbkeys <- c({dbkeys_str})
38
- successful_stations <- list()
39
-
40
- for (i in dbkeys)
41
- {{
42
- # Retrieve data for the dbkey
43
- data <- get_hydro(dbkey = i, date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
44
-
45
- # Give data.frame correct column names so it can be cleaned using the clean_hydro function
46
- column_names <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
47
- colnames(data) <- column_names
48
-
49
- # Check if the data.frame has any rows
50
- if (nrow(data) > 0)
51
- {{
52
- # Get the station
53
- station <- data$station[1]
54
-
55
- # Add a type and units column to data so it can be cleaned using the clean_hydro function
56
- data$type <- "{data_type}"
57
- data$units <- "{data_units_header}"
58
-
59
- # Clean the data.frame
60
- data <- clean_hydro(data)
61
-
62
- # Get the filename of the output file
63
- filename <- ""
64
-
65
- if ("{param}" %in% c("RADP", "RADT"))
66
- {{
67
- filename <- paste(station, "{data_type}", sep = "_")
68
- }}
69
- else
70
- {{
71
- filename <- paste(station, "{data_type}", "{data_units_file}", sep = "_")
72
- }}
73
-
74
- filename <- paste0(filename, ".csv")
75
- filename <- paste0("{workspace}/", filename)
76
-
77
- # Save data to a CSV file
78
- write.csv(data, file = filename)
79
-
80
- # Print a message indicating the file has been saved
81
- cat("CSV file", filename, "has been saved.\n")
82
-
83
- # Append the station to the list of successful stations
84
- successful_stations <- c(successful_stations, station)
85
- }}
86
- else
87
- {{
88
- # No data given back, It's possible that the dbkey has reached its end date.
89
- print(paste("Empty data.frame returned for dbkey", i, "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
90
- }}
91
-
92
- # Add a delay between requests
93
- Sys.sleep(2) # Wait for 2 seconds before the next iteration
94
- }}
95
-
96
- # Return the station and dbkey to the python code
97
- return(successful_stations)
98
- }}
99
- """ # noqa: E501
39
+ # Retrieve the data
40
+ api = get_dbhydro_api()
41
+ response = api.get_daily_data(dbkeys, 'id', date_min, date_max, 'NGVD29', False)
42
+
43
+ # Get the data as a dataframe
44
+ df = response.to_dataframe(True)
45
+
46
+ # Replace 0 values with NaN when their qualifier is either 'M' or 'N'
47
+ df = df_replace_missing_with_nan(df)
100
48
 
101
- # Download the weather data
102
- r(r_str)
103
- result = r.download_weather_data()
49
+ # Map each station to its own dataframe
50
+ station_dfs = {}
104
51
 
105
- # Get the stations of the dbkeys who's data were successfully downloaded
106
- stations = []
107
- for value in result:
108
- stations.append(value[0])
52
+ for site_code in response.get_site_codes():
53
+ station_dfs[site_code] = df[df['site_code'] == site_code].copy()
109
54
 
110
- # Format files to expected layout
111
- for station in stations:
112
- if station in ["L001", "L005", "L006", "LZ40"]:
113
- _reformat_weather_file(workspace, station, data_type, data_units_file, data_units_header)
114
-
115
- # Print a message indicating the file has been saved
116
- print(f"CSV file {workspace}/{station}_{data_type}_{data_units_file}.csv has been reformatted.")
55
+ # Write out each station's data to its own file
56
+ for station, station_df in station_dfs.items():
57
+ # Get metadata for the station
58
+ parameter_code = station_df['parameter_code'].iloc[0]
59
+ unit_code = station_df['unit_code'].iloc[0]
60
+
61
+ # Select only the desired columns
62
+ station_df = station_df[['value']].copy()
63
+
64
+ # Rename datetime index
65
+ station_df.index.rename('date', inplace=True)
66
+
67
+ # Rename the columns to the expected format
68
+ station_df.rename(columns={'value': f'{station}_{data_type}_{data_units_header}'}, inplace=True)
69
+
70
+ # Make the date index a column and use an integer index (for backwards compatibility)
71
+ station_df = station_df.reset_index()
72
+
73
+ # Get the name of the output file
74
+ file_name = ''
75
+ if data_type in ['RADP', 'RADT']:
76
+ file_name = f'{station}_{data_type}.csv'
77
+ else:
78
+ file_name = f'{station}_{data_type}_{data_units_file}.csv'
79
+
80
+ # Write out the station's data to a csv file
81
+ station_df.to_csv(os.path.join(workspace, file_name), index=True)
117
82
 
118
83
 
119
84
  def merge_data(workspace: str, data_type: str):
@@ -127,103 +92,75 @@ def merge_data(workspace: str, data_type: str):
127
92
 
128
93
  # Merge the data files for the different stations (LAKE_RAINFALL_DATA.csv)
129
94
  if data_type == "RAIN":
130
- r(
131
- f"""
132
- L001_RAIN_Inches <- read.csv("{workspace}/L001_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
133
- L005_RAIN_Inches = read.csv("{workspace}/L005_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
134
- L006_RAIN_Inches = read.csv("{workspace}/L006_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
135
- LZ40_RAIN_Inches = read.csv("{workspace}/LZ40_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
136
- #Replace NA values with zero
137
- L001_RAIN_Inches[is.na(L001_RAIN_Inches)] <- 0
138
- L005_RAIN_Inches[is.na(L005_RAIN_Inches)] <- 0
139
- L006_RAIN_Inches[is.na(L006_RAIN_Inches)] <- 0
140
- LZ40_RAIN_Inches[is.na(LZ40_RAIN_Inches)] <- 0
141
- # Merge the files by the "date" column
142
- merged_data <- merge(L001_RAIN_Inches, L005_RAIN_Inches, by = "date",all = TRUE)
143
- merged_data <- merge(merged_data, L006_RAIN_Inches, by = "date",all = TRUE)
144
- merged_data <- merge(merged_data, LZ40_RAIN_Inches, by = "date",all = TRUE)
145
- # Calculate the average rainfall per day
146
- merged_data$average_rainfall <- rowMeans(merged_data[, -1],na.rm = TRUE)
147
-
148
- # View the updated merged data
149
- head(merged_data)
150
- # Save merged data as a CSV file
151
- write.csv(merged_data, "{workspace}/LAKE_RAINFALL_DATA.csv", row.names = TRUE)
152
- """ # noqa: E501
153
- )
95
+ # Read in rain data
96
+ l001_rain_inches = pd.read_csv(os.path.join(workspace, 'L001_RAIN_Inches.csv'), index_col=0)
97
+ l005_rain_inches = pd.read_csv(os.path.join(workspace, 'L005_RAIN_Inches.csv'), index_col=0)
98
+ l006_rain_inches = pd.read_csv(os.path.join(workspace, 'L006_RAIN_Inches.csv'), index_col=0)
99
+ lz40_rain_inches = pd.read_csv(os.path.join(workspace, 'LZ40_RAIN_Inches.csv'), index_col=0)
100
+
101
+ # Replace NaN values with 0
102
+ l001_rain_inches.fillna(0, inplace=True)
103
+ l005_rain_inches.fillna(0, inplace=True)
104
+ l006_rain_inches.fillna(0, inplace=True)
105
+ lz40_rain_inches.fillna(0, inplace=True)
106
+
107
+ # Merge the data by the "date" column
108
+ merged_data = pd.merge(l001_rain_inches, l005_rain_inches, on="date", how="outer")
109
+ merged_data = pd.merge(merged_data, l006_rain_inches, on="date", how="outer")
110
+ merged_data = pd.merge(merged_data, lz40_rain_inches, on="date", how="outer")
111
+
112
+ # Calculate the average rainfall per day
113
+ merged_data['average_rainfall'] = merged_data.iloc[:, 1:].mean(axis=1)
114
+
115
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
116
+ merged_data.index = merged_data.index.astype(str)
117
+
118
+ # Save merged data as a CSV file
119
+ merged_data.applymap(lambda x: round(x, 4) if isinstance(x, (float, int)) else x)
120
+ merged_data.to_csv(os.path.join(workspace, 'LAKE_RAINFALL_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC)
154
121
 
155
122
  # Merge the data files for the different stations (LOONE_AVERAGE_ETPI_DATA.csv)
156
123
  if data_type == "ETPI":
157
- r(
158
- f"""
159
- L001_ETPI_Inches <- read.csv("{workspace}/L001_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
160
- L005_ETPI_Inches = read.csv("{workspace}/L005_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
161
- L006_ETPI_Inches = read.csv("{workspace}/L006_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
162
- LZ40_ETPI_Inches = read.csv("{workspace}/LZ40_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
163
-
164
- # Replace NA values with zero
165
- L001_ETPI_Inches[is.na(L001_ETPI_Inches)] <- 0
166
- L005_ETPI_Inches[is.na(L005_ETPI_Inches)] <- 0
167
- L006_ETPI_Inches[is.na(L006_ETPI_Inches)] <- 0
168
- LZ40_ETPI_Inches[is.na(LZ40_ETPI_Inches)] <- 0
169
- # Merge the files by the "date" column
170
- merged_data <- merge(L001_ETPI_Inches, L005_ETPI_Inches, by = "date",all = TRUE)
171
- merged_data <- merge(merged_data, L006_ETPI_Inches, by = "date",all = TRUE)
172
- merged_data <- merge(merged_data, LZ40_ETPI_Inches, by = "date",all = TRUE)
173
- # Calculate the average rainfall per day
174
- merged_data$average_ETPI <- rowMeans(merged_data[, -1],na.rm = TRUE)
175
-
176
- # View the updated merged data
177
- head(merged_data)
178
- # Save merged data as a CSV file
179
- write.csv(merged_data, "{workspace}/LOONE_AVERAGE_ETPI_DATA.csv", row.names = TRUE)
180
- """ # noqa: E501
181
- )
182
-
124
+ # Read in ETPI data
125
+ l001_etpi_inches = pd.read_csv(os.path.join(workspace, 'L001_ETPI_Inches.csv'), index_col=0)
126
+ l005_etpi_inches = pd.read_csv(os.path.join(workspace, 'L005_ETPI_Inches.csv'), index_col=0)
127
+ l006_etpi_inches = pd.read_csv(os.path.join(workspace, 'L006_ETPI_Inches.csv'), index_col=0)
128
+ lz40_etpi_inches = pd.read_csv(os.path.join(workspace, 'LZ40_ETPI_Inches.csv'), index_col=0)
129
+
130
+ # Replace NaN values with 0
131
+ l001_etpi_inches.fillna(0, inplace=True)
132
+ l005_etpi_inches.fillna(0, inplace=True)
133
+ l006_etpi_inches.fillna(0, inplace=True)
134
+ lz40_etpi_inches.fillna(0, inplace=True)
135
+
136
+ # Merge the data by the "date" column
137
+ merged_data = pd.merge(l001_etpi_inches, l005_etpi_inches, on="date", how="outer")
138
+ merged_data = pd.merge(merged_data, l006_etpi_inches, on="date", how="outer")
139
+ merged_data = pd.merge(merged_data, lz40_etpi_inches, on="date", how="outer")
140
+
141
+ # Calculate the average ETPI per day
142
+ merged_data['average_ETPI'] = merged_data.iloc[:, 1:].mean(axis=1)
143
+
144
+ # Make sure the integer index values are quoted in the csv file (for backwards compatibility)
145
+ merged_data.index = merged_data.index.astype(str)
146
+
147
+ # Save merged data as a CSV file
148
+ merged_data.to_csv(os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC, na_rep='NA')
149
+
150
+ # Rewrite the file so NA values aren't quoted (for backwards compatibility)
151
+ file_path = os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv')
152
+ lines = []
183
153
 
184
- def _reformat_weather_file(workspace: str, station: str, data_type: str, data_units_file: str, data_units_header: str) -> None:
185
- '''
186
- Reformats the dbhydro weather file to the layout expected by the rest of the LOONE scripts.
187
- This function reads in and writes out a .csv file.
188
-
189
- Args:
190
- workspace (str): The path to the workspace directory.
191
- station (str): The station name. Ex: L001, L005, L006, LZ40.
192
- data_type (str): The type of data. Ex: RAIN, ETPI, H2OT, RADP, RADT, AIRT, WNDS.
193
- data_units_file (str): The units for the file name. Ex: Inches, Degrees Celsius, etc.
194
- data_units_header (str): The units for the column header. Ex: Inches, Degrees Celsius, etc. Can differ from data_units_file when data_type is either RADP or RADT.
154
+ with open(file_path, 'r') as file:
155
+ lines = file.readlines()
195
156
 
196
- Returns:
197
- None
198
- '''
199
- # Read in the data
200
- df = None
201
- if data_type in ['RADP', 'RADT']:
202
- df = pd.read_csv(f"{workspace}/{station}_{data_type}.csv")
203
- else:
204
- df = pd.read_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
205
-
206
- # Remove unneeded column columns
207
- df.drop(f' _{data_type}_{data_units_header}', axis=1, inplace=True)
208
- df.drop('Unnamed: 0', axis=1, inplace=True)
209
-
210
- # Convert date column to datetime
211
- df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
212
-
213
- # Sort the data by date
214
- df.sort_values('date', inplace=True)
215
-
216
- # Renumber the index
217
- df.reset_index(drop=True, inplace=True)
218
-
219
- # Drop rows that are missing all their values
220
- df.dropna(how='all', inplace=True)
221
-
222
- # Write the updated data back to the file
223
- if data_type in ['RADP', 'RADT']:
224
- df.to_csv(f"{workspace}/{station}_{data_type}.csv")
225
- else:
226
- df.to_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
157
+ with open(file_path, 'w', newline='') as file:
158
+ for line in lines:
159
+ line = line.replace(',"NA"', ',NA')
160
+ line = line.replace('"NA",', 'NA,')
161
+ line = line.replace(',"NaN"', ',NA')
162
+ line = line.replace('"NaN",', 'NA,')
163
+ file.write(line)
227
164
 
228
165
 
229
166
  def _get_file_header_data_units(data_type: str) -> tuple[str, str]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: loone_data_prep
3
- Version: 1.2.4
3
+ Version: 1.3.1
4
4
  Summary: Prepare data to run the LOONE model.
5
5
  Author-email: Osama Tarabih <osamatarabih@usf.edu>
6
6
  Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -18,7 +18,6 @@ License: BSD-3-Clause License
18
18
 
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: rpy2
22
21
  Requires-Dist: retry
23
22
  Requires-Dist: numpy<2
24
23
  Requires-Dist: pandas
@@ -30,6 +29,7 @@ Requires-Dist: requests_cache
30
29
  Requires-Dist: retry-requests
31
30
  Requires-Dist: eccodes==2.41.0
32
31
  Requires-Dist: xarray==2025.4.0
32
+ Requires-Dist: dbhydro-py
33
33
  Dynamic: license-file
34
34
 
35
35
  LOONE_DATA_PREP
@@ -40,11 +40,6 @@ Prepare data for the LOONE water quality model.
40
40
  Line to the LOONE model: [https://pypi.org/project/loone](https://pypi.org/project/loone)
41
41
  Link to LOONE model repository: [https://github.com/Aquaveo/LOONE](https://github.com/Aquaveo/LOONE)
42
42
 
43
- ## Prerequisites:
44
-
45
- * R ([https://www.r-project.org/](https://www.r-project.org/))
46
- * R packages: dbhydroR, rio, dplyr
47
-
48
43
  ## Installation:
49
44
 
50
45
  ```bash
@@ -103,7 +98,6 @@ dbkeys = get_dbkeys(
103
98
  stat="MEAN",
104
99
  recorder="CR10",
105
100
  freq="DA",
106
- detail_level="dbkey"
107
101
  )
108
102
 
109
103
  # Get water level data