loone-data-prep 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +47 -16
- loone_data_prep/LOONE_DATA_PREP.py +0 -1
- loone_data_prep/dbhydro_insights.py +195 -0
- loone_data_prep/flow_data/S65E_total.py +57 -57
- loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
- loone_data_prep/flow_data/get_forecast_flows.py +19 -105
- loone_data_prep/flow_data/get_inflows.py +18 -8
- loone_data_prep/flow_data/get_outflows.py +16 -7
- loone_data_prep/flow_data/hydro.py +62 -91
- loone_data_prep/forecast_scripts/get_Chla_predicted.py +1 -1
- loone_data_prep/forecast_scripts/get_NO_Loads_predicted.py +1 -1
- loone_data_prep/forecast_scripts/new_combined_weather_forecast.py +220 -0
- loone_data_prep/utils.py +262 -32
- loone_data_prep/water_level_data/get_all.py +52 -44
- loone_data_prep/water_level_data/hydro.py +49 -68
- loone_data_prep/water_quality_data/get_inflows.py +69 -27
- loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
- loone_data_prep/water_quality_data/wq.py +114 -88
- loone_data_prep/weather_data/get_all.py +5 -3
- loone_data_prep/weather_data/weather.py +117 -180
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
- loone_data_prep-1.3.1.dist-info/RECORD +38 -0
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
- loone_data_prep/forecast_scripts/create_forecast_LOWs.py +0 -170
- loone_data_prep/forecast_scripts/weather_forecast.py +0 -199
- loone_data_prep-1.2.4.dist-info/RECORD +0 -38
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0
|
@@ -1,117 +1,143 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import os
|
|
1
3
|
import sys
|
|
2
4
|
from datetime import datetime
|
|
3
5
|
from retry import retry
|
|
4
|
-
|
|
5
|
-
from
|
|
6
|
-
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from loone_data_prep.utils import get_dbhydro_api
|
|
7
8
|
|
|
8
9
|
DEFAULT_STATION_IDS = ["L001", "L004", "L005", "L006", "L007", "L008", "LZ40"]
|
|
9
10
|
DATE_NOW = datetime.now().strftime("%Y-%m-%d")
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
@retry(
|
|
13
|
+
@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
def get(
    workspace: str,
    name: str,
    test_number: int,
    station_ids: list = DEFAULT_STATION_IDS,
    date_min: str = "1950-01-01",
    date_max: str = DATE_NOW,
    **kwargs: str | list
) -> None:
    """Download water quality results from DBHydro and write one CSV per station.

    A single request is made for all stations. For every station that has
    results, the values are averaged per calendar day and written to
    ``water_quality_{station}_{name}.csv`` inside ``workspace``.

    Args:
        workspace (str): Directory the CSV files are written to.
        name (str): Name of the water quality parameter; used in the value column header and the file name. Example: 'PHOSPHATE, TOTAL AS P'
        test_number (int): DBHydro test number of the water quality parameter.
        station_ids (list, optional): Station IDs to request. Defaults to DEFAULT_STATION_IDS.
        date_min (str, optional): Start date in YYYY-MM-DD format. Defaults to "1950-01-01".
        date_max (str, optional): End date in YYYY-MM-DD format. Defaults to DATE_NOW (the date computed when the module was imported).
        **kwargs: Accepted but not used by this function.

    Returns:
        None
    """
    # Request the results for every station in one call
    wq_response = get_dbhydro_api().get_water_quality(
        stations=station_ids,
        test_numbers=[test_number],
        date_start=date_min,
        date_end=date_max,
        exclude_flagged_results=False,
    )
    all_results = wq_response.to_dataframe(include_metadata=True)

    for station in station_ids:
        # Work on an independent copy of this station's rows
        samples = all_results.loc[all_results['station'] == station].copy()

        if samples.empty:
            print(f'No data found for station ID {station} and test number {test_number}.')
            continue

        # Units come from the first row when the column is present
        units = samples['units'].iloc[0] if 'units' in samples.columns else ''
        value_column = f'{station}_{name}_{units}'
        file_name = f'water_quality_{station}_{name}.csv'

        # Keep only the timestamp and the value, with the value as a number
        samples = samples[['date_collected_str', 'sig_fig_value']].copy()
        samples['sig_fig_value'] = pd.to_numeric(samples['sig_fig_value'], errors='coerce')

        # Average all samples collected on the same calendar day
        samples['date_only'] = pd.to_datetime(samples['date_collected_str']).dt.date
        daily = samples.groupby('date_only')['sig_fig_value'].mean().reset_index()

        # Shape the frame into the expected layout: date, value, days
        daily = daily.rename(columns={'date_only': 'date', 'sig_fig_value': value_column})
        daily['date'] = pd.to_datetime(daily['date'])
        daily = daily.sort_values('date')

        # Days elapsed since the first date, offset by that date's day of month
        first_date = daily['date'].min()
        daily['days'] = (daily['date'] - first_date).dt.days + first_date.day

        # 1-based string index so it is written quoted (for backwards compatibility)
        daily = daily.reset_index(drop=True)
        daily.index = (daily.index + 1).astype(str)

        # Dates carry a midnight time component (for backwards compatibility)
        daily['date'] = daily['date'].dt.strftime('%Y-%m-%d 00:00:00')

        daily.to_csv(os.path.join(workspace, file_name), index=True, quoting=csv.QUOTE_NONNUMERIC)

        # Strip the double quotes around the dates (for backwards compatibility)
        rewrite_water_quality_file_without_date_quotes(workspace, file_name)
|
|
71
95
|
|
|
72
96
|
|
|
73
|
-
def _calculate_days_column(workspace: str,
|
|
97
|
+
def _calculate_days_column(workspace: str, df: pd.DataFrame, date_min: str):
|
|
74
98
|
"""
|
|
75
99
|
Calculates the values that should be in the "days" column of the water quality data CSV file
|
|
76
100
|
based on the given date_min and writes the updated data frame back to the CSV file.
|
|
77
101
|
|
|
78
102
|
Args:
|
|
79
103
|
workspace (str): The path to the workspace directory.
|
|
80
|
-
|
|
104
|
+
df (pd.DataFrame): The water quality data dataframe.
|
|
81
105
|
date_min (str): The minimum date that the "days" column values should be calculated from. Should be in format "YYYY-MM-DD".
|
|
82
106
|
"""
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
107
|
+
# Ensure df['date'] is a pandas datetime Series
|
|
108
|
+
df['date'] = pd.to_datetime(df['date'])
|
|
109
|
+
date_min_object = pd.to_datetime(date_min)
|
|
110
|
+
|
|
111
|
+
# Calculate days column for all rows
|
|
112
|
+
df['days'] = (df['date'] - date_min_object).dt.days + date_min_object.day
|
|
113
|
+
|
|
114
|
+
return df
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def rewrite_water_quality_file_without_date_quotes(workspace: str, file_name: str) -> None:
    """
    Rewrites the given water quality CSV file so that the dates don't have double quotes around them (for backwards compatibility).

    The first line (the header) is left untouched. On every other line the double quotes
    are removed from the second comma-separated field. Lines with fewer than two fields
    (for example blank lines) are written back unchanged.

    Args:
        workspace (str): The path to the workspace directory.
        file_name (str): The name of the water quality CSV file.
    """
    file_path = os.path.join(workspace, file_name)

    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Build the full output before reopening the file for writing, so an
    # unexpected line can never leave the file truncated.
    rewritten = []
    for line_number, line in enumerate(lines):
        if line_number != 0:
            fields = line.split(',')
            if len(fields) > 1:
                fields[1] = fields[1].replace('"', '')  # Remove quotes around dates (2nd column)
                line = ','.join(fields)
        rewritten.append(line)

    with open(file_path, 'w', newline='') as file:
        file.writelines(rewritten)
|
|
115
141
|
|
|
116
142
|
|
|
117
143
|
if __name__ == "__main__":
|
|
@@ -88,7 +88,7 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
|
|
|
88
88
|
continue
|
|
89
89
|
|
|
90
90
|
# Check whether the latest data is already up to date.
|
|
91
|
-
if dbhydro_data_is_latest(date_latest):
|
|
91
|
+
if dbhydro_data_is_latest(date_latest, dbkey):
|
|
92
92
|
# Notify that the data is already up to date
|
|
93
93
|
print(f'Downloading of new {name} data skipped for dbkey {dbkey}. Data is already up to date.')
|
|
94
94
|
continue
|
|
@@ -99,8 +99,10 @@ def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) ->
|
|
|
99
99
|
|
|
100
100
|
try:
|
|
101
101
|
# Download only the new data
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
date_start = pd.to_datetime(date_latest) + pd.Timedelta(days=1)
|
|
103
|
+
date_start = date_start.strftime('%Y-%m-%d')
|
|
104
|
+
print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_start}')
|
|
105
|
+
weather.get(workspace, name, dbkeys=[dbkey], date_min=date_start)
|
|
104
106
|
|
|
105
107
|
# Data failed to download - It's possible the data's end date has been reached
|
|
106
108
|
if not os.path.exists(os.path.join(workspace, original_file_name)):
|
|
@@ -1,16 +1,17 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import sys
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
from retry import retry
|
|
4
|
-
from rpy2.robjects import r
|
|
5
|
-
from rpy2.rinterface_lib.embedded import RRuntimeError
|
|
6
5
|
import pandas as pd
|
|
6
|
+
from loone_data_prep.utils import df_replace_missing_with_nan, get_dbhydro_api
|
|
7
|
+
import csv
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
DEFAULT_DBKEYS = ["16021", "12515", "12524", "13081"]
|
|
10
11
|
DATE_NOW = datetime.now().strftime("%Y-%m-%d")
|
|
11
12
|
|
|
12
13
|
|
|
13
|
-
@retry(
|
|
14
|
+
@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
|
|
14
15
|
def get(
|
|
15
16
|
workspace: str,
|
|
16
17
|
param: str,
|
|
@@ -19,8 +20,15 @@ def get(
|
|
|
19
20
|
date_max: str = DATE_NOW,
|
|
20
21
|
**kwargs: str | list
|
|
21
22
|
) -> None:
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
"""Fetches daily weather data from DBHYDRO for specified dbkeys and date range, and saves the data to CSV files in the specified workspace.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
workspace (str): The directory where the CSV files will be saved.
|
|
27
|
+
param (str): The type of weather data to fetch (e.g., "RAIN", "ETPI").
|
|
28
|
+
dbkeys (list, optional): List of DBHYDRO dbkeys to fetch data for. Defaults to DEFAULT_DBKEYS.
|
|
29
|
+
date_min (str, optional): The start date for data retrieval in "YYYY-MM-DD" format. Defaults to "2000-01-01".
|
|
30
|
+
date_max (str, optional): The end date for data retrieval in "YYYY-MM-DD" format. Defaults to the current date.
|
|
31
|
+
"""
|
|
24
32
|
data_type = param
|
|
25
33
|
data_units_file = None
|
|
26
34
|
data_units_header = None
|
|
@@ -28,92 +36,49 @@ def get(
|
|
|
28
36
|
# Get the units for the file name and column header based on the type of data
|
|
29
37
|
data_units_file, data_units_header = _get_file_header_data_units(data_type)
|
|
30
38
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
for (i in dbkeys)
|
|
41
|
-
{{
|
|
42
|
-
# Retrieve data for the dbkey
|
|
43
|
-
data <- get_hydro(dbkey = i, date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
|
|
44
|
-
|
|
45
|
-
# Give data.frame correct column names so it can be cleaned using the clean_hydro function
|
|
46
|
-
column_names <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
|
|
47
|
-
colnames(data) <- column_names
|
|
48
|
-
|
|
49
|
-
# Check if the data.frame has any rows
|
|
50
|
-
if (nrow(data) > 0)
|
|
51
|
-
{{
|
|
52
|
-
# Get the station
|
|
53
|
-
station <- data$station[1]
|
|
54
|
-
|
|
55
|
-
# Add a type and units column to data so it can be cleaned using the clean_hydro function
|
|
56
|
-
data$type <- "{data_type}"
|
|
57
|
-
data$units <- "{data_units_header}"
|
|
58
|
-
|
|
59
|
-
# Clean the data.frame
|
|
60
|
-
data <- clean_hydro(data)
|
|
61
|
-
|
|
62
|
-
# Get the filename of the output file
|
|
63
|
-
filename <- ""
|
|
64
|
-
|
|
65
|
-
if ("{param}" %in% c("RADP", "RADT"))
|
|
66
|
-
{{
|
|
67
|
-
filename <- paste(station, "{data_type}", sep = "_")
|
|
68
|
-
}}
|
|
69
|
-
else
|
|
70
|
-
{{
|
|
71
|
-
filename <- paste(station, "{data_type}", "{data_units_file}", sep = "_")
|
|
72
|
-
}}
|
|
73
|
-
|
|
74
|
-
filename <- paste0(filename, ".csv")
|
|
75
|
-
filename <- paste0("{workspace}/", filename)
|
|
76
|
-
|
|
77
|
-
# Save data to a CSV file
|
|
78
|
-
write.csv(data, file = filename)
|
|
79
|
-
|
|
80
|
-
# Print a message indicating the file has been saved
|
|
81
|
-
cat("CSV file", filename, "has been saved.\n")
|
|
82
|
-
|
|
83
|
-
# Append the station to the list of successful stations
|
|
84
|
-
successful_stations <- c(successful_stations, station)
|
|
85
|
-
}}
|
|
86
|
-
else
|
|
87
|
-
{{
|
|
88
|
-
# No data given back, It's possible that the dbkey has reached its end date.
|
|
89
|
-
print(paste("Empty data.frame returned for dbkey", i, "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
|
|
90
|
-
}}
|
|
91
|
-
|
|
92
|
-
# Add a delay between requests
|
|
93
|
-
Sys.sleep(2) # Wait for 2 seconds before the next iteration
|
|
94
|
-
}}
|
|
95
|
-
|
|
96
|
-
# Return the station and dbkey to the python code
|
|
97
|
-
return(successful_stations)
|
|
98
|
-
}}
|
|
99
|
-
""" # noqa: E501
|
|
39
|
+
# Retrieve the data
|
|
40
|
+
api = get_dbhydro_api()
|
|
41
|
+
response = api.get_daily_data(dbkeys, 'id', date_min, date_max, 'NGVD29', False)
|
|
42
|
+
|
|
43
|
+
# Get the data as a dataframe
|
|
44
|
+
df = response.to_dataframe(True)
|
|
45
|
+
|
|
46
|
+
# Replace 0 values with NaN when their qualifier is either 'M' or 'N'
|
|
47
|
+
df = df_replace_missing_with_nan(df)
|
|
100
48
|
|
|
101
|
-
#
|
|
102
|
-
|
|
103
|
-
result = r.download_weather_data()
|
|
49
|
+
# Map each station to its own dataframe
|
|
50
|
+
station_dfs = {}
|
|
104
51
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
for value in result:
|
|
108
|
-
stations.append(value[0])
|
|
52
|
+
for site_code in response.get_site_codes():
|
|
53
|
+
station_dfs[site_code] = df[df['site_code'] == site_code].copy()
|
|
109
54
|
|
|
110
|
-
#
|
|
111
|
-
for station in
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
55
|
+
# Write out each station's data to its own file
|
|
56
|
+
for station, station_df in station_dfs.items():
|
|
57
|
+
# Get metadata for the station
|
|
58
|
+
parameter_code = station_df['parameter_code'].iloc[0]
|
|
59
|
+
unit_code = station_df['unit_code'].iloc[0]
|
|
60
|
+
|
|
61
|
+
# Select only the desired columns
|
|
62
|
+
station_df = station_df[['value']].copy()
|
|
63
|
+
|
|
64
|
+
# Rename datetime index
|
|
65
|
+
station_df.index.rename('date', inplace=True)
|
|
66
|
+
|
|
67
|
+
# Rename the columns to the expected format
|
|
68
|
+
station_df.rename(columns={'value': f'{station}_{data_type}_{data_units_header}'}, inplace=True)
|
|
69
|
+
|
|
70
|
+
# Make the date index a column and use an integer index (for backwards compatibility)
|
|
71
|
+
station_df = station_df.reset_index()
|
|
72
|
+
|
|
73
|
+
# Get the name of the output file
|
|
74
|
+
file_name = ''
|
|
75
|
+
if data_type in ['RADP', 'RADT']:
|
|
76
|
+
file_name = f'{station}_{data_type}.csv'
|
|
77
|
+
else:
|
|
78
|
+
file_name = f'{station}_{data_type}_{data_units_file}.csv'
|
|
79
|
+
|
|
80
|
+
# Write out the station's data to a csv file
|
|
81
|
+
station_df.to_csv(os.path.join(workspace, file_name), index=True)
|
|
117
82
|
|
|
118
83
|
|
|
119
84
|
def merge_data(workspace: str, data_type: str):
|
|
@@ -127,103 +92,75 @@ def merge_data(workspace: str, data_type: str):
|
|
|
127
92
|
|
|
128
93
|
# Merge the data files for the different stations (LAKE_RAINFALL_DATA.csv)
|
|
129
94
|
if data_type == "RAIN":
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
95
|
+
# Read in rain data
|
|
96
|
+
l001_rain_inches = pd.read_csv(os.path.join(workspace, 'L001_RAIN_Inches.csv'), index_col=0)
|
|
97
|
+
l005_rain_inches = pd.read_csv(os.path.join(workspace, 'L005_RAIN_Inches.csv'), index_col=0)
|
|
98
|
+
l006_rain_inches = pd.read_csv(os.path.join(workspace, 'L006_RAIN_Inches.csv'), index_col=0)
|
|
99
|
+
lz40_rain_inches = pd.read_csv(os.path.join(workspace, 'LZ40_RAIN_Inches.csv'), index_col=0)
|
|
100
|
+
|
|
101
|
+
# Replace NaN values with 0
|
|
102
|
+
l001_rain_inches.fillna(0, inplace=True)
|
|
103
|
+
l005_rain_inches.fillna(0, inplace=True)
|
|
104
|
+
l006_rain_inches.fillna(0, inplace=True)
|
|
105
|
+
lz40_rain_inches.fillna(0, inplace=True)
|
|
106
|
+
|
|
107
|
+
# Merge the data by the "date" column
|
|
108
|
+
merged_data = pd.merge(l001_rain_inches, l005_rain_inches, on="date", how="outer")
|
|
109
|
+
merged_data = pd.merge(merged_data, l006_rain_inches, on="date", how="outer")
|
|
110
|
+
merged_data = pd.merge(merged_data, lz40_rain_inches, on="date", how="outer")
|
|
111
|
+
|
|
112
|
+
# Calculate the average rainfall per day
|
|
113
|
+
merged_data['average_rainfall'] = merged_data.iloc[:, 1:].mean(axis=1)
|
|
114
|
+
|
|
115
|
+
# Make sure the integer index values are quoted in the csv file (for backwards compatibility)
|
|
116
|
+
merged_data.index = merged_data.index.astype(str)
|
|
117
|
+
|
|
118
|
+
# Save merged data as a CSV file
|
|
119
|
+
merged_data.applymap(lambda x: round(x, 4) if isinstance(x, (float, int)) else x)
|
|
120
|
+
merged_data.to_csv(os.path.join(workspace, 'LAKE_RAINFALL_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC)
|
|
154
121
|
|
|
155
122
|
# Merge the data files for the different stations (LOONE_AVERAGE_ETPI_DATA.csv)
|
|
156
123
|
if data_type == "ETPI":
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
)
|
|
182
|
-
|
|
124
|
+
# Read in ETPI data
|
|
125
|
+
l001_etpi_inches = pd.read_csv(os.path.join(workspace, 'L001_ETPI_Inches.csv'), index_col=0)
|
|
126
|
+
l005_etpi_inches = pd.read_csv(os.path.join(workspace, 'L005_ETPI_Inches.csv'), index_col=0)
|
|
127
|
+
l006_etpi_inches = pd.read_csv(os.path.join(workspace, 'L006_ETPI_Inches.csv'), index_col=0)
|
|
128
|
+
lz40_etpi_inches = pd.read_csv(os.path.join(workspace, 'LZ40_ETPI_Inches.csv'), index_col=0)
|
|
129
|
+
|
|
130
|
+
# Replace NaN values with 0
|
|
131
|
+
l001_etpi_inches.fillna(0, inplace=True)
|
|
132
|
+
l005_etpi_inches.fillna(0, inplace=True)
|
|
133
|
+
l006_etpi_inches.fillna(0, inplace=True)
|
|
134
|
+
lz40_etpi_inches.fillna(0, inplace=True)
|
|
135
|
+
|
|
136
|
+
# Merge the data by the "date" column
|
|
137
|
+
merged_data = pd.merge(l001_etpi_inches, l005_etpi_inches, on="date", how="outer")
|
|
138
|
+
merged_data = pd.merge(merged_data, l006_etpi_inches, on="date", how="outer")
|
|
139
|
+
merged_data = pd.merge(merged_data, lz40_etpi_inches, on="date", how="outer")
|
|
140
|
+
|
|
141
|
+
# Calculate the average ETPI per day
|
|
142
|
+
merged_data['average_ETPI'] = merged_data.iloc[:, 1:].mean(axis=1)
|
|
143
|
+
|
|
144
|
+
# Make sure the integer index values are quoted in the csv file (for backwards compatibility)
|
|
145
|
+
merged_data.index = merged_data.index.astype(str)
|
|
146
|
+
|
|
147
|
+
# Save merged data as a CSV file
|
|
148
|
+
merged_data.to_csv(os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv'), index=True, quoting=csv.QUOTE_NONNUMERIC, na_rep='NA')
|
|
149
|
+
|
|
150
|
+
# Rewrite the file so NA values aren't quoted (for backwards compatibility)
|
|
151
|
+
file_path = os.path.join(workspace, 'LOONE_AVERAGE_ETPI_DATA.csv')
|
|
152
|
+
lines = []
|
|
183
153
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
Reformats the dbhydro weather file to the layout expected by the rest of the LOONE scripts.
|
|
187
|
-
This function reads in and writes out a .csv file.
|
|
188
|
-
|
|
189
|
-
Args:
|
|
190
|
-
workspace (str): The path to the workspace directory.
|
|
191
|
-
station (str): The station name. Ex: L001, L005, L006, LZ40.
|
|
192
|
-
data_type (str): The type of data. Ex: RAIN, ETPI, H2OT, RADP, RADT, AIRT, WNDS.
|
|
193
|
-
data_units_file (str): The units for the file name. Ex: Inches, Degrees Celsius, etc.
|
|
194
|
-
data_units_header (str): The units for the column header. Ex: Inches, Degrees Celsius, etc. Can differ from data_units_file when data_type is either RADP or RADT.
|
|
154
|
+
with open(file_path, 'r') as file:
|
|
155
|
+
lines = file.readlines()
|
|
195
156
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
else:
|
|
204
|
-
df = pd.read_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
|
|
205
|
-
|
|
206
|
-
# Remove unneeded column columns
|
|
207
|
-
df.drop(f' _{data_type}_{data_units_header}', axis=1, inplace=True)
|
|
208
|
-
df.drop('Unnamed: 0', axis=1, inplace=True)
|
|
209
|
-
|
|
210
|
-
# Convert date column to datetime
|
|
211
|
-
df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
|
|
212
|
-
|
|
213
|
-
# Sort the data by date
|
|
214
|
-
df.sort_values('date', inplace=True)
|
|
215
|
-
|
|
216
|
-
# Renumber the index
|
|
217
|
-
df.reset_index(drop=True, inplace=True)
|
|
218
|
-
|
|
219
|
-
# Drop rows that are missing all their values
|
|
220
|
-
df.dropna(how='all', inplace=True)
|
|
221
|
-
|
|
222
|
-
# Write the updated data back to the file
|
|
223
|
-
if data_type in ['RADP', 'RADT']:
|
|
224
|
-
df.to_csv(f"{workspace}/{station}_{data_type}.csv")
|
|
225
|
-
else:
|
|
226
|
-
df.to_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
|
|
157
|
+
with open(file_path, 'w', newline='') as file:
|
|
158
|
+
for line in lines:
|
|
159
|
+
line = line.replace(',"NA"', ',NA')
|
|
160
|
+
line = line.replace('"NA",', 'NA,')
|
|
161
|
+
line = line.replace(',"NaN"', ',NA')
|
|
162
|
+
line = line.replace('"NaN",', 'NA,')
|
|
163
|
+
file.write(line)
|
|
227
164
|
|
|
228
165
|
|
|
229
166
|
def _get_file_header_data_units(data_type: str) -> tuple[str, str]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: loone_data_prep
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.3.1
|
|
4
4
|
Summary: Prepare data to run the LOONE model.
|
|
5
5
|
Author-email: Osama Tarabih <osamatarabih@usf.edu>
|
|
6
6
|
Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
|
|
@@ -18,7 +18,6 @@ License: BSD-3-Clause License
|
|
|
18
18
|
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: rpy2
|
|
22
21
|
Requires-Dist: retry
|
|
23
22
|
Requires-Dist: numpy<2
|
|
24
23
|
Requires-Dist: pandas
|
|
@@ -30,6 +29,7 @@ Requires-Dist: requests_cache
|
|
|
30
29
|
Requires-Dist: retry-requests
|
|
31
30
|
Requires-Dist: eccodes==2.41.0
|
|
32
31
|
Requires-Dist: xarray==2025.4.0
|
|
32
|
+
Requires-Dist: dbhydro-py
|
|
33
33
|
Dynamic: license-file
|
|
34
34
|
|
|
35
35
|
LOONE_DATA_PREP
|
|
@@ -40,11 +40,6 @@ Prepare data for the LOONE water quality model.
|
|
|
40
40
|
Link to the LOONE model: [https://pypi.org/project/loone](https://pypi.org/project/loone)
|
|
41
41
|
Link to LOONE model repository: [https://github.com/Aquaveo/LOONE](https://github.com/Aquaveo/LOONE)
|
|
42
42
|
|
|
43
|
-
## Prerequisites:
|
|
44
|
-
|
|
45
|
-
* R ([https://www.r-project.org/](https://www.r-project.org/))
|
|
46
|
-
* R packages: dbhydroR, rio, dplyr
|
|
47
|
-
|
|
48
43
|
## Installation:
|
|
49
44
|
|
|
50
45
|
```bash
|
|
@@ -103,7 +98,6 @@ dbkeys = get_dbkeys(
|
|
|
103
98
|
stat="MEAN",
|
|
104
99
|
recorder="CR10",
|
|
105
100
|
freq="DA",
|
|
106
|
-
detail_level="dbkey"
|
|
107
101
|
)
|
|
108
102
|
|
|
109
103
|
# Get water level data
|