loone-data-prep 1.2.4-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +47 -16
- loone_data_prep/LOONE_DATA_PREP.py +0 -1
- loone_data_prep/dbhydro_insights.py +195 -0
- loone_data_prep/flow_data/S65E_total.py +57 -57
- loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
- loone_data_prep/flow_data/get_forecast_flows.py +19 -105
- loone_data_prep/flow_data/get_inflows.py +18 -8
- loone_data_prep/flow_data/get_outflows.py +16 -7
- loone_data_prep/flow_data/hydro.py +62 -91
- loone_data_prep/forecast_scripts/get_Chla_predicted.py +1 -1
- loone_data_prep/forecast_scripts/get_NO_Loads_predicted.py +1 -1
- loone_data_prep/forecast_scripts/new_combined_weather_forecast.py +220 -0
- loone_data_prep/utils.py +262 -32
- loone_data_prep/water_level_data/get_all.py +52 -44
- loone_data_prep/water_level_data/hydro.py +49 -68
- loone_data_prep/water_quality_data/get_inflows.py +69 -27
- loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
- loone_data_prep/water_quality_data/wq.py +114 -88
- loone_data_prep/weather_data/get_all.py +5 -3
- loone_data_prep/weather_data/weather.py +117 -180
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
- loone_data_prep-1.3.1.dist-info/RECORD +38 -0
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
- loone_data_prep/forecast_scripts/create_forecast_LOWs.py +0 -170
- loone_data_prep/forecast_scripts/weather_forecast.py +0 -199
- loone_data_prep-1.2.4.dist-info/RECORD +0 -38
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0
loone_data_prep/utils.py
CHANGED
@@ -5,17 +5,14 @@ import math
 from glob import glob
 from calendar import monthrange
 import traceback
+from typing import Literal, Tuple
 import numpy as np
 import pandas as pd
 from retry import retry
 from scipy.optimize import fsolve
 from scipy import interpolate
-from rpy2.robjects import r
-from rpy2.robjects.vectors import (
-    StrVector as rpy2StrVector,
-    DataFrame as rpy2DataFrame,
-)
-from rpy2.rinterface_lib.embedded import RRuntimeError
+from dbhydro_py import DbHydroApi
+from loone_data_prep.dbhydro_insights import get_dbhydro_station_metadata, get_dbhydro_continuous_timeseries_metadata, get_dbhydro_water_quality_metadata


 DEFAULT_STATION_IDS = ["L001", "L005", "L006", "LZ40"]
@@ -224,7 +221,7 @@ DEFAULT_EXPFUNC_NITROGEN_CONSTANTS = {
     "S135_P": {"a": 3.09890183766129, "b": 0.657896838486496},
 }

-@retry(
+@retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
 def get_dbkeys(
     station_ids: list,
     category: str,
@@ -232,9 +229,8 @@ def get_dbkeys(
     stat: str,
     recorder: str,
     freq: str = "DA",
-    detail_level: str = "dbkey",
     *args: str,
-) ->
+) -> list[str]:
     """Get dbkeys. See DBHydroR documentation for more information:
     https://cran.r-project.org/web/packages/dbhydroR/dbhydroR.pdf

@@ -245,27 +241,68 @@ def get_dbkeys(
         stat (str): Statistic of data to retrieve.
         recorder (str): Recorder of data to retrieve.
         freq (str, optional): Frequency of data to retrieve. Defaults to "DA".
-        detail_level (str, optional): Detail level of data to retrieve. Defaults to "dbkey". Options are "dbkey",
-            "summary", or "full".

     Returns:
-
+        list[str]: dbkeys info for the specified parameters.
     """
+    # Retrieve the metadata for the specified parameters
+    metadata = get_dbhydro_continuous_timeseries_metadata(station_ids, [category], [param], [stat], [recorder], [freq])
+
+    # A set to hold the dbkeys to avoid duplicates
+    dbkeys = set()
+
+    # No data returned from API
+    if metadata is None:
+        return list(dbkeys)
+
+    # Get the dbkeys from the metadata
+    for result in metadata['results']:
+        dbkeys.add(result['timeseriesId'])
+
+    # Return the dbkeys as a list
+    return list(dbkeys)

-    station_ids_str = '"' + '", "'.join(station_ids) + '"'

-
-
-    library(dbhydroR)
+def get_stations_latitude_longitude(station_ids: list[str]):
+    """Gets the latitudes and longitudes of the given stations.

-
-
-
-    return(dbkeys)
-    """  # noqa: E501
-    )
+    Args:
+        station_ids (list[str]): The ids of the stations to get the
+            latitudes/longitudes of. Example: ['L OKEE', 'FISHP']

-
+    Returns:
+        (dict[str, tuple[numpy.float64, numpy.float64]]): A dictionary of
+            format dict<station_id:(latitude,longitude)>
+
+        If a station's latitude/longitude fails to download then its station_id
+        won't be a key in the returned dictionary.
+    """
+    # Dictionary to hold the latitude/longitude of each station
+    station_data = {}
+
+    # Get the latitude and longitude for each station
+    for station_id in station_ids:
+        # Retrieve the current station's metadata
+        station_metadata = get_dbhydro_station_metadata(station_id)
+
+        # Check if the metadata was successfully retrieved
+        if station_metadata is None:
+            print(f'Failed to get latitude/longitude for station {station_id} - No data given back from API')
+            continue
+
+        # Extract the latitude and longitude from the metadata
+        try:
+            latitude = station_metadata['features'][0]['attributes']['LAT']
+            longitude = station_metadata['features'][0]['attributes']['LONG']
+        except KeyError:
+            print(f'Failed to get latitude/longitude for station {station_id} - Unexpected response structure from API')
+            continue
+
+        # Add the latitude and longitude to the dictionary
+        station_data[station_id] = latitude, longitude
+
+    # Return the dictionary of station latitudes and longitudes
+    return station_data


 def data_interpolations(
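For orientation, a usage sketch of the two rewritten helpers. The category/param/stat/recorder values below are illustrative placeholders in the dbhydroR style, not values confirmed by this diff; the station ids for the coordinate lookup come from the function's own docstring example. Both calls hit the DBHYDRO API, so credentials must be configured (see get_dbhydro_api further down).

    from loone_data_prep.utils import get_dbkeys, get_stations_latitude_longitude

    # Hypothetical query fields ("SW", "STG", "MEAN", "CR10") for daily data.
    dbkeys = get_dbkeys(["L001", "L005"], "SW", "STG", "MEAN", "CR10", freq="DA")
    print(dbkeys)  # a de-duplicated list of timeseriesId strings

    # Stations that fail to resolve are simply absent from the result.
    coords = get_stations_latitude_longitude(["L OKEE", "FISHP"])
    for station_id, (lat, lon) in coords.items():
        print(station_id, lat, lon)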
@@ -916,9 +953,17 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:

     # Helper Functions
     def is_valid_date(date_string):
+        # Check for date without time part
         try:
             datetime.datetime.strptime(date_string, "%Y-%m-%d")
             return True
+        except ValueError:
+            pass
+
+        # Check for date with time part
+        try:
+            datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
+            return True
         except ValueError:
             return False

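With the second try/except, the nested helper now accepts timestamps as well as plain dates; a quick illustration of the resulting behavior:

    is_valid_date("2024-04-30")           # True: matches "%Y-%m-%d"
    is_valid_date("2024-04-30 13:45:00")  # True: matches "%Y-%m-%d %H:%M:%S"
    is_valid_date("04/30/2024")           # False: neither format parses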
@@ -955,23 +1000,69 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
     return None


-def dbhydro_data_is_latest(date_latest: str):
+def dbhydro_data_is_latest(date_latest: str, dbkey: str | None = None) -> bool:
     """
     Checks whether the given date is the most recent date possible to get data from dbhydro.
     Can be used to check whether dbhydro data is up-to-date.

     Args:
         date_latest (str): The date of the most recent data of the dbhydro data you have
+        dbkey (str | None, optional): The dbkey of the data you are checking. Defaults to None.

     Returns:
         bool: True if the date_latest is the most recent date possible to get data from dbhydro, False otherwise
     """
-
-
-
-
-
-
+    # Convert date_latest to a date object
+    date_latest_object = pd.to_datetime(date_latest).date()
+
+    # No dbkey provided
+    if dbkey is None:
+        # Assume latest data available is yesterday
+        return date_latest_object == (datetime.datetime.now().date() - datetime.timedelta(days=1))
+
+    # Get dbhydro api
+    dbhydro_api = get_dbhydro_api()
+
+    # Retrieve the last date available from dbhydro for the given dbkey
+    data = dbhydro_api.get_daily_data([dbkey], 'id', '1900-01-01', '1900-01-02', 'NGVD29', False)
+    last_date = data.time_series[0].period_of_record.por_last_date
+
+    # Use date part only (exclude time)
+    last_date = last_date.split("T")[0]
+
+    # Convert last_date to a date object
+    last_date_object = datetime.datetime.strptime(last_date, "%Y-%m-%d").date()
+
+    # Compare given date to last date from dbhydro
+    return date_latest_object >= last_date_object
+
+
+def dbhydro_water_quality_data_is_latest(date_latest: str, station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> bool:
+    """
+    Checks whether the given date is the most recent date possible to get water quality data from dbhydro.
+    Can be used to check whether dbhydro water quality data is up-to-date.
+
+    Args:
+        date_latest (str): The date of the most recent data of the dbhydro water quality data you have.
+        station (str): The station ID of the water quality data you are checking.
+        test_number (int): The test number of the water quality data you are checking. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
+
+    Returns:
+        bool: True if the date_latest is the most recent date possible to get water quality data from dbhydro, False otherwise
+    """
+    # Get the date range from dbhydro water quality data
+    date_start, date_end = get_dbhydro_water_quality_date_range(station, station_type, test_number)
+
+    # No end date available
+    if date_end is None:
+        # Assume data is not up-to-date
+        return False
+
+    # Convert date_latest to a datetime object
+    date_latest_object = pd.to_datetime(date_latest)
+
+    # Compare given date to last date from dbhydro
+    return date_latest_object >= date_end


 def get_synthetic_data(date_start: str, df: pd.DataFrame):
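Reading the new control flow: with no dbkey the check falls back to a "yesterday" heuristic, while passing a dbkey triggers a period-of-record lookup through dbhydro_py. A sketch ("16022" is a dbkey that appears in this package's own LO_Stage configuration; the dbkey path needs API credentials):

    import datetime

    from loone_data_prep.utils import dbhydro_data_is_latest

    # Fallback path: no dbkey, so "latest possible" is assumed to be yesterday.
    yesterday = (datetime.datetime.now().date() - datetime.timedelta(days=1)).isoformat()
    dbhydro_data_is_latest(yesterday)     # True
    dbhydro_data_is_latest("2020-01-01")  # False (stale)

    # dbkey path: queries the period of record via the API; requires
    # DBHYDRO_API_CLIENT_ID / DBHYDRO_API_CLIENT_SECRET to be set.
    dbhydro_data_is_latest("2024-04-30", dbkey="16022")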
@@ -996,14 +1087,31 @@ def get_synthetic_data(date_start: str, df: pd.DataFrame):
     end_month_day = date_end.strftime('%m-%d')

     # Filter the DataFrame to include only rows between date_start and date_end for all previous years
-
+    # (handle year wrap, e.g., Dec -> Jan)
+    wraps_year = start_month_day > end_month_day
+
+    if wraps_year:
+        mask = (
+            (df['month_day'] >= start_month_day) |
+            (df['month_day'] <= end_month_day)
+        )
+    else:
+        mask = (
+            (df['month_day'] >= start_month_day) &
+            (df['month_day'] <= end_month_day)
+        )
+
     filtered_data = df.loc[mask]

     # Group by the month and day, then calculate the average for each group
     average_values = filtered_data.groupby('month_day')['Data'].mean()
     # Interpolate in case there are missing values:
     start_date = pd.to_datetime('2001-' + start_month_day)
-
+
+    if wraps_year:
+        end_date = pd.to_datetime('2002-' + end_month_day)
+    else:
+        end_date = pd.to_datetime('2001-' + end_month_day)

     full_dates = pd.date_range(start=start_date, end=end_date)
     full_index = full_dates.strftime('%m-%d')
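The wrap test relies on zero-padded '%m-%d' strings sorting lexicographically in calendar order, so start > end can only happen when the window crosses New Year; anchoring end_date at 2002 instead of 2001 keeps the pd.date_range monotonic across the wrap (2001-12-15 through 2002-01-10). A small worked example:

    start_month_day, end_month_day = "12-15", "01-10"  # window: Dec 15 -> Jan 10
    wraps_year = start_month_day > end_month_day       # True ("12-15" > "01-10")

    # With wrapping, a row matches if it falls in EITHER tail of the year:
    for md in ("12-20", "01-05", "06-15"):
        print(md, md >= start_month_day or md <= end_month_day)
    # 12-20 True, 01-05 True, 06-15 False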
@@ -1021,6 +1129,128 @@ def get_synthetic_data(date_start: str, df: pd.DataFrame):
     return df


+def df_replace_missing_with_nan(df: pd.DataFrame, qualifier_codes: set = {'M', 'N'}, no_data_value: float = -99999.0) -> pd.DataFrame:
+    """
+    Replace values in the 'value' column of the DataFrame with NaN where the 'qualifier' column contains specified qualifier codes.
+
+    This was designed to work with dataframes created from dbhydro_py response data.
+    The dataframe must have 'value' and 'qualifier' columns.
+    Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
+
+    Args:
+        df (pd.DataFrame): DataFrame that was created from a dbhydro_py response. Must have value and qualifier columns.
+        qualifier_codes (set, optional): Set of qualifier codes indicating missing data. Defaults to {'M', 'N'}.
+        no_data_value (float, optional): Value representing no data. Defaults to -99999.0. Values equal to this will also be replaced with NaN.
+
+    Returns:
+        pd.DataFrame: DataFrame with specified values replaced with NaN.
+    """
+    # Replace 0 values with NaN when their qualifier is in qualifier_codes
+    # 'M' = Missing, 'N' = Not Yet Available
+    # Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
+    df.loc[df['qualifier'].isin(qualifier_codes), 'value'] = np.nan
+
+    # Also replace no_data_value with NaN
+    df.loc[np.isclose(df['value'], no_data_value), 'value'] = np.nan
+
+    # Return modified dataframe
+    return df
+
+
+def get_dbhydro_water_quality_date_range(station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> Tuple[pd.Timestamp | None, pd.Timestamp | None]:
+    """Get the start date and end date for the given station and test number from DBHYDRO water quality data.
+
+    Args:
+        station (str): The station names.
+        station_type (Literal['SITE', 'STATION']): The type of the station.
+        test_number (int): The test number of the data. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
+
+    Returns:
+        Tuple[pd.Timestamp | None, pd.Timestamp | None]: A tuple containing the start date and end date in 'MM/DD/YYYY' format.
+    """
+    response = get_dbhydro_water_quality_metadata([(station, station_type)], [test_number])
+
+    # No data given back by api
+    if response is None:
+        return (None, None)
+
+    # Get the date range from the response
+    if 'results' in response:
+        results = response['results']
+        if len(results) > 0:
+            # Find the first non-None start and end dates
+            date_start = None
+            date_end = None
+            for result in results:
+                date_start = result.get('startDate', None)
+                date_end = result.get('endDate', None)
+
+                # Dates found
+                if date_start is not None and date_end is not None:
+                    break
+
+            # If no valid dates were found, return early
+            if date_start is None or date_end is None:
+                return (date_start, date_end)
+
+            # Find the earliest start date and latest end date
+            for result in results:
+                date_start_current = result.get('startDate', None)
+                date_end_current = result.get('endDate', None)
+                if date_start_current is not None and pd.to_datetime(date_start_current) < pd.to_datetime(date_start):
+                    date_start = date_start_current
+                if date_end_current is not None and pd.to_datetime(date_end_current) > pd.to_datetime(date_end):
+                    date_end = date_end_current
+
+            # Convert dates to datetime objects
+            if date_start is not None:
+                date_start = pd.to_datetime(date_start)
+            if date_end is not None:
+                date_end = pd.to_datetime(date_end)
+
+            # Return the earliest start date and latest end date
+            return (date_start, date_end)
+
+    # No results found
+    return (None, None)
+
+
+def get_dbhydro_api_keys_from_environment() -> dict[str, str]:
+    """Get DBHYDRO API keys from environment variables.
+
+    Returns:
+        Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
+    """
+    # Get API keys from environment variables
+    api_keys = {
+        "client_id": os.environ.get("DBHYDRO_API_CLIENT_ID", ""),
+        "client_secret": os.environ.get("DBHYDRO_API_CLIENT_SECRET", ""),
+    }
+
+    # Return the API keys
+    return api_keys
+
+
+def get_dbhydro_api_keys() -> dict[str, str]:
+    """Get DBHYDRO API keys.
+
+    Returns:
+        Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
+    """
+    return get_dbhydro_api_keys_from_environment()
+
+
+def get_dbhydro_api() -> DbHydroApi:
+    """Get a configured DbHydroApi instance.
+
+    Returns:
+        DbHydroApi: An instance of the DbHydroApi class.
+    """
+    api_keys = get_dbhydro_api_keys()
+    dbhydro_api = DbHydroApi.with_default_adapter(client_id=api_keys["client_id"], client_secret=api_keys["client_secret"])
+    return dbhydro_api
+
+
 if __name__ == "__main__":
     if sys.argv[1] == "get_dbkeys":
         get_dbkeys(
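A sketch tying the new helpers together. The DataFrame is a toy stand-in for dbhydro_py response data (only the required 'value'/'qualifier' columns), and the environment variables are the ones read by get_dbhydro_api_keys_from_environment above:

    import numpy as np
    import pandas as pd

    from loone_data_prep.utils import df_replace_missing_with_nan, get_dbhydro_api

    # Toy frame: one clean value, one 'M' (missing), one -99999.0 sentinel, one 'N'.
    df = pd.DataFrame({
        "value": [1.2, 0.0, -99999.0, 3.4],
        "qualifier": [None, "M", None, "N"],
    })
    df = df_replace_missing_with_nan(df)
    print(df["value"].tolist())  # [1.2, nan, nan, nan]

    # API construction reads credentials from the environment:
    #   export DBHYDRO_API_CLIENT_ID=...
    #   export DBHYDRO_API_CLIENT_SECRET=...
    api = get_dbhydro_api()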
loone_data_prep/water_level_data/get_all.py
CHANGED

@@ -4,8 +4,7 @@ import requests
 import uuid
 from datetime import datetime
 from loone_data_prep.water_level_data import hydro
-from loone_data_prep.
-from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest, get_stations_latitude_longitude
 import pandas as pd

 DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")
@@ -13,11 +12,11 @@ DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")
 D = {
     "LO_Stage": {"dbkeys": ["16022", "12509", "12519", "16265", "15611"], "datum": "NGVD29"},
     "LO_Stage_2": {"dbkeys": ["94832"], "date_min": "2024-04-30", "datum": "NAVD88"},
-    "Stg_3ANW": {"dbkeys": ["LA369"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
-    "Stg_2A17": {"dbkeys": ["16531"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
-    "Stg_3A3": {"dbkeys": ["16532"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
-    "Stg_3A4": {"dbkeys": ["16537"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
-    "Stg_3A28": {"dbkeys": ["16538"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"}
+    "Stg_3ANW": {"dbkeys": ["LA369"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"G3ANW": "3A-NW"}},
+    "Stg_2A17": {"dbkeys": ["16531"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"2A-17": "2-17"}},
+    "Stg_3A3": {"dbkeys": ["16532"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-3": "3-63"}},
+    "Stg_3A4": {"dbkeys": ["16537"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-4": "3-64"}},
+    "Stg_3A28": {"dbkeys": ["16538"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-28": "3-65"}},
 }


@@ -25,9 +24,6 @@ def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
     failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)

-    # Get the date of the latest data in LO_Stage_2.csv
-    date_latest_lo_stage_2 = find_last_date_in_csv(workspace, "LO_Stage_2.csv")
-
     for name, params in d.items():
         # Get the date of the latest data in the csv file
         date_latest = find_last_date_in_csv(workspace, f"{name}.csv")
@@ -35,10 +31,18 @@ def main(workspace: str, d: dict = D) -> dict:
         # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
         if date_latest is None:
             print(f"Getting all water level data for {name}.")
+            params['date_max'] = DATE_NOW
             hydro.get(workspace, name, **params)
         else:
             # Check whether the latest data is already up to date.
-
+            requires_data_download = False
+            for dbkey in params['dbkeys']:
+                if not dbhydro_data_is_latest(date_latest, dbkey):
+                    requires_data_download = True
+                    break
+
+            # Data is already up to date
+            if not requires_data_download:
                 # Notify that the data is already up to date
                 print(f'Downloading of new water level data skipped for {name}. Data is already up to date.')
                 continue
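A side note on the staleness loop above: it short-circuits on the first stale dbkey, which is equivalent to this more compact formulation (a style observation only, not part of the package):

    requires_data_download = any(
        not dbhydro_data_is_latest(date_latest, dbkey)
        for dbkey in params['dbkeys']
    )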
@@ -50,21 +54,23 @@ def main(workspace: str, d: dict = D) -> dict:

             try:
                 # Download only the new data
-
-
+                date_next = (datetime.strptime(date_latest, "%Y-%m-%d") + pd.DateOffset(days=1)).date().strftime("%Y-%m-%d")
+                print(f'Downloading new water level data for {name} starting from date {date_next}')
+                kwargs = {}
+                if 'override_site_codes' in params:
+                    kwargs['override_site_codes'] = params['override_site_codes']
+                hydro.get(workspace, name, dbkeys=params['dbkeys'], date_min=date_next, date_max=DATE_NOW, datum=params['datum'], **kwargs)

                 # Read in the original data and the newly downloaded data
-                df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=
-                df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=
-
-                # For get_hydro() calls with multiple dbkeys, remove the row corresponding to the latest date from the downloaded data.
-                # When get_hydro() is given multiple keys its returned data starts from the date given instead of the day after like it
-                # does when given a single key.
-                if len(params['dbkeys']) > 1:
-                    df_new = df_new[df_new['date'] != date_latest]
+                df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col='date')
+                df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col='date')

                 # Merge the new data with the original data
-                df_merged = pd.concat([df_original, df_new], ignore_index=
+                df_merged = pd.concat([df_original, df_new], ignore_index=False)
+
+                # Ensure an integer index (for backwards compatibility)
+                df_merged.reset_index(inplace=True)
+                df_merged.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

                 # Write out the merged data
                 df_merged.to_csv(os.path.join(workspace, original_file_name))
@@ -102,6 +108,10 @@ def main(workspace: str, d: dict = D) -> dict:
     lat_long_map = get_stations_latitude_longitude(["L OKEE"])
     latitude, longitude = lat_long_map["L OKEE"]

+    # Load the LO_Stage.csv file
+    df_lo_stage = pd.read_csv(os.path.join(workspace, "LO_Stage.csv"), index_col="date")
+    df_lo_stage.index = pd.to_datetime(df_lo_stage.index)
+
     # Load the LO_Stage_2.csv file
     df_lo_stage_2 = pd.read_csv(os.path.join(workspace, "LO_Stage_2.csv"), index_col="date")
     df_lo_stage_2.index = pd.to_datetime(df_lo_stage_2.index)
@@ -109,21 +119,24 @@ def main(workspace: str, d: dict = D) -> dict:
     # Output Progress
     print("Converting NAVD88 to NGVD29 for 'L OKEE's new dbkey...\n")

-    # Use only the data that is not already in the LO_Stage.csv file
-
-    date_start = datetime.strptime(date_latest_lo_stage_2, "%Y-%m-%d") + pd.DateOffset(days=1)
-    df_lo_stage_2 = df_lo_stage_2.loc[date_start:]
+    # Use only the data that is not already in the LO_Stage.csv file and exists in the LO_Stage_2.csv file
+    common_dates = df_lo_stage.index.intersection(df_lo_stage_2.index)

-
-
-
-
+    missing_mask = (
+        df_lo_stage.loc[common_dates, "L OKEE_STG_ft NGVD29"].isna() &
+        df_lo_stage_2.loc[common_dates, "L OKEE_STG_ft NGVD29"].notna()
+    )

-
-
-
+    missing_dates: pd.DatetimeIndex = common_dates[missing_mask]
+    missing_dates = missing_dates.to_list()
+
+    # Convert the stage values from NAVD88 to NGVD29 for the missing dates
+    converted_values = {}
+    for date in missing_dates:
         try:
-
+            navd88_value = df_lo_stage_2.at[date, "L OKEE_STG_ft NGVD29"]
+            ngvd29_value = _convert_navd88_to_ngvd29(latitude, longitude, navd88_value, date.year)
+            converted_values[date] = ngvd29_value
         except Exception as e:
             convert_failure = True
             print(str(e))
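The mask built above selects exactly the dates where LO_Stage lacks a stage value but LO_Stage_2 has one; a toy illustration with three shared dates:

    import numpy as np
    import pandas as pd

    idx = pd.to_datetime(["2024-05-01", "2024-05-02", "2024-05-03"])
    lo_stage = pd.DataFrame({"L OKEE_STG_ft NGVD29": [13.1, np.nan, np.nan]}, index=idx)
    lo_stage_2 = pd.DataFrame({"L OKEE_STG_ft NGVD29": [13.0, 13.2, np.nan]}, index=idx)

    common_dates = lo_stage.index.intersection(lo_stage_2.index)
    missing_mask = (
        lo_stage.loc[common_dates, "L OKEE_STG_ft NGVD29"].isna()
        & lo_stage_2.loc[common_dates, "L OKEE_STG_ft NGVD29"].notna()
    )
    print(list(common_dates[missing_mask]))  # [Timestamp('2024-05-02')]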
@@ -132,20 +145,15 @@ def main(workspace: str, d: dict = D) -> dict:
     # Check for conversion failure
     if not convert_failure:
         # Update the LO_Stage.csv file with the converted values
-
-        df_lo_stage.index = pd.to_datetime(df_lo_stage.index)
-
-        for i in range(0, len(lo_stage_2_values_ngvd29)):
-            # Get the current date and value
-            date = lo_stage_2_dates[i]
-            value = lo_stage_2_values_ngvd29[i]
-
-            # Update the value in the LO_Stage dataframe
+        for date, value in converted_values.items():
             df_lo_stage.at[date, "L OKEE_STG_ft NGVD29"] = value

         # Reset the index
         df_lo_stage.reset_index(inplace=True)
-
+
+        # Drop Unnamed: 0 column that might have been added
+        if "Unnamed: 0" in df_lo_stage.columns:
+            df_lo_stage.drop(columns=["Unnamed: 0"], inplace=True)

         # Save the updated LO_Stage.csv file
         df_lo_stage.to_csv(os.path.join(workspace, "LO_Stage.csv"))