loone-data-prep 1.2.4__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +47 -16
  2. loone_data_prep/LOONE_DATA_PREP.py +0 -1
  3. loone_data_prep/dbhydro_insights.py +195 -0
  4. loone_data_prep/flow_data/S65E_total.py +57 -57
  5. loone_data_prep/flow_data/forecast_bias_correction.py +1 -1
  6. loone_data_prep/flow_data/get_forecast_flows.py +19 -105
  7. loone_data_prep/flow_data/get_inflows.py +18 -8
  8. loone_data_prep/flow_data/get_outflows.py +16 -7
  9. loone_data_prep/flow_data/hydro.py +62 -91
  10. loone_data_prep/forecast_scripts/get_Chla_predicted.py +1 -1
  11. loone_data_prep/forecast_scripts/get_NO_Loads_predicted.py +1 -1
  12. loone_data_prep/forecast_scripts/new_combined_weather_forecast.py +220 -0
  13. loone_data_prep/utils.py +262 -32
  14. loone_data_prep/water_level_data/get_all.py +52 -44
  15. loone_data_prep/water_level_data/hydro.py +49 -68
  16. loone_data_prep/water_quality_data/get_inflows.py +69 -27
  17. loone_data_prep/water_quality_data/get_lake_wq.py +130 -33
  18. loone_data_prep/water_quality_data/wq.py +114 -88
  19. loone_data_prep/weather_data/get_all.py +5 -3
  20. loone_data_prep/weather_data/weather.py +117 -180
  21. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/METADATA +2 -8
  22. loone_data_prep-1.3.1.dist-info/RECORD +38 -0
  23. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/WHEEL +1 -1
  24. loone_data_prep/forecast_scripts/create_forecast_LOWs.py +0 -170
  25. loone_data_prep/forecast_scripts/weather_forecast.py +0 -199
  26. loone_data_prep-1.2.4.dist-info/RECORD +0 -38
  27. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/licenses/LICENSE +0 -0
  28. {loone_data_prep-1.2.4.dist-info → loone_data_prep-1.3.1.dist-info}/top_level.txt +0 -0
loone_data_prep/utils.py CHANGED
@@ -5,17 +5,14 @@ import math
5
5
  from glob import glob
6
6
  from calendar import monthrange
7
7
  import traceback
8
+ from typing import Literal, Tuple
8
9
  import numpy as np
9
10
  import pandas as pd
10
11
  from retry import retry
11
12
  from scipy.optimize import fsolve
12
13
  from scipy import interpolate
13
- from rpy2.robjects import r
14
- from rpy2.robjects.vectors import (
15
- StrVector as rpy2StrVector,
16
- DataFrame as rpy2DataFrame,
17
- )
18
- from rpy2.rinterface_lib.embedded import RRuntimeError
14
+ from dbhydro_py import DbHydroApi
15
+ from loone_data_prep.dbhydro_insights import get_dbhydro_station_metadata, get_dbhydro_continuous_timeseries_metadata, get_dbhydro_water_quality_metadata
19
16
 
20
17
 
21
18
  DEFAULT_STATION_IDS = ["L001", "L005", "L006", "LZ40"]
@@ -224,7 +221,7 @@ DEFAULT_EXPFUNC_NITROGEN_CONSTANTS = {
224
221
  "S135_P": {"a": 3.09890183766129, "b": 0.657896838486496},
225
222
  }
226
223
 
227
- @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
224
+ @retry(Exception, tries=5, delay=15, max_delay=60, backoff=2)
228
225
  def get_dbkeys(
229
226
  station_ids: list,
230
227
  category: str,
@@ -232,9 +229,8 @@ def get_dbkeys(
232
229
  stat: str,
233
230
  recorder: str,
234
231
  freq: str = "DA",
235
- detail_level: str = "dbkey",
236
232
  *args: str,
237
- ) -> rpy2StrVector | rpy2DataFrame:
233
+ ) -> list[str]:
238
234
  """Get dbkeys. See DBHydroR documentation for more information:
239
235
  https://cran.r-project.org/web/packages/dbhydroR/dbhydroR.pdf
240
236
 
@@ -245,27 +241,68 @@ def get_dbkeys(
245
241
  stat (str): Statistic of data to retrieve.
246
242
  recorder (str): Recorder of data to retrieve.
247
243
  freq (str, optional): Frequency of data to retrieve. Defaults to "DA".
248
- detail_level (str, optional): Detail level of data to retrieve. Defaults to "dbkey". Options are "dbkey",
249
- "summary", or "full".
250
244
 
251
245
  Returns:
252
- rpy2StrVector | rpy2DataFrame: dbkeys info at the specified detail level.
246
+ list[str]: dbkeys info for the specified parameters.
253
247
  """
248
+ # Retrieve the metadata for the specified parameters
249
+ metadata = get_dbhydro_continuous_timeseries_metadata(station_ids, [category], [param], [stat], [recorder], [freq])
250
+
251
+ # A set to hold the dbkeys to avoid duplicates
252
+ dbkeys = set()
253
+
254
+ # No data returned from API
255
+ if metadata is None:
256
+ return list(dbkeys)
257
+
258
+ # Get the dbkeys from the metadata
259
+ for result in metadata['results']:
260
+ dbkeys.add(result['timeseriesId'])
261
+
262
+ # Return the dbkeys as a list
263
+ return list(dbkeys)
254
264
 
255
- station_ids_str = '"' + '", "'.join(station_ids) + '"'
256
265
 
257
- dbkeys = r(
258
- f"""
259
- library(dbhydroR)
266
+ def get_stations_latitude_longitude(station_ids: list[str]):
267
+ """Gets the latitudes and longitudes of the given stations.
260
268
 
261
- station_ids <- c({station_ids_str})
262
- dbkeys <- get_dbkey(stationid = station_ids, category = "{category}", param = "{param}", stat = "{stat}", recorder="{recorder}", freq = "{freq}", detail.level = "{detail_level}")
263
- print(dbkeys)
264
- return(dbkeys)
265
- """ # noqa: E501
266
- )
269
+ Args:
270
+ station_ids (list[str]): The ids of the stations to get the
271
+ latitudes/longitudes of. Example: ['L OKEE', 'FISHP']
267
272
 
268
- return dbkeys
273
+ Returns:
274
+ (dict[str, tuple[numpy.float64, numpy.float64]]): A dictionary of
275
+ format dict<station_id:(latitude,longitude)>
276
+
277
+ If a station's latitude/longitude fails to download then its station_id
278
+ won't be a key in the returned dictionary.
279
+ """
280
+ # Dictionary to hold the latitude/longitude of each station
281
+ station_data = {}
282
+
283
+ # Get the latitude and longitude for each station
284
+ for station_id in station_ids:
285
+ # Retrieve the current station's metadata
286
+ station_metadata = get_dbhydro_station_metadata(station_id)
287
+
288
+ # Check if the metadata was successfully retrieved
289
+ if station_metadata is None:
290
+ print(f'Failed to get latitude/longitude for station {station_id} - No data given back from API')
291
+ continue
292
+
293
+ # Extract the latitude and longitude from the metadata
294
+ try:
295
+ latitude = station_metadata['features'][0]['attributes']['LAT']
296
+ longitude = station_metadata['features'][0]['attributes']['LONG']
297
+ except KeyError:
298
+ print(f'Failed to get latitude/longitude for station {station_id} - Unexpected response structure from API')
299
+ continue
300
+
301
+ # Add the latitude and longitude to the dictionary
302
+ station_data[station_id] = latitude, longitude
303
+
304
+ # Return the dictionary of station latitudes and longitudes
305
+ return station_data
269
306
 
270
307
 
271
308
  def data_interpolations(
@@ -916,9 +953,17 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
916
953
 
917
954
  # Helper Functions
918
955
  def is_valid_date(date_string):
956
+ # Check for date without time part
919
957
  try:
920
958
  datetime.datetime.strptime(date_string, "%Y-%m-%d")
921
959
  return True
960
+ except ValueError:
961
+ pass
962
+
963
+ # Check for date with time part
964
+ try:
965
+ datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
966
+ return True
922
967
  except ValueError:
923
968
  return False
924
969
 
@@ -955,23 +1000,69 @@ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
955
1000
  return None
956
1001
 
957
1002
 
958
- def dbhydro_data_is_latest(date_latest: str):
1003
+ def dbhydro_data_is_latest(date_latest: str, dbkey: str | None = None) -> bool:
959
1004
  """
960
1005
  Checks whether the given date is the most recent date possible to get data from dbhydro.
961
1006
  Can be used to check whether dbhydro data is up-to-date.
962
1007
 
963
1008
  Args:
964
1009
  date_latest (str): The date of the most recent data of the dbhydro data you have
1010
+ dbkey (str | None, optional): The dbkey of the data you are checking. Defaults to None.
965
1011
 
966
1012
  Returns:
967
1013
  bool: True if the date_latest is the most recent date possible to get data from dbhydro, False otherwise
968
1014
  """
969
- date_latest_object = datetime.datetime.strptime(
970
- date_latest, "%Y-%m-%d"
971
- ).date()
972
- return date_latest_object == (
973
- datetime.datetime.now().date() - datetime.timedelta(days=1)
974
- )
1015
+ # Convert date_latest to a date object
1016
+ date_latest_object = pd.to_datetime(date_latest).date()
1017
+
1018
+ # No dbkey provided
1019
+ if dbkey is None:
1020
+ # Assume latest data available is yesterday
1021
+ return date_latest_object == (datetime.datetime.now().date() - datetime.timedelta(days=1))
1022
+
1023
+ # Get dbhydro api
1024
+ dbhydro_api = get_dbhydro_api()
1025
+
1026
+ # Retrieve the last date available from dbhydro for the given dbkey
1027
+ data = dbhydro_api.get_daily_data([dbkey], 'id', '1900-01-01', '1900-01-02', 'NGVD29', False)
1028
+ last_date = data.time_series[0].period_of_record.por_last_date
1029
+
1030
+ # Use date part only (exclude time)
1031
+ last_date = last_date.split("T")[0]
1032
+
1033
+ # Convert last_date to a date object
1034
+ last_date_object = datetime.datetime.strptime(last_date, "%Y-%m-%d").date()
1035
+
1036
+ # Compare given date to last date from dbhydro
1037
+ return date_latest_object >= last_date_object
1038
+
1039
+
1040
+ def dbhydro_water_quality_data_is_latest(date_latest: str, station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> bool:
1041
+ """
1042
+ Checks whether the given date is the most recent date possible to get water quality data from dbhydro.
1043
+ Can be used to check whether dbhydro water quality data is up-to-date.
1044
+
1045
+ Args:
1046
+ date_latest (str): The date of the most recent data of the dbhydro water quality data you have.
1047
+ station (str): The station ID of the water quality data you are checking.
+ station_type (Literal['SITE', 'STATION']): The type of the station.
1048
+ test_number (int): The test number of the water quality data you are checking. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
1049
+
1050
+ Returns:
1051
+ bool: True if the date_latest is the most recent date possible to get water quality data from dbhydro, False otherwise
1052
+ """
1053
+ # Get the date range from dbhydro water quality data
1054
+ date_start, date_end = get_dbhydro_water_quality_date_range(station, station_type, test_number)
1055
+
1056
+ # No end date available
1057
+ if date_end is None:
1058
+ # Assume data is not up-to-date
1059
+ return False
1060
+
1061
+ # Convert date_latest to a datetime object
1062
+ date_latest_object = pd.to_datetime(date_latest)
1063
+
1064
+ # Compare given date to last date from dbhydro
1065
+ return date_latest_object >= date_end
975
1066
 
976
1067
 
977
1068
  def get_synthetic_data(date_start: str, df: pd.DataFrame):
@@ -996,14 +1087,31 @@ def get_synthetic_data(date_start: str, df: pd.DataFrame):
996
1087
  end_month_day = date_end.strftime('%m-%d')
997
1088
 
998
1089
  # Filter the DataFrame to include only rows between date_start and date_end for all previous years
999
- mask = (df['month_day'] >= start_month_day) & (df['month_day'] <= end_month_day)
1090
+ # (handle year wrap, e.g., Dec -> Jan)
1091
+ wraps_year = start_month_day > end_month_day
1092
+
1093
+ if wraps_year:
1094
+ mask = (
1095
+ (df['month_day'] >= start_month_day) |
1096
+ (df['month_day'] <= end_month_day)
1097
+ )
1098
+ else:
1099
+ mask = (
1100
+ (df['month_day'] >= start_month_day) &
1101
+ (df['month_day'] <= end_month_day)
1102
+ )
1103
+
1000
1104
  filtered_data = df.loc[mask]
1001
1105
 
1002
1106
  # Group by the month and day, then calculate the average for each group
1003
1107
  average_values = filtered_data.groupby('month_day')['Data'].mean()
1004
1108
  # Interpolate in case there are missing values:
1005
1109
  start_date = pd.to_datetime('2001-' + start_month_day)
1006
- end_date = pd.to_datetime('2001-' + end_month_day)
1110
+
1111
+ if wraps_year:
1112
+ end_date = pd.to_datetime('2002-' + end_month_day)
1113
+ else:
1114
+ end_date = pd.to_datetime('2001-' + end_month_day)
1007
1115
 
1008
1116
  full_dates = pd.date_range(start=start_date, end=end_date)
1009
1117
  full_index = full_dates.strftime('%m-%d')
@@ -1021,6 +1129,128 @@ def get_synthetic_data(date_start: str, df: pd.DataFrame):
1021
1129
  return df
1022
1130
 
1023
1131
 
1132
+ def df_replace_missing_with_nan(df: pd.DataFrame, qualifier_codes: set = {'M', 'N'}, no_data_value: float = -99999.0) -> pd.DataFrame:
1133
+ """
1134
+ Replace values in the 'value' column of the DataFrame with NaN where the 'qualifier' column contains specified qualifier codes.
1135
+
1136
+ This was designed to work with dataframes created from dbhydro_py response data.
1137
+ The dataframe must have 'value' and 'qualifier' columns.
1138
+ Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
1139
+
1140
+ Args:
1141
+ df (pd.DataFrame): DataFrame that was created from a dbhydro_py response. Must have value and qualifier columns.
1142
+ qualifier_codes (set, optional): Set of qualifier codes indicating missing data. Defaults to {'M', 'N'}.
1143
+ no_data_value (float, optional): Value representing no data. Defaults to -99999.0. Values equal to this will also be replaced with NaN.
1144
+
1145
+ Returns:
1146
+ pd.DataFrame: DataFrame with specified values replaced with NaN.
1147
+ """
1148
+ # Replace 0 values with NaN when their qualifier is in qualifier_codes
1149
+ # 'M' = Missing, 'N' = Not Yet Available
1150
+ # Qualifier/Codes can be found here: https://insightsdata.sfwmd.gov/#/reference-tables?lookup=qualityCode
1151
+ df.loc[df['qualifier'].isin(qualifier_codes), 'value'] = np.nan
1152
+
1153
+ # Also replace no_data_value with NaN
1154
+ df.loc[np.isclose(df['value'], no_data_value), 'value'] = np.nan
1155
+
1156
+ # Return modified dataframe
1157
+ return df
1158
+
1159
+
1160
+ def get_dbhydro_water_quality_date_range(station: str, station_type: Literal['SITE', 'STATION'], test_number: int) -> Tuple[pd.Timestamp | None, pd.Timestamp | None]:
1161
+ """Get the start date and end date for the given station and test number from DBHYDRO water quality data.
1162
+
1163
+ Args:
1164
+ station (str): The station name.
1165
+ station_type (Literal['SITE', 'STATION']): The type of the station.
1166
+ test_number (int): The test number of the data. Test numbers map to parameters such as 'PHOSPHATE, TOTAL AS P'.
1167
+
1168
+ Returns:
1169
+ Tuple[pd.Timestamp | None, pd.Timestamp | None]: A tuple containing the start date and end date as pandas Timestamps, or None for any date that is unavailable.
1170
+ """
1171
+ response = get_dbhydro_water_quality_metadata([(station, station_type)], [test_number])
1172
+
1173
+ # No data given back by api
1174
+ if response is None:
1175
+ return (None, None)
1176
+
1177
+ # Get the date range from the response
1178
+ if 'results' in response:
1179
+ results = response['results']
1180
+ if len(results) > 0:
1181
+ # Find the first non-None start and end dates
1182
+ date_start = None
1183
+ date_end = None
1184
+ for result in results:
1185
+ date_start = result.get('startDate', None)
1186
+ date_end = result.get('endDate', None)
1187
+
1188
+ # Dates found
1189
+ if date_start is not None and date_end is not None:
1190
+ break
1191
+
1192
+ # If no valid dates were found, return early
1193
+ if date_start is None or date_end is None:
1194
+ return (date_start, date_end)
1195
+
1196
+ # Find the earliest start date and latest end date
1197
+ for result in results:
1198
+ date_start_current = result.get('startDate', None)
1199
+ date_end_current = result.get('endDate', None)
1200
+ if date_start_current is not None and pd.to_datetime(date_start_current) < pd.to_datetime(date_start):
1201
+ date_start = date_start_current
1202
+ if date_end_current is not None and pd.to_datetime(date_end_current) > pd.to_datetime(date_end):
1203
+ date_end = date_end_current
1204
+
1205
+ # Convert dates to datetime objects
1206
+ if date_start is not None:
1207
+ date_start = pd.to_datetime(date_start)
1208
+ if date_end is not None:
1209
+ date_end = pd.to_datetime(date_end)
1210
+
1211
+ # Return the earliest start date and latest end date
1212
+ return (date_start, date_end)
1213
+
1214
+ # No results found
1215
+ return (None, None)
1216
+
1217
+
1218
+ def get_dbhydro_api_keys_from_environment() -> dict[str, str]:
1219
+ """Get DBHYDRO API keys from environment variables.
1220
+
1221
+ Returns:
1222
+ Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
1223
+ """
1224
+ # Get API keys from environment variables
1225
+ api_keys = {
1226
+ "client_id": os.environ.get("DBHYDRO_API_CLIENT_ID", ""),
1227
+ "client_secret": os.environ.get("DBHYDRO_API_CLIENT_SECRET", ""),
1228
+ }
1229
+
1230
+ # Return the API keys
1231
+ return api_keys
1232
+
1233
+
1234
+ def get_dbhydro_api_keys() -> dict[str, str]:
1235
+ """Get DBHYDRO API keys.
1236
+
1237
+ Returns:
1238
+ Dict[str, str]: A dictionary containing the DBHYDRO API keys where dict keys are 'client_id' and 'client_secret'.
1239
+ """
1240
+ return get_dbhydro_api_keys_from_environment()
1241
+
1242
+
1243
+ def get_dbhydro_api() -> DbHydroApi:
1244
+ """Get a configured DbHydroApi instance.
1245
+
1246
+ Returns:
1247
+ DbHydroApi: An instance of the DbHydroApi class.
1248
+ """
1249
+ api_keys = get_dbhydro_api_keys()
1250
+ dbhydro_api = DbHydroApi.with_default_adapter(client_id=api_keys["client_id"], client_secret=api_keys["client_secret"])
1251
+ return dbhydro_api
1252
+
1253
+
1024
1254
  if __name__ == "__main__":
1025
1255
  if sys.argv[1] == "get_dbkeys":
1026
1256
  get_dbkeys(
@@ -4,8 +4,7 @@ import requests
4
4
  import uuid
5
5
  from datetime import datetime
6
6
  from loone_data_prep.water_level_data import hydro
7
- from loone_data_prep.flow_data.get_forecast_flows import get_stations_latitude_longitude
8
- from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
7
+ from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest, get_stations_latitude_longitude
9
8
  import pandas as pd
10
9
 
11
10
  DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")
@@ -13,11 +12,11 @@ DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")
13
12
  D = {
14
13
  "LO_Stage": {"dbkeys": ["16022", "12509", "12519", "16265", "15611"], "datum": "NGVD29"},
15
14
  "LO_Stage_2": {"dbkeys": ["94832"], "date_min": "2024-04-30", "datum": "NAVD88"},
16
- "Stg_3ANW": {"dbkeys": ["LA369"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
17
- "Stg_2A17": {"dbkeys": ["16531"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
18
- "Stg_3A3": {"dbkeys": ["16532"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
19
- "Stg_3A4": {"dbkeys": ["16537"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
20
- "Stg_3A28": {"dbkeys": ["16538"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"}
15
+ "Stg_3ANW": {"dbkeys": ["LA369"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"G3ANW": "3A-NW"}},
16
+ "Stg_2A17": {"dbkeys": ["16531"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"2A-17": "2-17"}},
17
+ "Stg_3A3": {"dbkeys": ["16532"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-3": "3-63"}},
18
+ "Stg_3A4": {"dbkeys": ["16537"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-4": "3-64"}},
19
+ "Stg_3A28": {"dbkeys": ["16538"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29", "override_site_codes": {"3A-28": "3-65"}},
21
20
  }
22
21
 
23
22
 
@@ -25,9 +24,6 @@ def main(workspace: str, d: dict = D) -> dict:
25
24
  missing_files = []
26
25
  failed_downloads = [] # List of file names that the script failed to get the latest data for (but the files still exist)
27
26
 
28
- # Get the date of the latest data in LO_Stage_2.csv
29
- date_latest_lo_stage_2 = find_last_date_in_csv(workspace, "LO_Stage_2.csv")
30
-
31
27
  for name, params in d.items():
32
28
  # Get the date of the latest data in the csv file
33
29
  date_latest = find_last_date_in_csv(workspace, f"{name}.csv")
@@ -35,10 +31,18 @@ def main(workspace: str, d: dict = D) -> dict:
35
31
  # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
36
32
  if date_latest is None:
37
33
  print(f"Getting all water level data for {name}.")
34
+ params['date_max'] = DATE_NOW
38
35
  hydro.get(workspace, name, **params)
39
36
  else:
40
37
  # Check whether the latest data is already up to date.
41
- if dbhydro_data_is_latest(date_latest):
38
+ requires_data_download = False
39
+ for dbkey in params['dbkeys']:
40
+ if not dbhydro_data_is_latest(date_latest, dbkey):
41
+ requires_data_download = True
42
+ break
43
+
44
+ # Data is already up to date
45
+ if not requires_data_download:
42
46
  # Notify that the data is already up to date
43
47
  print(f'Downloading of new water level data skipped for {name}. Data is already up to date.')
44
48
  continue
@@ -50,21 +54,23 @@ def main(workspace: str, d: dict = D) -> dict:
50
54
 
51
55
  try:
52
56
  # Download only the new data
53
- print(f'Downloading new water level data for {name} starting from date {date_latest}')
54
- hydro.get(workspace, name, dbkeys=params['dbkeys'], date_min=date_latest, date_max=DATE_NOW, datum=params['datum'])
57
+ date_next = (datetime.strptime(date_latest, "%Y-%m-%d") + pd.DateOffset(days=1)).date().strftime("%Y-%m-%d")
58
+ print(f'Downloading new water level data for {name} starting from date {date_next}')
59
+ kwargs = {}
60
+ if 'override_site_codes' in params:
61
+ kwargs['override_site_codes'] = params['override_site_codes']
62
+ hydro.get(workspace, name, dbkeys=params['dbkeys'], date_min=date_next, date_max=DATE_NOW, datum=params['datum'], **kwargs)
55
63
 
56
64
  # Read in the original data and the newly downloaded data
57
- df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
58
- df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
59
-
60
- # For get_hydro() calls with multiple dbkeys, remove the row corresponding to the latest date from the downloaded data.
61
- # When get_hydro() is given multiple keys its returned data starts from the date given instead of the day after like it
62
- # does when given a single key.
63
- if len(params['dbkeys']) > 1:
64
- df_new = df_new[df_new['date'] != date_latest]
65
+ df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col='date')
66
+ df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col='date')
65
67
 
66
68
  # Merge the new data with the original data
67
- df_merged = pd.concat([df_original, df_new], ignore_index=True)
69
+ df_merged = pd.concat([df_original, df_new], ignore_index=False)
70
+
71
+ # Ensure an integer index (for backwards compatibility)
72
+ df_merged.reset_index(inplace=True)
73
+ df_merged.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
68
74
 
69
75
  # Write out the merged data
70
76
  df_merged.to_csv(os.path.join(workspace, original_file_name))
@@ -102,6 +108,10 @@ def main(workspace: str, d: dict = D) -> dict:
102
108
  lat_long_map = get_stations_latitude_longitude(["L OKEE"])
103
109
  latitude, longitude = lat_long_map["L OKEE"]
104
110
 
111
+ # Load the LO_Stage.csv file
112
+ df_lo_stage = pd.read_csv(os.path.join(workspace, "LO_Stage.csv"), index_col="date")
113
+ df_lo_stage.index = pd.to_datetime(df_lo_stage.index)
114
+
105
115
  # Load the LO_Stage_2.csv file
106
116
  df_lo_stage_2 = pd.read_csv(os.path.join(workspace, "LO_Stage_2.csv"), index_col="date")
107
117
  df_lo_stage_2.index = pd.to_datetime(df_lo_stage_2.index)
@@ -109,21 +119,24 @@ def main(workspace: str, d: dict = D) -> dict:
109
119
  # Output Progress
110
120
  print("Converting NAVD88 to NGVD29 for 'L OKEE's new dbkey...\n")
111
121
 
112
- # Use only the data that is not already in the LO_Stage.csv file
113
- if date_latest_lo_stage_2 is not None:
114
- date_start = datetime.strptime(date_latest_lo_stage_2, "%Y-%m-%d") + pd.DateOffset(days=1)
115
- df_lo_stage_2 = df_lo_stage_2.loc[date_start:]
122
+ # Use only the data that is not already in the LO_Stage.csv file and exists in the LO_Stage_2.csv file
123
+ common_dates = df_lo_stage.index.intersection(df_lo_stage_2.index)
116
124
 
117
- # Convert the stage values from NAVD88 to NGVD29
118
- lo_stage_2_dates = df_lo_stage_2.index.tolist()
119
- lo_stage_2_values_navd88 = df_lo_stage_2["L OKEE_STG_ft NGVD29"].tolist()
120
- lo_stage_2_values_ngvd29 = []
125
+ missing_mask = (
126
+ df_lo_stage.loc[common_dates, "L OKEE_STG_ft NGVD29"].isna() &
127
+ df_lo_stage_2.loc[common_dates, "L OKEE_STG_ft NGVD29"].notna()
128
+ )
121
129
 
122
- for i in range(0, len(lo_stage_2_values_navd88)):
123
- date = lo_stage_2_dates[i]
124
- value = lo_stage_2_values_navd88[i]
130
+ missing_dates: pd.DatetimeIndex = common_dates[missing_mask]
131
+ missing_dates = missing_dates.to_list()
132
+
133
+ # Convert the stage values from NAVD88 to NGVD29 for the missing dates
134
+ converted_values = {}
135
+ for date in missing_dates:
125
136
  try:
126
- lo_stage_2_values_ngvd29.append(_convert_navd88_to_ngvd29(latitude, longitude, value, date.year))
137
+ navd88_value = df_lo_stage_2.at[date, "L OKEE_STG_ft NGVD29"]
138
+ ngvd29_value = _convert_navd88_to_ngvd29(latitude, longitude, navd88_value, date.year)
139
+ converted_values[date] = ngvd29_value
127
140
  except Exception as e:
128
141
  convert_failure = True
129
142
  print(str(e))
@@ -132,20 +145,15 @@ def main(workspace: str, d: dict = D) -> dict:
132
145
  # Check for conversion failure
133
146
  if not convert_failure:
134
147
  # Update the LO_Stage.csv file with the converted values
135
- df_lo_stage = pd.read_csv(os.path.join(workspace, "LO_Stage.csv"), index_col="date")
136
- df_lo_stage.index = pd.to_datetime(df_lo_stage.index)
137
-
138
- for i in range(0, len(lo_stage_2_values_ngvd29)):
139
- # Get the current date and value
140
- date = lo_stage_2_dates[i]
141
- value = lo_stage_2_values_ngvd29[i]
142
-
143
- # Update the value in the LO_Stage dataframe
148
+ for date, value in converted_values.items():
144
149
  df_lo_stage.at[date, "L OKEE_STG_ft NGVD29"] = value
145
150
 
146
151
  # Reset the index
147
152
  df_lo_stage.reset_index(inplace=True)
148
- df_lo_stage.drop(columns=["Unnamed: 0"], inplace=True)
153
+
154
+ # Drop Unnamed: 0 column that might have been added
155
+ if "Unnamed: 0" in df_lo_stage.columns:
156
+ df_lo_stage.drop(columns=["Unnamed: 0"], inplace=True)
149
157
 
150
158
  # Save the updated LO_Stage.csv file
151
159
  df_lo_stage.to_csv(os.path.join(workspace, "LO_Stage.csv"))