loone-data-prep 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/PKG-INFO +1 -1
  2. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/LOONE_DATA_PREP.py +0 -2
  3. loone_data_prep-0.1.7/loone_data_prep/flow_data/S65E_total.py +89 -0
  4. loone_data_prep-0.1.7/loone_data_prep/flow_data/get_inflows.py +159 -0
  5. loone_data_prep-0.1.7/loone_data_prep/flow_data/get_outflows.py +164 -0
  6. loone_data_prep-0.1.7/loone_data_prep/flow_data/hydro.py +155 -0
  7. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/utils.py +69 -0
  8. loone_data_prep-0.1.7/loone_data_prep/water_level_data/get_all.py +232 -0
  9. loone_data_prep-0.1.7/loone_data_prep/water_level_data/hydro.py +114 -0
  10. loone_data_prep-0.1.7/loone_data_prep/water_quality_data/get_inflows.py +127 -0
  11. loone_data_prep-0.1.7/loone_data_prep/water_quality_data/get_lake_wq.py +129 -0
  12. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/water_quality_data/wq.py +44 -0
  13. loone_data_prep-0.1.7/loone_data_prep/weather_data/get_all.py +157 -0
  14. loone_data_prep-0.1.7/loone_data_prep/weather_data/weather.py +280 -0
  15. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep.egg-info/PKG-INFO +1 -1
  16. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/pyproject.toml +1 -1
  17. loone_data_prep-0.1.6/loone_data_prep/flow_data/S65E_total.py +0 -24
  18. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_inflows.py +0 -70
  19. loone_data_prep-0.1.6/loone_data_prep/flow_data/get_outflows.py +0 -80
  20. loone_data_prep-0.1.6/loone_data_prep/flow_data/hydro.py +0 -61
  21. loone_data_prep-0.1.6/loone_data_prep/water_level_data/get_all.py +0 -35
  22. loone_data_prep-0.1.6/loone_data_prep/water_level_data/hydro.py +0 -46
  23. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_inflows.py +0 -42
  24. loone_data_prep-0.1.6/loone_data_prep/water_quality_data/get_lake_wq.py +0 -47
  25. loone_data_prep-0.1.6/loone_data_prep/weather_data/get_all.py +0 -34
  26. loone_data_prep-0.1.6/loone_data_prep/weather_data/weather.py +0 -122
  27. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/LICENSE +0 -0
  28. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/README.md +0 -0
  29. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/GEOGLOWS_LOONE_DATA_PREP.py +0 -0
  30. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/__init__.py +0 -0
  31. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/data_analyses_fns.py +0 -0
  32. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/flow_data/__init__.py +0 -0
  33. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/flow_data/forecast_bias_correction.py +0 -0
  34. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/flow_data/get_forecast_flows.py +0 -0
  35. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/water_level_data/__init__.py +0 -0
  36. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/water_quality_data/__init__.py +0 -0
  37. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep/weather_data/__init__.py +0 -0
  38. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep.egg-info/SOURCES.txt +0 -0
  39. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep.egg-info/dependency_links.txt +0 -0
  40. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep.egg-info/requires.txt +0 -0
  41. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/loone_data_prep.egg-info/top_level.txt +0 -0
  42. {loone_data_prep-0.1.6 → loone_data_prep-0.1.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: loone_data_prep
- Version: 0.1.6
+ Version: 0.1.7
  Summary: Prepare data to run the LOONE model.
  Author-email: Osama Tarabih <osamatarabih@usf.edu>
  Maintainer-email: Michael Souffront <msouffront@aquaveo.com>, James Dolinar <jdolinar@aquaveo.com>
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
      LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
      LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
      LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
-     LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()

      # RFVol acft
      # Create File (RF_Volume)
@@ -897,7 +896,6 @@ def main(input_dir: str, output_dir: str) -> None:
      # Write Data into csv files
      # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
      LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
-     LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()
      # Write S65 TP concentrations (mg/L)
      S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
      # TP External Loads 3 Months Lag (mg)
@@ -0,0 +1,89 @@
+ import sys
+ from retry import retry
+ from rpy2.robjects import r
+ from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
+
+
+ @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+ def get(
+     workspace,
+     date_min: str = "1972-01-01",
+     date_max: str = "2023-06-30"
+ ) -> None:
+     r(
+         f"""
+         # Load the required libraries
+         library(dbhydroR)
+         library(dplyr)
+
+         # Helper Functions
+         retrieve_data <- function(dbkey, date_min, date_max)
+         {{
+             # Get the data from dbhydro
+             df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+             # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+             colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+             # Add a type and units column to data so it can be cleaned using the clean_hydro function
+             df$type <- "FLOW"
+             df$units <- "cfs"
+
+             # Clean the data.frame
+             df <- clean_hydro(df)
+
+             # Drop the " _FLOW_cfs" column
+             df <- df %>% select(-` _FLOW_cfs`)
+
+             # Convert Flow rate from cfs to m³/day
+             df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+             # Return resulting data.frame
+             return(df)
+         }}
+
+         # S65E_S
+         S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Wait five seconds before next request to avoid "too many requests" error
+         Sys.sleep(5)
+
+         # S65EX1_S
+         S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Merge the data from each dbkey
+         result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+         # Write the data to a file
+         write.csv(result, file = '{workspace}/S65E_total.csv')
+         """
+     )
+
+     _reformat_s65e_total_file(workspace)
+
+ def _reformat_s65e_total_file(workspace: str):
+     # Read in the data
+     df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+     # Drop unused columns
+     df.drop('Unnamed: 0', axis=1, inplace=True)
+
+     # Convert date column to datetime
+     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+     # Sort the data by date
+     df.sort_values('date', inplace=True)
+
+     # Renumber the index
+     df.reset_index(drop=True, inplace=True)
+
+     # Drop rows that are missing all their values
+     df.dropna(how='all', inplace=True)
+
+     # Write the updated data back to the file
+     df.to_csv(f"{workspace}/S65E_total.csv")
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     get(workspace)
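
Note on loone_data_prep/flow_data/S65E_total.py (new file above): the module can be driven from Python as well as from the command line. A minimal usage sketch, assuming a working R installation with the dbhydroR and dplyr packages available to rpy2; the workspace path is illustrative:

    from loone_data_prep.flow_data import S65E_total

    # Downloads and merges the S65E_S (dbkey 91656) and S65EX1_S (dbkey AL760)
    # flow records into <workspace>/S65E_total.csv, converted from cfs to m³/day.
    S65E_total.get("/tmp/loone_workspace", date_min="2020-01-01", date_max="2023-06-30")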
@@ -0,0 +1,159 @@
+ import sys
+ import os
+ from glob import glob
+ from datetime import datetime
+ import uuid
+ import pandas as pd
+ from loone_data_prep.flow_data import hydro, S65E_total
+ from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
+
+
+ # Database keys for needed inflow data mapped to their stations
+ DBKEYS = {
+     "91370": "S127_C",
+     "91371": "S127_P",
+     "91373": "S129_C",
+     "91377": "S133_P",
+     "91379": "S135_C",
+     "91401": "S154_C",
+     "91429": "S191_S",
+     "91473": "S2_P",
+     "91508": "S351_S",
+     "91510": "S352_S",
+     "91513": "S354_S",
+     "91599": "S3_P",
+     "91608": "S4_P",
+     "91656": "S65E_S",
+     "91668": "S71_S",
+     "91675": "S72_S",
+     "91687": "S84_S",
+     "15627": "FISHP",
+     "15640": "L8.441",
+     "15626": "S308.DS",
+     "15642": "S129 PMP_P",
+     "15638": "S135 PMP_P",
+ }
+
+ def main(workspace: str, dbkeys: dict = DBKEYS) -> dict:
+     """
+     Retrieve the inflow data used by LOONE.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+
+     Returns:
+         dict: Success or error message
+     """
+
+     # Retrieve inflow data
+     for dbkey, station in dbkeys.copy().items():
+         file_name = f"{station}_FLOW_cmd.csv"
+         date_latest = find_last_date_in_csv(workspace, file_name)
+
+         # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+         if date_latest is None:
+             # Download all the data
+             print(f'Downloading all inflow data for {station}')
+             hydro.get(workspace, dbkey)
+         else:
+             # Check whether the latest data is already up to date.
+             if dbhydro_data_is_latest(date_latest):
+                 # Notify that the data is already up to date
+                 print(f'Downloading of new inflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                 # Remove dbkey from dbkeys so we know it didn't fail
+                 del dbkeys[dbkey]
+                 continue
+
+             # Download only the new data
+             print(f'Downloading new inflow data for {station} starting from date {date_latest}')
+             hydro.get(workspace, dbkey, date_latest)
+
+             # Make sure both our original data and newly downloaded data exist
+             df_original_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+             df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+             if os.path.exists(df_original_path) and os.path.exists(df_new_path):
+                 # Merge the new data with the old data
+                 df_original = pd.read_csv(df_original_path, index_col=0)
+                 df_new = pd.read_csv(df_new_path, index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write the merged data to the new file
+                 df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                 # Remove the old file
+                 os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
+
+     # Download S65E_total.csv Data
+     date_latest = find_last_date_in_csv(workspace, "S65E_total.csv")
+
+     if date_latest is None:
+         print('Downloading all S65E_total data')
+         S65E_total.get(workspace, date_max=datetime.now().strftime("%Y-%m-%d"))
+     else:
+         # Check whether the latest data is already up to date.
+         if dbhydro_data_is_latest(date_latest):
+             # Notify that the data is already up to date
+             print('Downloading of new inflow data skipped for S65E_total. Data is already up to date.')
+         else:
+             # Temporarily rename current data file so it isn't overwritten
+             original_file_name = f"S65E_total_old_{uuid.uuid4()}.csv"
+             os.rename(os.path.join(workspace, "S65E_total.csv"), os.path.join(workspace, original_file_name))
+
+             try:
+                 # Download only the new data
+                 print(f'Downloading new S65E_total data starting from date {date_latest}')
+                 S65E_total.get(workspace, date_min=date_latest, date_max=datetime.now().strftime("%Y-%m-%d"))
+
+                 # Merge the new data with the original data
+                 df_original = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                 df_new = pd.read_csv(os.path.join(workspace, "S65E_total.csv"), index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write out the merged data
+                 df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                 # Remove the newly downloaded data file
+                 os.remove(os.path.join(workspace, "S65E_total.csv"))
+             except Exception as e:
+                 print(f"Error occurred while downloading new S65E_total data: {e}")
+             finally:
+                 # Rename the original updated file back to its original name
+                 os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, "S65E_total.csv"))
+
+     # Check if all files were downloaded
+     files = glob(f"{workspace}/*FLOW*_cmd.csv")
+
+     for file in files:
+         file_dbkey = file.split("_")[-2]
+
+         if file_dbkey in dbkeys:
+             # Remove dbkey from file name
+             new_file_name = file.replace(f"_{file_dbkey}", "")
+             os.rename(file, new_file_name)
+
+             # Remove dbkey from dbkeys so we know it successfully downloaded
+             del dbkeys[file_dbkey]
+
+     # Check for failed downloads
+     if len(dbkeys) > 0 or not os.path.exists(f"{workspace}/S65E_total.csv"):
+         error_message = ""
+
+         # dbkeys
+         if len(dbkeys) > 0:
+             error_message += f"The data from the following dbkeys could not be downloaded: {list(dbkeys.keys())}\n"
+
+         # S65E_total.csv
+         if not os.path.exists(f"{workspace}/S65E_total.csv"):
+             error_message += "S65E_total.csv file could not be downloaded.\n"
+
+         return {"error": error_message}
+
+     return {"success": "Completed inflow flow data download."}
+
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     main(workspace)
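
Note on loone_data_prep/flow_data/get_inflows.py (new file above): main() performs incremental updates. It reads the last date already present in each station's CSV via find_last_date_in_csv, skips stations that are current according to dbhydro_data_is_latest, and otherwise downloads and merges only the newer rows. A sketch of calling it with a reduced dbkey set; the two dbkeys come from the DBKEYS table above and the workspace path is illustrative:

    from loone_data_prep.flow_data import get_inflows

    # Restrict the download to two stations. Passing a fresh dict also avoids
    # mutating the module-level DBKEYS default, which main() deletes keys from.
    result = get_inflows.main("/tmp/loone_workspace", dbkeys={"91656": "S65E_S", "91429": "S191_S"})

    if "error" in result:
        print(result["error"])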
@@ -0,0 +1,164 @@
+ import sys
+ import os
+ from glob import glob
+ import pandas as pd
+ from loone_data_prep.utils import get_dbkeys, find_last_date_in_csv, dbhydro_data_is_latest
+ from loone_data_prep.flow_data import hydro
+
+ STATION_IDS = [
+     "S308.DS",
+     "S77_S",
+     "L8.441",
+     "S127_C",
+     "S129_C",
+     "S135_C",
+     "S351_S",
+     "S352_S",
+     "S354_S",
+     "INDUST",
+     "S79",
+     "S80_S",
+     "S2_NNR",
+     "S3",
+     "S48_S",
+     "S49_S",
+ ]
+
+
+ DBKEYS = {
+     "91370": "S127_C",
+     "91373": "S129_C",
+     "91379": "S135_C",
+     "91508": "S351_S",
+     "91510": "S352_S",
+     "91513": "S354_S",
+     "91677": "S77_S",
+     "15628": "INDUST",
+     "15640": "L8.441",
+     "15626": "S308.DS",
+     "00865": "S79_TOT",
+     "JW224": "S80_S",
+     "00436": "S2 NNR",
+     "15018": "S3",
+     "91606": "S48_S",
+     "JW223": "S49_S",
+ }
+
+
+ def _get_outflow_data_from_station_ids(workspace: str, station_ids: list) -> dict:
+     """Attempt to download outflow data from station ids.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         station_ids (list): List of station ids to download data for.
+
+     Returns:
+         dict: Success or error message
+     """
+     # Get dbkeys from station ids
+     dbkeys = list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "PREF", detail_level="dbkey"))
+     dbkeys.extend(list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "DRV", detail_level="dbkey")))
+
+     for dbkey in dbkeys:
+         hydro.get(workspace, dbkey, "2000-01-01")
+
+     # Check if all files were downloaded
+     files = glob(f"{workspace}/*FLOW*_cmd.csv")
+
+     for file in files:
+         file_dbkey = file.split("_")[-2]
+
+         if file_dbkey in dbkeys:
+             # Remove dbkey from file name
+             new_file_name = file.replace(f"_{file_dbkey}", "")
+             os.rename(file, new_file_name)
+
+             # Remove dbkey from dbkeys so we know it successfully downloaded
+             dbkeys.remove(file_dbkey)
+
+     if len(dbkeys) > 0:
+         return {"error": f"The data from the following dbkeys could not be downloaded: {dbkeys}"}
+
+     return {"success": "Completed outflow flow data download."}
+
+
+ def main(workspace: str, dbkeys: dict = DBKEYS, station_ids: list = STATION_IDS) -> dict:
+     """
+     Retrieve the outflow data used by LOONE.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+         station_ids (list): List of station ids to download data for if the dbkeys argument is not provided.
+
+     Returns:
+         dict: Success or error message
+     """
+
+     # No dbkeys given, attempt to get data from station ids
+     if dbkeys is None:
+         return _get_outflow_data_from_station_ids(workspace, station_ids)
+
+     # Get outflow data from dbkeys
+     for dbkey, station in dbkeys.copy().items():
+         # Get the date of the latest data in the csv file (if any)
+         date_latest = find_last_date_in_csv(workspace, f"{station}_FLOW_cmd.csv")
+
+         # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+         if date_latest is None:
+             # Download all data
+             print(f'Downloading all outflow data for {station}')
+             hydro.get(workspace, dbkey, "2000-01-01")
+         else:
+             # Check whether the latest data is already up to date.
+             if dbhydro_data_is_latest(date_latest):
+                 # Notify that the data is already up to date
+                 print(f'Downloading of new outflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                 # Remove dbkey from dbkeys so we know it didn't fail
+                 del dbkeys[dbkey]
+                 continue
+
+             # Download only the new data
+             print(f'Downloading new outflow data for {station} starting from date {date_latest}')
+             hydro.get(workspace, dbkey, date_latest)
+
+             # Make sure both our original data and newly downloaded data exist
+             df_old_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+             df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+             if os.path.exists(df_old_path) and os.path.exists(df_new_path):
+                 # Merge the new data with the old data
+                 df_original = pd.read_csv(df_old_path, index_col=0)
+                 df_new = pd.read_csv(df_new_path, index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write the merged data to the new file
+                 df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                 # Remove the old file
+                 os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
+
+     # Check if all files were downloaded
+     files = glob(f"{workspace}/*FLOW*_cmd.csv")
+
+     for file in files:
+         file_dbkey = file.split("_")[-2]
+
+         if file_dbkey in dbkeys:
+             # Remove dbkey from file name
+             new_file_name = file.replace(f"_{file_dbkey}", "")
+             os.rename(file, new_file_name)
+
+             # Remove dbkey from dbkeys so we know it successfully downloaded
+             del dbkeys[file_dbkey]
+
+     if len(dbkeys) > 0:
+         return {"error": f"The data from the following dbkeys could not be downloaded: {dbkeys}"}
+
+     return {"success": "Completed outflow flow data download."}
+
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     main(workspace)
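
Note on loone_data_prep/flow_data/get_outflows.py (new file above): unlike get_inflows, main() accepts dbkeys=None and then falls back to resolving dbkeys from STATION_IDS through get_dbkeys. A sketch of both call styles, with an illustrative workspace path:

    from loone_data_prep.flow_data import get_outflows

    # Preferred: explicit dbkey -> station mapping (a copy of the module default).
    result = get_outflows.main("/tmp/loone_workspace", dbkeys=dict(get_outflows.DBKEYS))

    # Fallback: resolve PREF/DRV FLOW dbkeys for STATION_IDS via get_dbkeys.
    result = get_outflows.main("/tmp/loone_workspace", dbkeys=None)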
@@ -0,0 +1,155 @@
+ import sys
+ from datetime import datetime
+ from glob import glob
+ from retry import retry
+ import os
+ import pandas as pd
+ from rpy2.robjects import r
+ from rpy2.rinterface_lib.embedded import RRuntimeError
+
+
+ DATE_NOW = datetime.now().strftime("%Y-%m-%d")
+
+
+ @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
+ def get(
+     workspace: str,
+     dbkey: str,
+     date_min: str = "1990-01-01",
+     date_max: str = DATE_NOW
+ ) -> None:
+     r_str = f"""
+     download_flow_data <- function(workspace, dbkey, date_min, date_max)
+     {{
+         # Load the required libraries
+         library(dbhydroR)
+         library(dplyr)
+
+         # Retrieve data for the dbkey
+         data <- get_hydro(dbkey = "{dbkey}", date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
+
+         # Check if data is empty or contains only the "date" column
+         if (ncol(data) <= 1) {{
+             cat("No data found for dbkey", "{dbkey}", "Skipping to the next dbkey.\n")
+         }}
+
+         # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+         colnames(data) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+         # Check if the data.frame has any rows
+         if (nrow(data) == 0)
+         {{
+             # No data given back; it's possible that the dbkey has reached its end date.
+             print(paste("Empty data.frame returned for dbkey", "{dbkey}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
+             return(list(success = FALSE, dbkey = "{dbkey}"))
+         }}
+
+         # Add a type and units column to data so it can be cleaned using the clean_hydro function
+         data$type <- "FLOW"
+         data$units <- "cfs"
+
+         # Get the station
+         station <- data$station[1]
+
+         # Clean the data.frame
+         data <- clean_hydro(data)
+
+         # Multiply all columns except "date" column by 0.0283168466 * 86400 to convert Flow rate from cfs to m³/day
+         data[, -1] <- data[, -1] * (0.0283168466 * 86400)
+
+         # Drop the " _FLOW_cfs" column
+         data <- data %>% select(-` _FLOW_cfs`)
+
+         # Sort the data by date
+         data <- data[order(data$date), ]
+
+         # Get the filename for the output CSV file
+         filename <- paste0(station, "_FLOW", "_{dbkey}_cmd.csv")
+
+         # Save data to a CSV file
+         write.csv(data, file = paste0("{workspace}/", filename))
+
+         # Print a message indicating the file has been saved
+         cat("CSV file", filename, "has been saved.\n")
+
+         # Add a delay between requests
+         Sys.sleep(1) # Wait for 1 second before the next iteration
+
+         # Return the station and dbkey to the python code
+         list(success = TRUE, station = station, dbkey = "{dbkey}")
+     }}
+     """
+
+     r(r_str)
+
+     # Call the R function to download the flow data
+     result = r.download_flow_data(workspace, dbkey, date_min, date_max)
+
+     # Check for failure
+     success = result.rx2("success")[0]
+
+     if not success:
+         return
+
+     # Get the station name for _reformat_flow_file()
+     station = result.rx2("station")[0]
+
+     # Reformat the flow data file to the expected layout
+     _reformat_flow_file(workspace, station, dbkey)
+
+     # Check if the station name contains a space
+     if " " in station:
+         # Replace space with underscore in the station name
+         station_previous = station
+         station = station.replace(" ", "_")
+
+         # Rename the file
+         os.rename(f"{workspace}/{station_previous}_FLOW_{dbkey}_cmd.csv", f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+
+     # Column values are converted to cmd in R. This snippet makes sure column names are updated accordingly.
+     file = glob(f'{workspace}/*FLOW*{dbkey}_cmd.csv')[0]
+     df = pd.read_csv(file, index_col=False)
+     df.columns = df.columns.astype(str).str.replace("_cfs", "_cmd")
+     df.to_csv(file, index=False)
+
+
+ def _reformat_flow_file(workspace: str, station: str, dbkey: str):
+     '''
+     Reformat the flow data file to the expected layout.
+     Converts the format of the dates in the file to 'YYYY-MM-DD' then sorts the data by date.
+     Reads and writes to a .CSV file.
+
+     Args:
+         workspace (str): The path to the workspace directory.
+         station (str): The station name.
+         dbkey (str): The dbkey for the station.
+
+     Returns:
+         None
+     '''
+     # Read in the data
+     df = pd.read_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+
+     # Grab only the columns we need
+     df = df[['date', f'{station}_FLOW_cfs']]
+
+     # Convert date column to datetime
+     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+     # Sort the data by date
+     df.sort_values('date', inplace=True)
+
+     # Renumber the index
+     df.reset_index(drop=True, inplace=True)
+
+     # Drop rows that are missing values for both the date and value columns
+     df = df.drop(df[(df['date'].isna()) & (df[f'{station}_FLOW_cfs'].isna())].index)
+
+     # Write the updated data back to the file
+     df.to_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+
+
+ if __name__ == "__main__":
+     workspace = sys.argv[1].rstrip("/")
+     dbkey = sys.argv[2]
+     get(workspace, dbkey)
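
Note on loone_data_prep/flow_data/hydro.py (new file above): get() is the shared single-dbkey downloader used by both get_inflows and get_outflows. It fetches the raw record through dbhydroR, converts cfs to m³/day, writes <station>_FLOW_<dbkey>_cmd.csv, and retries up to five times on RRuntimeError with exponential backoff. A minimal direct call; dbkey 91656 is S65E_S per the tables above, and the path is illustrative:

    from loone_data_prep.flow_data import hydro

    # Produces /tmp/loone_workspace/S65E_S_FLOW_91656_cmd.csv with dates
    # normalized to YYYY-MM-DD and values in m³/day.
    hydro.get("/tmp/loone_workspace", "91656", date_min="2015-01-01")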
@@ -663,6 +663,75 @@ def nutrient_prediction(
      out_dataframe.to_csv(os.path.join(input_dir, f"{station}_PHOSPHATE_predicted.csv"))


+ def find_last_date_in_csv(workspace: str, file_name: str) -> str:
+     """
+     Gets the most recent date from the last line of a .csv file.
+     Assumes the file is formatted as a .csv file, encoded in UTF-8,
+     and the rows in the file are sorted by date in ascending order.
+
+     Args:
+         workspace (str): The directory where the file is located.
+         file_name (str): The name of the file.
+
+     Returns:
+         str: The most recent date as a string in YYYY-MM-DD format, or None if the file does not exist or the date cannot be found.
+     """
+     # Helper Functions
+     def is_valid_date(date_string):
+         try:
+             datetime.datetime.strptime(date_string, '%Y-%m-%d')
+             return True
+         except ValueError:
+             return False
+
+     # Check that file exists
+     file_path = os.path.join(workspace, file_name)
+     if not os.path.exists(file_path):
+         return None
+
+     # Attempt to extract the date of the last line in the file
+     try:
+         with open(file_path, 'rb') as file:
+             # Go to the end of the file
+             file.seek(-2, os.SEEK_END)
+
+             # Loop backwards until you find the first newline character
+             while file.read(1) != b'\n':
+                 file.seek(-2, os.SEEK_CUR)
+
+             # Read the last line
+             last_line = file.readline().decode()
+
+             # Extract the date from the last line
+             date = None
+
+             for value in last_line.split(','):
+                 if is_valid_date(value):
+                     date = value
+                     break
+
+             # Return date
+             return date
+     except OSError as e:
+         print(f"Error reading file {file_name}: {e}")
+         return None
+
+
+ def dbhydro_data_is_latest(date_latest: str):
+     """
+     Checks whether the given date is the most recent date possible to get data from dbhydro.
+     Can be used to check whether dbhydro data is up-to-date.
+
+     Args:
+         date_latest (str): The date of the most recent dbhydro data you have.
+
+     Returns:
+         bool: True if date_latest is the most recent date possible to get data from dbhydro, False otherwise
+     """
+     date_latest_object = datetime.datetime.strptime(date_latest, "%Y-%m-%d").date()
+     return date_latest_object == (datetime.datetime.now().date() - datetime.timedelta(days=1))
+
+
  if __name__ == "__main__":
      if sys.argv[1] == "get_dbkeys":
          get_dbkeys(sys.argv[2].strip("[]").replace(" ", "").split(","), *sys.argv[3:])
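
Note on loone_data_prep/utils.py (hunk above): together these two helpers implement the "skip if current" check used throughout the new downloaders. find_last_date_in_csv seeks to the final row of a CSV without loading the whole file, and dbhydro_data_is_latest treats yesterday's date as the freshest data DBHYDRO can serve. A small sketch; the file name and workspace are illustrative:

    from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest

    date_latest = find_last_date_in_csv("/tmp/loone_workspace", "S65E_S_FLOW_cmd.csv")

    if date_latest is None:
        print("No existing file; download everything.")
    elif dbhydro_data_is_latest(date_latest):
        print("Already up to date; skip download.")
    else:
        print(f"Fetch new rows starting from {date_latest}.")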