loone-data-prep 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
      LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
      LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
      LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
-     LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
 
      # RFVol acft
      # Create File (RF_Volume)
@@ -418,7 +417,7 @@ def main(input_dir: str, output_dir: str) -> None:
      Stg_3A28 = pd.read_csv(f'{input_dir}/Stg_3A28.csv')
      Stg_3A28 = DF_Date_Range(Stg_3A28, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
      WCA_Stg = pd.DataFrame(Stg_3A28['date'], columns=['date'])
-     WCA_Stg['3A-NW'] = Stg_3ANW['3A-NW_STG_ft NGVD29'].values
+     WCA_Stg['3A-NW'] = Stg_3ANW.iloc[:, -1].values
      WCA_Stg['2A-17'] = Stg_2A17.iloc[:, -1].values
      WCA_Stg['3A-3'] = Stg_3A3.iloc[:, -1].values
      WCA_Stg['3A-4'] = Stg_3A4.iloc[:, -1].values
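
Note: switching to Stg_3ANW.iloc[:, -1] selects the stage value column by position rather than by its exported header name, matching how the neighboring WCA stations are already read. A minimal sketch of the idea, using the header from the removed line as a stand-in:

    import pandas as pd

    # DBHYDRO-style stage export whose value-column header may vary between downloads
    stg = pd.DataFrame({'date': ['2023-06-30'], '3A-NW_STG_ft NGVD29': [9.8]})

    # Position-based selection keeps working even if the header is renamed
    values = stg.iloc[:, -1].values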
@@ -897,7 +896,6 @@ def main(input_dir: str, output_dir: str) -> None:
      # Write Data into csv files
      # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
      LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
-     LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False) # Also needed in temporary directory by utils.py's wind_induced_waves()
      # Write S65 TP concentrations (mg/L)
      S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
      # TP External Loads 3 Months Lag (mg)
@@ -2,22 +2,87 @@ import sys
  from retry import retry
  from rpy2.robjects import r
  from rpy2.rinterface_lib.embedded import RRuntimeError
+ import pandas as pd
 
 
  @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
- def get(workspace):
+ def get(
+     workspace,
+     date_min: str = "1972-01-01",
+     date_max: str = "2023-06-30"
+ ) -> None:
      r(
          f"""
          # Load the required libraries
          library(dbhydroR)
-
-         #S65E_Total
-         S65E_total = get_hydro(dbkey = c("91656", "AL760"), date_min = "1972-01-01", date_max = "2023-06-30")
-         S65E_total[, -1] <- S65E_total[, -1] * (0.0283168466 * 86400)
-         write.csv(S65E_total,file ='{workspace}/S65E_total.csv')
+         library(dplyr)
+
+         # Helper Functions
+         retrieve_data <- function(dbkey, date_min, date_max)
+         {{
+             # Get the data from dbhydro
+             df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+             # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+             colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+             # Add a type and units column to data so it can be cleaned using the clean_hydro function
+             df$type <- "FLOW"
+             df$units <- "cfs"
+
+             # Clean the data.frame
+             df <- clean_hydro(df)
+
+             # Drop the " _FLOW_cfs" column
+             df <- df %>% select(-` _FLOW_cfs`)
+
+             # Convert Flow rate from cfs to m³/day
+             df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+             # Return resulting data.frame
+             return(df)
+         }}
+
+         # S65E_S
+         S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Wait five seconds before next request to avoid "too many requests" error
+         Sys.sleep(5)
+
+         # S65EX1_S
+         S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+         # Merge the data from each dbkey
+         result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+         # Write the data to a file
+         write.csv(result, file = '{workspace}/S65E_total.csv')
          """
      )
+
+     _reformat_s65e_total_file(workspace)
 
+ def _reformat_s65e_total_file(workspace: str):
+     # Read in the data
+     df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+     # Drop unused columns
+     df.drop('Unnamed: 0', axis=1, inplace=True)
+
+     # Convert date column to datetime
+     df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+     # Sort the data by date
+     df.sort_values('date', inplace=True)
+
+     # Renumber the index
+     df.reset_index(drop=True, inplace=True)
+
+     # Drop rows that are missing all their values
+     df.dropna(how='all', inplace=True)
+
+     # Write the updated data back to the file
+     df.to_csv(f"{workspace}/S65E_total.csv")
 
  if __name__ == "__main__":
      workspace = sys.argv[1].rstrip("/")
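
Note: the factor applied in retrieve_data converts mean daily flow from cfs to m³/day: 1 ft³ = 0.0283168466 m³ and one day is 86,400 s, so 1 cfs corresponds to roughly 2,446.58 m³/day. A hedged Python-side sketch of the same conversion (the helper name is illustrative, not part of the package):

    CFS_TO_M3_PER_DAY = 0.0283168466 * 86400  # ~= 2446.58

    def cfs_to_cmd(flow_cfs: float) -> float:
        # Convert a mean daily flow rate from cubic feet per second to cubic meters per day
        return flow_cfs * CFS_TO_M3_PER_DAY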
@@ -1,4 +1,5 @@
  import sys
+ import os
  import pandas as pd
  import geoglows
 
@@ -12,6 +13,7 @@ def get_bias_corrected_data(
      observed_data_path: str,
      station_ensembles: pd.DataFrame,
      station_stats: pd.DataFrame,
+     cache_path: str = None,
  ) -> dict:
      # Load the observed data from a CSV file
      observed_data = pd.read_csv(
@@ -34,7 +36,23 @@ def get_bias_corrected_data(
      prepared_od = prep_observed_data(observed_data)
 
      # Get the historical simulation data for the given reach ID
-     historical_data = geoglows.streamflow.historic_simulation(reach_id)
+     historical_data = None
+
+     if cache_path is None:
+         historical_data = geoglows.streamflow.historic_simulation(reach_id)
+     else:
+         # Create the geoglows cache directory if it doesn't exist
+         geoglows_cache_path = os.path.join(cache_path, 'geoglows_cache')
+         if not os.path.exists(geoglows_cache_path):
+             os.makedirs(geoglows_cache_path)
+
+         # Check if the historical simulation data is already cached
+         if os.path.exists(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv')):
+             historical_data = pd.read_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'), index_col=0)
+             historical_data.index = pd.to_datetime(historical_data.index)
+         else:
+             historical_data = geoglows.streamflow.historic_simulation(reach_id)
+             historical_data.to_csv(os.path.join(geoglows_cache_path, f'{reach_id}_historic_simulation.csv'))
 
      # Correct the forecast bias in the station ensembles
      station_ensembles = geoglows.bias.correct_forecast(
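
Note: the cache added here is a plain check-then-fetch pattern around geoglows.streamflow.historic_simulation. A minimal standalone sketch of the same idea, assuming any fetch callable that returns a date-indexed DataFrame (the helper name and paths are illustrative, not part of the package):

    import os
    import pandas as pd

    def cached_frame(cache_dir: str, name: str, fetch) -> pd.DataFrame:
        # Return the cached DataFrame if present; otherwise fetch it and cache it
        os.makedirs(cache_dir, exist_ok=True)
        path = os.path.join(cache_dir, f"{name}.csv")
        if os.path.exists(path):
            df = pd.read_csv(path, index_col=0)
            df.index = pd.to_datetime(df.index)
            return df
        df = fetch()
        df.to_csv(path)
        return df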
@@ -366,6 +366,7 @@ def main(
      forecast_date: str = FORECAST_DATE,
      bias_corrected: bool = False,
      observed_data_dir: str | None = None,
+     cache_path: str | None = None,
  ):
      """Downloads the flow forecasts for the given station ids and writes them
      out as .csv files.
@@ -379,6 +380,8 @@ def main(
              Default is False.
          observed_data_dir (str): The path to the observed flow data directory
              (only needed if bias_corrected is True).
+         cache_path (str): The path to the cache directory for geoglows data.
+             Should hold a directory named geoglows_cache that holds the cached files. Use None to not use a cache.
      """
      # Local Variables
      reach_ids = {}
@@ -428,6 +431,7 @@ def main(
          observed_data_path,
          station_ensembles,
          station_stats,
+         cache_path,
      )
 
      ensembles_to_csv(
@@ -1,43 +1,128 @@
  import sys
  import os
  from glob import glob
+ from datetime import datetime
+ import uuid
+ import pandas as pd
  from loone_data_prep.flow_data import hydro, S65E_total
+ from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
- # Database keys for needed inflow data
- DBKEYS = [
-     "91370",
-     "91371",
-     "91373",
-     "91377",
-     "91379",
-     "91401",
-     "91429",
-     "91473",
-     "91508",
-     "91510",
-     "91513",
-     "91599",
-     "91608",
-     "91656",
-     "91668",
-     "91675",
-     "91687",
-     "15627",
-     "15640",
-     "15626",
-     "15642",
-     "15638",
- ]
+ # Database keys for needed inflow data mapped to their stations
+ DBKEYS = {
+     "91370": "S127_C",
+     "91371": "S127_P",
+     "91373": "S129_C",
+     "91377": "S133_P",
+     "91379": "S135_C",
+     "91401": "S154_C",
+     "91429": "S191_S",
+     "91473": "S2_P",
+     "91508": "S351_S",
+     "91510": "S352_S",
+     "91513": "S354_S",
+     "91599": "S3_P",
+     "91608": "S4_P",
+     "91656": "S65E_S",
+     "91668": "S71_S",
+     "91675": "S72_S",
+     "91687": "S84_S",
+     "15627": "FISHP",
+     "15640": "L8.441",
+     "15626": "S308.DS",
+     "15642": "S129 PMP_P",
+     "15638": "S135 PMP_P",
+ }
 
-
- def main(workspace: str, dbkeys: list = DBKEYS) -> dict:
+ def main(workspace: str, dbkeys: dict = DBKEYS) -> dict:
+     """
+     Retrieve the inflows data used by LOONE.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+
+     Returns:
+         dict: Success or error message
+     """
+
      # Retrieve inflow data
-     for dbkey in dbkeys:
-         hydro.get(workspace, dbkey)
+     for dbkey, station in dbkeys.copy().items():
+         file_name = f"{station}_FLOW_cmd.csv"
+         date_latest = find_last_date_in_csv(workspace, file_name)
+
+         # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+         if date_latest is None:
+             # Download all the data
+             print(f'Downloading all inflow data for {station}')
+             hydro.get(workspace, dbkey)
+         else:
+             # Check whether the latest data is already up to date.
+             if dbhydro_data_is_latest(date_latest):
+                 # Notify that the data is already up to date
+                 print(f'Downloading of new inflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                 # Remove dbkey from dbkeys so we know it didn't fail
+                 del dbkeys[dbkey]
+                 continue
+
+             # Download only the new data
+             print(f'Downloading new inflow data for {station} starting from date {date_latest}')
+             hydro.get(workspace, dbkey, date_latest)
+
+             # Make sure both our original data and newly downloaded data exist
+             df_original_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+             df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+             if os.path.exists(df_original_path) and os.path.exists(df_new_path):
+                 # Merge the new data with the old data
+                 df_original = pd.read_csv(df_original_path, index_col=0)
+                 df_new = pd.read_csv(df_new_path, index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write the merged data to the new file
+                 df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                 # Remove the old file
+                 os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
 
-     S65E_total.get(workspace)
+     # Download S65E_total.csv Data
+     date_latest = find_last_date_in_csv(workspace, "S65E_total.csv")
 
+     if date_latest is None:
+         print('Downloading all S65E_total data')
+         S65E_total.get(workspace, date_max=datetime.now().strftime("%Y-%m-%d"))
+     else:
+         # Check whether the latest data is already up to date.
+         if dbhydro_data_is_latest(date_latest):
+             # Notify that the data is already up to date
+             print(f'Downloading of new inflow data skipped for S65E_total. Data is already up to date.')
+         else:
+             # Temporarily rename current data file so it isn't over written
+             original_file_name = F"S65E_total_old_{uuid.uuid4()}.csv"
+             os.rename(os.path.join(workspace, "S65E_total.csv"), os.path.join(workspace, original_file_name))
+
+             try:
+                 # Download only the new data
+                 print(f'Downloading new S65E_total data starting from date {date_latest}')
+                 S65E_total.get(workspace, date_min=date_latest, date_max=datetime.now().strftime("%Y-%m-%d"))
+
+                 # Merge the new data with the original data
+                 df_original = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                 df_new = pd.read_csv(os.path.join(workspace, "S65E_total.csv"), index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write out the merged data
+                 df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                 # Remove the newly downloaded data file
+                 os.remove(os.path.join(workspace, "S65E_total.csv"))
+             except Exception as e:
+                 print(f"Error occurred while downloading new S65E_total data: {e}")
+             finally:
+                 # Rename the original updated file back to its original name
+                 os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, "S65E_total.csv"))
+
      # Check if all files were downloaded
      files = glob(f"{workspace}/*FLOW*_cmd.csv")
 
@@ -50,17 +135,21 @@ def main(workspace: str, dbkeys: list = DBKEYS) -> dict:
              os.rename(file, new_file_name)
 
              # Remove dbkey from dbkeys so we know it successfully downloaded
-             dbkeys.remove(file_dbkey)
-
-     if len(dbkeys) > 0:
-         return {
-             "error": (
-                 "The data from the following dbkeys could not be "
-                 f"downloaded: {dbkeys}"
-             )
-         }
-     elif not os.path.exists(f"{workspace}/S65E_total.csv"):
-         return {"error": "S65E_total.csv file could not be downloaded."}
+             del dbkeys[file_dbkey]
+
+     # Check for failed downloads
+     if len(dbkeys) > 0 or not os.path.exists(f"{workspace}/S65E_total.csv"):
+         error_message = ""
+
+         # dbkeys
+         if len(dbkeys) > 0:
+             error_message += f"The data from the following dbkeys could not be downloaded: {list(dbkeys.keys())}\n"
+
+         # S65E_total.csv
+         if not os.path.exists(f"{workspace}/S65E_total.csv"):
+             error_message += "S65E_total.csv file could not be downloaded.\n"
+
+         return {"error": error_message}
 
      return {"success": "Completed inflow flow data download."}
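
Note: the incremental-download path added in this file appends newly downloaded rows onto the existing station file with pd.concat. A hedged variant that also drops dates present in both files, assuming the package's CSVs carry a 'date' column (the function name is illustrative):

    import pandas as pd

    def merge_flow_csvs(old_path: str, new_path: str) -> pd.DataFrame:
        # Combine existing and newly downloaded rows, keeping the newer row
        # when the same date appears in both files
        df_old = pd.read_csv(old_path, index_col=0)
        df_new = pd.read_csv(new_path, index_col=0)
        merged = pd.concat([df_old, df_new], ignore_index=True)
        merged = merged.drop_duplicates(subset='date', keep='last')
        return merged.reset_index(drop=True)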
 
@@ -1,7 +1,8 @@
  import sys
  import os
  from glob import glob
- from loone_data_prep.utils import get_dbkeys
+ import pandas as pd
+ from loone_data_prep.utils import get_dbkeys, find_last_date_in_csv, dbhydro_data_is_latest
  from loone_data_prep.flow_data import hydro
 
  STATION_IDS = [
@@ -24,33 +25,39 @@ STATION_IDS = [
  ]
 
 
- DBKEYS = [
-     "91370",
-     "91373",
-     "91379",
-     "91508",
-     "91510",
-     "91513",
-     "91677",
-     "15628",
-     "15640",
-     "15626",
-     "00865",
-     "JW224",
-     "00436",
-     "15018",
-     "91606",
-     "JW223",
- ]
-
+ DBKEYS = {
+     "91370": "S127_C",
+     "91373": "S129_C",
+     "91379": "S135_C",
+     "91508": "S351_S",
+     "91510": "S352_S",
+     "91513": "S354_S",
+     "91677": "S77_S",
+     "15628": "INDUST",
+     "15640": "L8.441",
+     "15626": "S308.DS",
+     "00865": "S79_TOT",
+     "JW224": "S80_S",
+     "00436": "S2 NNR",
+     "15018": "S3",
+     "91606": "S48_S",
+     "JW223": "S49_S",
+ }
 
- def main(workspace: str, dbkeys: list = DBKEYS, station_ids: list = STATION_IDS) -> dict:
-     # Retrieve outflow data
 
-     if dbkeys is None:
-         # Get dbkeys from station ids
-         dbkeys = list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "PREF", detail_level="dbkey"))
-         dbkeys.extend(list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "DRV", detail_level="dbkey")))
+ def _get_outflow_data_from_station_ids(workspace: str, station_ids: list) -> dict:
+     """Attempt to download outflow data from station ids.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         station_ids (list): List of station ids to download data for.
+
+     Returns:
+         dict: Success or error message
+     """
+     # Get dbkeys from station ids
+     dbkeys = list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "PREF", detail_level="dbkey"))
+     dbkeys.extend(list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "DRV", detail_level="dbkey")))
 
      for dbkey in dbkeys:
          hydro.get(workspace, dbkey, "2000-01-01")
@@ -75,6 +82,83 @@ def main(workspace: str, dbkeys: list = DBKEYS, station_ids: list = STATION_IDS)
      return {"success": "Completed outflow flow data download."}
 
 
+ def main(workspace: str, dbkeys: dict = DBKEYS, station_ids: list = STATION_IDS) -> dict:
+     """
+     Retrieve the outflow data used by LOONE.
+
+     Args:
+         workspace (str): Path to workspace where data will be downloaded.
+         dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+         station_ids (list): List of station ids to download data for if the dbkeys argument is not provided.
+
+     Returns:
+         dict: Success or error message
+     """
+
+     # No dbkeys given, attempt to get data from station ids
+     if dbkeys is None:
+         return _get_outflow_data_from_station_ids(workspace, station_ids)
+
+     # Get outflow data from dbkeys
+     for dbkey, station in dbkeys.copy().items():
+         # Get the date of the latest data in the csv file (if any)
+         date_latest = find_last_date_in_csv(workspace, f"{station}_FLOW_cmd.csv")
+
+         # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+         if date_latest is None:
+             # Download all data
+             print(f'Downloading all outflow data for {station}')
+             hydro.get(workspace, dbkey, "2000-01-01")
+         else:
+             # Check whether the latest data is already up to date.
+             if dbhydro_data_is_latest(date_latest):
+                 # Notify that the data is already up to date
+                 print(f'Downloading of new outflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                 # Remove dbkey from dbkeys so we know it didn't fail
+                 del dbkeys[dbkey]
+                 continue
+
+             # Download only the new data
+             print(f'Downloading new outflow data for {station} starting from date {date_latest}')
+             hydro.get(workspace, dbkey, date_latest)
+
+             # Make sure both our original data and newly downloaded data exist
+             df_old_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+             df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+             if os.path.exists(df_old_path) and os.path.exists(df_new_path):
+                 # Merge the new data with the old data
+                 df_original = pd.read_csv(df_old_path, index_col=0)
+                 df_new = pd.read_csv(df_new_path, index_col=0)
+                 df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                 # Write the merged data to the new file
+                 df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                 # Remove the old file
+                 os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
+
+     # Check if all files were downloaded
+     files = glob(f"{workspace}/*FLOW*_cmd.csv")
+
+     for file in files:
+         file_dbkey = file.split("_")[-2]
+
+         if file_dbkey in dbkeys:
+             # Remove dbkey from file name
+             new_file_name = file.replace(f"_{file_dbkey}", "")
+             os.rename(file, new_file_name)
+
+             # Remove dbkey from dbkeys so we know it successfully downloaded
+             del dbkeys[file_dbkey]
+
+     if len(dbkeys) > 0:
+         return {"error": f"The data from the following dbkeys could not be downloaded: {dbkeys}"}
+
+     return {"success": "Completed outflow flow data download."}
+
+
  if __name__ == "__main__":
      workspace = sys.argv[1].rstrip("/")
      main(workspace)
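
Note: with DBKEYS now a dbkey-to-station mapping, both the inflow and outflow entry points can be pointed at a subset of stations. A minimal usage sketch (the workspace path is a placeholder, the dbkey/station pair is taken from the mapping above):

    # Download only the S77_S outflow record into a scratch workspace
    result = main('/tmp/loone_workspace', dbkeys={'91677': 'S77_S'})
    print(result)  # {'success': ...} on success, {'error': ...} otherwise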