loone-data-prep 0.1.8__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registries.
@@ -22,9 +22,9 @@ D2_D = 30
  St_Yr = 2008
  St_M = 1
  St_D = 1
- En_Yr = 2023
- En_M = 3
- En_D = 31
+ En_Yr = 2024
+ En_M = 9
+ En_D = 30
 
  # Tp Concentrations Dataframe
  TP_df = None
@@ -545,7 +545,7 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_TP_data_Inter['Mean_TP'] = LO_TP_data_Inter.mean(axis=1, numeric_only=True)
  LO_TP_data_Inter = LO_TP_data_Inter.set_index(['date'])
  LO_TP_data_Inter.index = pd.to_datetime(LO_TP_data_Inter.index, unit='ns')
- LO_TP_Monthly_Inter = LO_TP_data_Inter.resample('M').mean()
+ LO_TP_Monthly_Inter = LO_TP_data_Inter.resample('ME').mean()
  Max = LO_TP_Monthly_Inter.max(axis=1)
  Min = LO_TP_Monthly_Inter.min(axis=1)
  LO_TP_Monthly_Inter['Max'] = Max.values
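The switch from resample('M') to resample('ME') here (and in the NH4 hunk below) tracks pandas 2.2, which deprecated the 'M' alias in favor of 'ME' for month-end resampling; the binning itself is unchanged. A standalone sketch of the new alias, independent of the package's data:

    import pandas as pd

    # Daily dummy series spanning two months
    idx = pd.date_range("2024-01-01", "2024-02-29", freq="D")
    s = pd.Series(range(len(idx)), index=idx)

    # 'ME' is the month-end alias; on pandas >= 2.2 it replaces the deprecated 'M'
    monthly = s.resample("ME").mean()
    print(monthly.index)  # month-end labels: 2024-01-31, 2024-02-29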
@@ -624,7 +624,7 @@ def main(input_dir: str, output_dir: str) -> None:
  LO_NH4_Clean_Inter.to_csv(f'{output_dir}/LO_NH4_Clean_daily.csv', index=False)
  LO_NH4_Clean_Inter = LO_NH4_Clean_Inter.set_index(['date'])
  LO_NH4_Clean_Inter.index = pd.to_datetime(LO_NH4_Clean_Inter.index, unit='ns')
- LO_NH4_Monthly_Inter = LO_NH4_Clean_Inter.resample('M').mean()
+ LO_NH4_Monthly_Inter = LO_NH4_Clean_Inter.resample('ME').mean()
  LO_NH4_Monthly_Inter.to_csv(f'{output_dir}/LO_NH4_Monthly_Inter.csv')
 
  # Interpolated NO Observations in Lake
@@ -967,6 +967,7 @@ def main(input_dir: str, output_dir: str) -> None:
  NO_list = {'S65_NO': S65_NO, 'S71_NO': S71_NO, 'S72_NO': S72_NO, 'S84_NO': S84_NO, 'S127_NO': S127_NO,
  'S133_NO': S133_NO, 'S154_NO': S154_NO, 'S191_NO': S191_NO, 'S308_NO': S308_NO,
  'FISHP_NO': FISHP_NO, 'L8_NO': L8_NO, 'S4_NO': S4_NO}
+ #TODO: Why is this date hard coded into this part?
  date_NO = pd.date_range(start='1/1/2008', end='3/31/2023', freq='D')
 
  NO_df = pd.DataFrame(date_NO, columns=['date'])
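The TODO above flags that date_NO is still built from the literal '1/1/2008' and '3/31/2023' strings even though the module-level start/end constants were moved to September 2024 earlier in this diff. A hypothetical sketch of deriving the range from those constants instead (illustrative only, not what the released code does):

    import pandas as pd

    # Mirror the module-level constants changed at the top of this diff
    St_Yr, St_M, St_D = 2008, 1, 1
    En_Yr, En_M, En_D = 2024, 9, 30

    # Hypothetical replacement for the hard-coded range
    date_NO = pd.date_range(start=f"{St_M}/{St_D}/{St_Yr}",
                            end=f"{En_M}/{En_D}/{En_Yr}", freq="D")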
@@ -982,32 +983,48 @@ def main(input_dir: str, output_dir: str) -> None:
  Flow_df = DF_Date_Range(Flow_df, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
 
  # Determine NO Loads
- NO_Loads_In = pd.DataFrame(date_NO, columns=['date'])
- NO_Loads_In['S65_NO_Ld'] = Flow_df['S65_Q'].values * NO_df['S65_NO'].values * 1000
- NO_Loads_In['S71_NO_Ld'] = Flow_df['S71_Q'].values * NO_df['S71_NO'].values * 1000
- NO_Loads_In['S72_NO_Ld'] = Flow_df['S72_Q'].values * NO_df['S72_NO'].values * 1000
- NO_Loads_In['S84_NO_Ld'] = Flow_df['S84_Q'].values * NO_df['S84_NO'].values * 1000
- NO_Loads_In['S127_NO_Ld'] = Flow_df['S127_In'].values * NO_df['S127_NO'].values * 1000
- NO_Loads_In['S133_NO_Ld'] = Flow_df['S133_P_Q'].values * NO_df['S133_NO'].values * 1000
+ # Ensure 'date' is datetime
+ NO_df['date'] = pd.to_datetime(NO_df['date'])
+ Flow_df['date'] = pd.to_datetime(Flow_df['date'])
+
+ # Merge the two dataframes on date - this will ensure that the dates match
+ merged = pd.merge(NO_df, Flow_df, on='date', how='inner')
+
+ # Compute NO Loads
+ NO_Loads_In = merged[['date']].copy()
+ NO_Loads_In['S65_NO_Ld'] = merged['S65_Q'] * merged['S65_NO'] * 1000
+ NO_Loads_In['S71_NO_Ld'] = merged['S71_Q'] * merged['S71_NO'] * 1000
+ NO_Loads_In['S71_NO_Ld'] = merged['S71_Q'] * merged['S71_NO'] * 1000
+ NO_Loads_In['S72_NO_Ld'] = merged['S72_Q'] * merged['S72_NO'] * 1000
+ NO_Loads_In['S84_NO_Ld'] = merged['S84_Q'] * merged['S84_NO'] * 1000
+ NO_Loads_In['S127_NO_Ld'] = merged['S127_In'] * merged['S127_NO'] * 1000
+ NO_Loads_In['S133_NO_Ld'] = merged['S133_P_Q'] * merged['S133_NO'] * 1000
  # NO_Loads_In['S135_NO_Ld'] = Flow_df['S135_In'].values * NO_df['S135_NO'].values * 1000
- NO_Loads_In['S154_NO_Ld'] = Flow_df['S154_Q'].values * NO_df['S154_NO'].values * 1000
- NO_Loads_In['S191_NO_Ld'] = Flow_df['S191_Q'].values * NO_df['S191_NO'].values * 1000
- NO_Loads_In['S308_NO_Ld'] = Flow_df['S308_In'].values * NO_df['S308_NO'].values * 1000
- NO_Loads_In['FISHP_NO_Ld'] = Flow_df['FISHP_Q'].values * NO_df['FISHP_NO'].values * 1000
- NO_Loads_In['L8_NO_Ld'] = Flow_df['L8_In'].values * NO_df['L8_NO'].values * 1000
- NO_Loads_In['S4_NO_Ld'] = Flow_df['S4_P_Q'].values * NO_df['S4_NO'].values * 1000
+ NO_Loads_In['S154_NO_Ld'] = merged['S154_Q'] * merged['S154_NO'] * 1000
+ NO_Loads_In['S191_NO_Ld'] = merged['S191_Q'] * merged['S191_NO'] * 1000
+ NO_Loads_In['S308_NO_Ld'] = merged['S308_In'] * merged['S308_NO'] * 1000
+ NO_Loads_In['FISHP_NO_Ld'] = merged['FISHP_Q'] * merged['FISHP_NO'] * 1000
+ NO_Loads_In['L8_NO_Ld'] = merged['L8_In'] * merged['L8_NO'] * 1000
+ NO_Loads_In['S4_NO_Ld'] = merged['S4_P_Q'] * merged['S4_NO'] * 1000
  # Calculate the total External Loads to Lake Okeechobee
  NO_Loads_In['External_NO_Ld_mg'] = NO_Loads_In.sum(axis=1, numeric_only=True)
  NO_Loads_In.to_csv(f'{output_dir}/LO_External_Loadings_NO.csv', index=False)
 
  # Determine Chla Loads
  # Create File (Chla_Loads_In)
+ # Read and date-filter Chla data
  S65E_Chla = pd.read_csv(f'{output_dir}/S65E_Chla_Merged.csv')
+ S65E_Chla['date'] = pd.to_datetime(S65E_Chla['date']) # Ensure date column is datetime
  S65E_Chla = DF_Date_Range(S65E_Chla, St_Yr, St_M, St_D, En_Yr, En_M, En_D)
- Chla_Loads_In = pd.DataFrame(date_NO, columns=['date'])
- Chla_Loads_In['Chla_Loads'] = Flow_df['Inflows'].values * S65E_Chla['Data'].values
+ # Merge on date
+ merged = pd.merge(Flow_df[['date', 'Inflows']], S65E_Chla[['date', 'Data']], on='date', how='inner')
+ # Calculate Chlorophyll-a loads
+ merged['Chla_Loads'] = merged['Inflows'] * merged['Data']
+ # Save results
+ Chla_Loads_In = merged[['date', 'Chla_Loads']]
  Chla_Loads_In.to_csv(f'{output_dir}/Chla_Loads_In.csv', index=False)
 
+
  # Write Data into csv files
  # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
  LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
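The load calculation above now joins the concentration and flow tables on date before multiplying, instead of multiplying positionally with .values, which misaligns rows (or raises a length error) whenever the two frames cover different date ranges. A minimal standalone sketch of the pattern with made-up values:

    import pandas as pd

    conc = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=5, freq="D"),
                         "S65_NO": [0.10, 0.20, 0.15, 0.12, 0.18]})
    flow = pd.DataFrame({"date": pd.date_range("2024-01-03", periods=5, freq="D"),
                         "S65_Q": [100.0, 120.0, 90.0, 110.0, 95.0]})

    # An inner join keeps only the overlapping days, so the element-wise
    # product is always computed on matching dates.
    merged = pd.merge(conc, flow, on="date", how="inner")
    loads = merged[["date"]].copy()
    loads["S65_NO_Ld"] = merged["S65_Q"] * merged["S65_NO"] * 1000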
@@ -37,47 +37,65 @@ def get_bias_corrected_data(
 
  # Prepare the observed data by filling NaN values with the 10yr average
  prepared_od = prep_observed_data(observed_data)
-
- # Get the historical simulation data for the given reach ID
- historical_data = None
-
- if cache_path is None:
- historical_data = geoglows.streamflow.historic_simulation(reach_id)
- else:
- # Create the geoglows cache directory if it doesn't exist
- geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
- if not os.path.exists(geoglows_cache_path):
- os.makedirs(geoglows_cache_path)
-
- # Check if the historical simulation data is already cached
- if os.path.exists(
- os.path.join(
- geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
- )
- ):
- historical_data = pd.read_csv(
- os.path.join(
- geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
- ),
- index_col=0,
- )
- historical_data.index = pd.to_datetime(historical_data.index)
- else:
- historical_data = geoglows.streamflow.historic_simulation(reach_id)
- historical_data.to_csv(
- os.path.join(
- geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
- )
- )
+ historical_data = geoglows.data.retro_daily(reach_id)
+ # Get the historical simulation data for the given reach ID - TODO: Do we for sure want to cache the historical data?
+ # I am reading the observed data that we queried earlier instead of caching it
+ # historical_data = None
+
+ # if cache_path is None:
+ # historical_data = geoglows.streamflow.historic_simulation(reach_id)
+ # else:
+ # # Create the geoglows cache directory if it doesn't exist
+ # geoglows_cache_path = os.path.join(cache_path, "geoglows_cache")
+ # if not os.path.exists(geoglows_cache_path):
+ # os.makedirs(geoglows_cache_path)
+
+ # # Check if the historical simulation data is already cached
+ # if os.path.exists(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # )
+ # ):
+ # historical_data = pd.read_csv(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # ),
+ # index_col=0,
+ # )
+ # historical_data.index = pd.to_datetime(historical_data.index)
+ # else:
+ # historical_data = geoglows.streamflow.historic_simulation(reach_id)
+ # historical_data.to_csv(
+ # os.path.join(
+ # geoglows_cache_path, f"{reach_id}_historic_simulation.csv"
+ # )
+ # )
+ # Drop 'ensemble_52' column if it exists - not necessary but we don't need it
+ station_ensembles.drop(columns=['ensemble_52'], inplace=True, errors='ignore')
+
+ # Drop all rows with any NaN values - again not necessary but we can drop them because we don't need it
+ station_ensembles.dropna(inplace=True)
 
  # Correct the forecast bias in the station ensembles
- station_ensembles = bias_correct_forecast(
+ station_ensembles = geoglows.bias.correct_forecast(
  station_ensembles, historical_data, prepared_od
  )
+
  # Correct the forecast bias in the station stats
- station_stats = bias_correct_forecast(
+ station_stats = geoglows.bias.correct_forecast(
  station_stats, historical_data, prepared_od
  )
+ #This is to clean out any infinite values that may have occurred during bias correction
+ station_ensembles = station_ensembles.replace([np.inf, -np.inf], np.nan)
+ station_ensembles = station_ensembles.interpolate(axis=0, limit_direction='both')
+
+ # Fill any remaining NaNs (e.g., at column ends)
+ station_ensembles = station_ensembles.ffill(axis=0).bfill(axis=0)
+ station_stats = station_stats.replace([np.inf, -np.inf], np.nan)
+ station_stats = station_stats.interpolate(axis=0, limit_direction='both')
+
+ # Fill any remaining NaNs (e.g., at column ends)
+ station_stats = station_stats.ffill(axis=0).bfill(axis=0)
 
  # Return the bias-corrected station ensembles and station stats
  return station_ensembles, station_stats
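This hunk moves get_bias_corrected_data onto the geoglows v2 API (geoglows.data.retro_daily for the retrospective simulation, geoglows.bias.correct_forecast for the correction) and, per the added comment, scrubs any infinite values the correction may leave behind before returning. A minimal sketch of that cleanup pattern on a toy frame, separate from the geoglows output:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"ens_1": [1.0, np.inf, 3.0, np.nan],
                       "ens_2": [-np.inf, 2.0, np.nan, 4.0]})

    # Replace +/-inf with NaN, interpolate down each column, then fill
    # whatever is still missing at the column ends.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.interpolate(axis=0, limit_direction="both")
    df = df.ffill(axis=0).bfill(axis=0)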
@@ -1,6 +1,5 @@
  import os
  import sys
- import glob
  import pandas as pd
  import rpy2.robjects as ro
  from rpy2.robjects import pandas2ri
@@ -44,37 +43,16 @@ STATION_IDS = [
  "S49_S",
  ] # Added these stations. They seemed to be missing.
 
- REACH_IDS = {
- "S191_S": 13082707,
- "S65E_S": 13082699,
- "S65EX1_S": 13082699,
- "S84_S": 13082700,
- "S154_C": 13082716,
- "S71_S": 13082743,
- "S72_S": 13082727,
- "FISHP": 13082756,
- "S308.DS": 13082736,
- "L8.441": 13082747,
- "S133_P": 13082709,
- "S127_C": 13082716,
- "S127_P": 13082716,
- "S129_C": 13082727,
- "S135_C": 13082725,
- "S2_P": 13082783,
- "S3_P": 13082809,
- "S4_P": 13082806,
- "S351_S": 13082804,
- "S352_S": 13082762,
- "S354_S": 13082809,
- "S129 PMP_P": 13082727,
- "S135 PMP_P": 13082725,
- "S77_S": 13082767,
- "INDUST": 13082806,
- "S79_S": 13082791,
- "S80_S": 13082718,
- "S40_S": 13082797,
- "S49_S": 13082696,
- }
+ INFLOW_IDS = [
+ 750059718, 750043742, 750035446, 750034865, 750055574, 750053211,
+ 750050248, 750065049, 750064453, 750049661, 750069195, 750051436,
+ 750068005, 750063868, 750069782, 750072741
+ ]
+ OUTFLOW_IDS = [750053809, 750057949]
+ MATCHED_IDS = [750052624, 750049656, 750057357,
+ 750038427, 750051428, 750068601, 750058536, 750038416,
+ 750050259, 750045514, 750053213, 750028935]
+
 
  SECONDS_IN_HOUR = 3600
  SECONDS_IN_DAY = 86400
@@ -140,7 +118,7 @@ def get_reach_id(latitude: float, longitude: float):
  Returns:
  (int): The reach id of the given latitude/longitude
  """
- reach_data = geoglows.streamflow.latlon_to_reach(latitude, longitude)
+ reach_data = geoglows.streams.latlon_to_reach(latitude, longitude)
 
  if "error" in reach_data:
  raise Exception(reach_data["error"])
@@ -159,8 +137,8 @@ def get_flow_forecast_ensembles(reach_id: str, forecast_date: str):
  Returns:
  (pandas.core.frame.DataFrame): The 52 ensemble flow forecasts.
  """
- return geoglows.streamflow.forecast_ensembles(
- reach_id=reach_id, forecast_date=forecast_date, endpoint=GEOGLOWS_ENDPOINT
+ return geoglows.data.forecast_ensembles(
+ river_id=reach_id, date=forecast_date
  )
 
 
@@ -176,14 +154,15 @@ def get_flow_forecast_stats(reach_id: str, forecast_date: str):
  Returns:
  (pandas.core.frame.DataFrame): The forecast stats
  """
- return geoglows.streamflow.forecast_stats(
- reach_id=reach_id, forecast_date=forecast_date, endpoint=GEOGLOWS_ENDPOINT
+ return geoglows.data.forecast_stats(
+ river_id=reach_id, date=forecast_date
  )
 
 
  def ensembles_to_csv(
  workspace: str,
- station_id: str,
+ flow_type: str,
+ reach_id: str,
  ensembles: pd.core.frame.DataFrame,
  stats: pd.core.frame.DataFrame,
  ):
@@ -202,7 +181,7 @@ def ensembles_to_csv(
  data.
  """
  # Get the path to the file that will be written
- file_name = f"{station_id}_FLOW_cmd_geoglows.csv"
+ file_name = f"{reach_id}_{flow_type}_cmd_geoglows.csv"
  file_path = os.path.join(workspace, file_name)
 
  # Format DataFrames for LOONE
@@ -234,8 +213,8 @@ def _format_ensembles_DataFrame(dataframe: pd.core.frame.DataFrame):
  DataFrame.
  """
  # Remove high resolution columns (ensemble 52)
- if "ensemble_52_m^3/s" in dataframe.columns:
- dataframe.drop(columns="ensemble_52_m^3/s", inplace=True)
+ if "ensemble_52" in dataframe.columns:
+ dataframe.drop(columns="ensemble_52", inplace=True)
 
  # Remove rows with null values
  dataframe.dropna(axis="index", inplace=True)
@@ -284,8 +263,8 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  DataFrame.
  """
  # Remove high resolution columns (ensemble 52, high_res_m^3/s)
- if "high_res_m^3/s" in dataframe.columns:
- dataframe.drop(columns="high_res_m^3/s", inplace=True)
+ if "high_res" in dataframe.columns:
+ dataframe.drop(columns="high_res", inplace=True)
 
  # Remove rows with null values
  dataframe.dropna(axis="index", inplace=True)
@@ -300,28 +279,28 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  dataframe.clip(0, inplace=True)
 
  # Max Column (Max)
- column_max = dataframe[["flow_max_m^3/s"]].copy()
+ column_max = dataframe[["flow_max"]].copy()
  column_max = column_max.groupby([column_max.index]).max()
 
  # 75th Percentile Column (Average)
- column_75percentile = dataframe[["flow_75%_m^3/s"]].copy()
+ column_75percentile = dataframe[["flow_75p"]].copy()
  column_75percentile = column_75percentile.groupby(
  [column_75percentile.index]
  ).mean()
 
  # Average Column (Weighted Average)
- column_average = dataframe[["flow_avg_m^3/s"]].copy()
+ column_average = dataframe[["flow_avg"]].copy()
  column_average.transform(lambda x: x / 8)
  column_average = column_average.groupby([column_average.index]).sum()
 
  # 25th Percentile Column (Average)
- column_25percentile = dataframe[["flow_25%_m^3/s"]].copy()
+ column_25percentile = dataframe[["flow_25p"]].copy()
  column_25percentile = column_25percentile.groupby(
  [column_25percentile.index]
  ).mean()
 
  # Min Column (Min)
- column_min = dataframe[["flow_min_m^3/s"]].copy()
+ column_min = dataframe[["flow_min"]].copy()
  column_min = column_min.groupby([column_min.index]).min()
 
  # Convert values in each column from m^3/h to m^3/d
@@ -338,17 +317,17 @@ def _format_stats_DataFrame(dataframe: pd.core.frame.DataFrame):
  # Append modified columns into one pandas DataFrame
  dataframe_result = pd.DataFrame()
  dataframe_result.index = dataframe.groupby([dataframe.index]).mean().index
- dataframe_result["flow_max_m^3/d"] = column_max["flow_max_m^3/s"].tolist()
+ dataframe_result["flow_max_m^3/d"] = column_max["flow_max"].tolist()
  dataframe_result["flow_75%_m^3/d"] = column_75percentile[
- "flow_75%_m^3/s"
+ "flow_75p"
  ].tolist()
  dataframe_result["flow_avg_m^3/d"] = column_average[
- "flow_avg_m^3/s"
+ "flow_avg"
  ].tolist()
  dataframe_result["flow_25%_m^3/d"] = column_25percentile[
- "flow_25%_m^3/s"
+ "flow_25p"
  ].tolist()
- dataframe_result["flow_min_m^3/d"] = column_min["flow_min_m^3/s"].tolist()
+ dataframe_result["flow_min_m^3/d"] = column_min["flow_min"].tolist()
 
  # Format datetimes to just dates
  dataframe_result.index = dataframe_result.index.strftime("%Y-%m-%d")
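The "Weighted Average" column above is meant to reproduce a daily mean in two steps, on the assumption that a day holds eight 3-hour forecast steps: divide each 3-hourly flow_avg sample by 8, then sum the samples that share a date. A small worked check of that identity with made-up numbers:

    # Eight 3-hourly averages for one day (illustrative values)
    three_hourly = [10.0, 12.0, 11.0, 9.0, 8.0, 10.0, 13.0, 11.0]

    # Dividing each sample by 8 and summing equals the plain daily mean
    weighted_sum = sum(v / 8 for v in three_hourly)
    assert weighted_sum == sum(three_hourly) / len(three_hourly)  # both are 10.5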
@@ -383,48 +362,111 @@ def main(
  cache_path (str): The path to the cache directory for geoglows data.
  Should hold a directory named geoglows_cache that holds the cached files. Use None to not use a cache.
  """
- # Local Variables
- reach_ids = {}
-
- # Get the latitude/longitude for each station
- station_locations = get_stations_latitude_longitude(station_ids)
-
- # Check for any download failures
- for station_id in station_ids:
- if station_id in REACH_IDS.keys():
- reach_ids[station_id] = REACH_IDS[station_id]
- elif station_id not in station_locations.keys():
- raise Exception(
- "Error: The longitude and latitude could not be downloaded "
- f"for station {station_id}"
- )
-
- # Get station reach ids
- if station_id not in REACH_IDS.keys():
- for station_id in station_locations.keys():
- location = station_locations[station_id]
- try:
- reach_ids[station_id] = get_reach_id(location[0], location[1])
- except Exception as e:
- print(
- "Error: Failed to get reach id for station "
- f"{station_id} ({str(e)})"
- )
+ # # Local Variables
+ # reach_ids = {}
+
+ # # Get the latitude/longitude for each station
+ # station_locations = get_stations_latitude_longitude(station_ids)
+
+ # # Check for any download failures
+ # for station_id in station_ids:
+ # if station_id in REACH_IDS.keys():
+ # reach_ids[station_id] = REACH_IDS[station_id]
+ # elif station_id not in station_locations.keys():
+ # raise Exception(
+ # "Error: The longitude and latitude could not be downloaded "
+ # f"for station {station_id}"
+ # )
+
+ # # Get station reach ids
+ # if station_id not in REACH_IDS.keys():
+ # for station_id in station_locations.keys():
+ # location = station_locations[station_id]
+ # try:
+ # reach_ids[station_id] = get_reach_id(location[0], location[1])
+ # except Exception as e:
+ # print(
+ # "Error: Failed to get reach id for station "
+ # f"{station_id} ({str(e)})"
+ # )
 
  # Get the flow data for each station
- for station_id in reach_ids.keys():
- reach_id = reach_ids[station_id]
+ stations_inflow_by_comid = {
+ 750072741: "S65E_S", # TODO: Should this be S65E_total or S65E_S? - this is a station we definitely want
+ 750069782: "S84_S", #
+ # 750053211: "S129_C", # TODO: Should this be S129_C or S129_PMP_P? - Also right now it is all 0s
+ # 750035446: "S133_P", # TODO: Should this be S133_P or S133_C? - Also right now it is all 0s
+ 750064453: "S154_C", # This is primarily 0s
+ }
+
+
+ for reach_id in INFLOW_IDS:
+ station_ensembles = get_flow_forecast_ensembles(
+ reach_id, forecast_date
+ )
+ station_stats = get_flow_forecast_stats(reach_id, forecast_date)
+
+ if bias_corrected:
+ if reach_id in stations_inflow_by_comid:
+ station_id = stations_inflow_by_comid[reach_id]
+ observed_data_path = os.path.join(observed_data_dir, f"{station_id}_FLOW_cmd.csv")
+ # if observed_data_list:
+ # observed_data_path = observed_data_list[0]
+ station_ensembles, station_stats = get_bias_corrected_data(
+ station_id,
+ reach_id,
+ observed_data_path,
+ station_ensembles,
+ station_stats,
+ cache_path,
+ )
+
+ ensembles_to_csv(
+ workspace,
+ "INFLOW",
+ reach_id,
+ station_ensembles,
+ station_stats,
+ )
+ for reach_id in OUTFLOW_IDS:
  station_ensembles = get_flow_forecast_ensembles(
  reach_id, forecast_date
  )
  station_stats = get_flow_forecast_stats(reach_id, forecast_date)
 
+ ensembles_to_csv(
+ workspace,
+ "OUTFLOW",
+ reach_id,
+ station_ensembles,
+ station_stats,
+ )
+ for reach_id in MATCHED_IDS:
+ stations_matched_by_comid = {
+ 750068601: "S71_S",
+ 750052624: "S135_C", # TODO: Should this be S135_C or S135_P?
+ # 750052624: "S308", # NOTE: Same COMID as S135 — only one key allowed!
+ 750053213: "FISHP",
+ 750038416: "S77_S",
+ 750050259: "S79_TOT",
+ 750045514: "S80_S",
+ 750058536: "S72_S",
+ 750051428: "S49_S",
+ # 750038427: "S40",
+ 750057357: "S191_S",
+ 750028935: "S127_C", #TODO: Should this be S127_C or S127_P?
+ }
+
+ station_ensembles = get_flow_forecast_ensembles(
+ reach_id, forecast_date
+ )
+ station_stats = get_flow_forecast_stats(reach_id, forecast_date)
  if bias_corrected:
- observed_data_list = glob.glob(
- os.path.join(observed_data_dir, f"{station_id}*FLOW_cmd.csv")
- )
- if observed_data_list:
- observed_data_path = observed_data_list[0]
+ if reach_id in stations_matched_by_comid:
+ station_id = stations_matched_by_comid[reach_id]
+ observed_data_path = os.path.join(observed_data_dir, f"{station_id}_FLOW_cmd.csv")
+ # if observed_data_list:
+ # observed_data_path = observed_data_list[0]
  station_ensembles, station_stats = get_bias_corrected_data(
  station_id,
  reach_id,
@@ -436,7 +478,8 @@ def main(
 
  ensembles_to_csv(
  workspace,
- station_id,
+ "MATCHED",
+ reach_id,
  station_ensembles,
  station_stats,
  )
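The rewritten main() above runs the same four steps for each reach in INFLOW_IDS, OUTFLOW_IDS, and MATCHED_IDS: fetch the forecast ensembles and stats, bias-correct only when a COMID-to-station mapping points at observed data, and write the result with a flow-type prefix. A condensed sketch of that shared shape; process_reaches is a hypothetical helper built on the module's own functions, not part of the package:

    import os

    def process_reaches(reach_ids, flow_type, comid_to_station, workspace,
                        observed_data_dir, forecast_date, bias_corrected, cache_path):
        """Hypothetical helper mirroring the loop bodies above."""
        for reach_id in reach_ids:
            station_ensembles = get_flow_forecast_ensembles(reach_id, forecast_date)
            station_stats = get_flow_forecast_stats(reach_id, forecast_date)

            # Bias-correct only when observed data exists for this COMID
            if bias_corrected and reach_id in comid_to_station:
                station_id = comid_to_station[reach_id]
                observed_path = os.path.join(observed_data_dir, f"{station_id}_FLOW_cmd.csv")
                station_ensembles, station_stats = get_bias_corrected_data(
                    station_id, reach_id, observed_path,
                    station_ensembles, station_stats, cache_path,
                )

            ensembles_to_csv(workspace, flow_type, reach_id,
                             station_ensembles, station_stats)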
@@ -0,0 +1,127 @@
+ import os
+ from herbie import FastHerbie
+ from datetime import datetime
+ import pandas as pd
+ from retry_requests import retry
+ import warnings
+
+
+ def generate_wind_forecasts(output_dir):
+ # Ensure output directory exists
+ warnings.filterwarnings("ignore", message="Will not remove GRIB file because it previously existed.")
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Define points of interest
+ points = pd.DataFrame({
+ "longitude": [-80.7934, -80.9724, -80.7828, -80.7890],
+ "latitude": [27.1389, 26.9567, 26.8226, 26.9018]
+ })
+
+ # Station-specific file and column names
+ file_map = {
+ "Point_1": ("L001_WNDS_MPH_predicted.csv", "L001_WNDS_MPH"),
+ "Point_2": ("L005_WNDS_MPH_predicted.csv", "L005_WNDS_MPH"),
+ "Point_3": ("L006_WNDS_MPH_predicted.csv", "L006_WNDS_MPH"),
+ "Point_4": ("LZ40_WNDS_MPH_predicted.csv", "LZ40_WNDS_MPH")
+ }
+
+ today_str = datetime.today().strftime('%Y-%m-%d 00:00')
+ FH = FastHerbie([today_str], model="ifs", fxx=range(0, 360, 3))
+ dfs = []
+
+ variables = {
+ "10u": "10u",
+ "10v": "10v",
+ "2t": "2t",
+
+ }
+
+ # Loop through points and extract data
+ for index, point in points.iterrows():
+ print(f"\nProcessing Point {index + 1}: ({point.latitude}, {point.longitude})")
+
+ point_df = pd.DataFrame({
+ "longitude": [point.longitude],
+ "latitude": [point.latitude]
+ })
+
+ for var_key, var_name in variables.items():
+ print(f" Variable: {var_key}")
+
+ # Download and load dataset
+ FH.download(f":{var_key}")
+ ds = FH.xarray(f":{var_key}", backend_kwargs={"decode_timedelta": True})
+
+ # Extract point data
+ dsi = ds.herbie.pick_points(point_df, method="nearest")
+
+ # Get actual variable name
+ if var_name == "10u":
+ var_name_actual = "u10" # Map 10u to u10
+ elif var_name == "10v":
+ var_name_actual = "v10" # Map 10v to v10
+ elif var_name == "2t":
+ var_name_actual = "t2m" #TODO: check that this is correct
+
+ # Convert to DataFrame
+ time_series = dsi[var_name_actual].squeeze()
+ df = time_series.to_dataframe().reset_index()
+
+ # Handle datetime columns
+ if "valid_time" in df.columns:
+ df = df.rename(columns={"valid_time": "datetime"})
+ elif "step" in df.columns and "time" in dsi.coords:
+ df["datetime"] = dsi.time.values[0] + df["step"]
+
+ # Retain necessary columns
+ df = df[["datetime", var_name_actual]].drop_duplicates()
+ dfs.append((index, var_name_actual, df))
+
+ # Merge and process data per point
+ results = {}
+ for point_index in range(len(points)):
+ u_df = [df for idx, name, df in dfs if idx == point_index and name == "u10"][0]
+ v_df = [df for idx, name, df in dfs if idx == point_index and name == "v10"][0]
+ merged = u_df.merge(v_df, on="datetime", how="outer")
+
+ # Compute wind speed and correction
+ merged["wind_speed"] = (merged["u10"] ** 2 + merged["v10"] ** 2) ** 0.5
+ merged["wind_speed_corrected"] = 0.4167 * merged["wind_speed"] + 4.1868
+ merged["wind_speed_corrected"] = merged["wind_speed_corrected"] * 2.23694 # m/s to mph
+
+ results[f"Point_{point_index + 1}"] = merged
+
+ # Save outputs with station-specific column names
+ for key, (filename, new_col_name) in file_map.items():
+ df = results[key].copy()
+ df = df[["datetime", "wind_speed_corrected"]].rename(columns={
+ "wind_speed_corrected": new_col_name,
+ "datetime": "date"
+ })
+ filepath = os.path.join(output_dir, filename)
+ df.to_csv(filepath, index=False)
+ # Save 2-meter air temperature data
+ airt_file_map = {
+ "Point_1": "L001_AIRT_Degrees Celsius_forecast.csv",
+ "Point_2": "L005_AIRT_Degrees Celsius_forecast.csv",
+ "Point_3": "L006_AIRT_Degrees Celsius_forecast.csv",
+ "Point_4": "LZ40_AIRT_Degrees Celsius_forecast.csv"
+ }
+ airt_column_map = {
+ "Point_1": "L001_AIRT_Degrees Celsius",
+ "Point_2": "L005_AIRT_Degrees Celsius",
+ "Point_3": "L006_AIRT_Degrees Celsius",
+ "Point_4": "LZ40_AIRT_Degrees Celsius"
+ }
+
+ for key in airt_file_map:
+ point_index = int(key.split("_")[1]) - 1
+ df_airt = [df for idx, name, df in dfs if idx == point_index and name == "t2m"][0].copy()
+ df_airt["t2m"] = df_airt["t2m"] - 273.15 # Convert from Kelvin to Celsius
+ df_airt = df_airt.rename(columns={
+ "datetime": "date",
+ "t2m": airt_column_map[key]
+ })
+ filepath = os.path.join(output_dir, airt_file_map[key])
+ df_airt.to_csv(filepath, index=False)
+
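The new wind/temperature forecast script above combines the IFS u/v components into a wind speed, applies a linear correction (0.4167 * speed + 4.1868, coefficients taken from the source), converts m/s to mph, and shifts 2-m temperature from Kelvin to Celsius. A small worked example of that arithmetic with made-up inputs:

    # u/v wind components in m/s (illustrative values)
    u10, v10 = 3.0, 4.0

    wind_speed = (u10 ** 2 + v10 ** 2) ** 0.5   # 5.0 m/s
    corrected = 0.4167 * wind_speed + 4.1868    # 6.2703 m/s after the linear correction
    corrected_mph = corrected * 2.23694         # ~14.03 mph, as written to the *_WNDS_MPH files

    t2m_kelvin = 300.15
    t2m_celsius = t2m_kelvin - 273.15           # 27.0 degrees C, as written to the *_AIRT files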