loone-data-prep 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import sys
2
2
  from datetime import datetime
3
3
  from glob import glob
4
4
  from retry import retry
5
+ import os
5
6
  import pandas as pd
6
7
  from rpy2.robjects import r
7
8
  from rpy2.rinterface_lib.embedded import RRuntimeError
@@ -18,36 +19,92 @@ def get(
18
19
  date_max: str = DATE_NOW
19
20
  ) -> None:
20
21
  r_str = f"""
21
- # Load the required libraries
22
- library(dbhydroR)
22
+ download_flow_data <- function(workspace, dbkey, date_min, date_max)
23
+ {{
24
+ # Load the required libraries
25
+ library(dbhydroR)
26
+ library(dplyr)
23
27
 
24
- # Retrieve data for the dbkey
25
- data <- get_hydro(dbkey = "{dbkey}", date_min = "{date_min}", date_max = "{date_max}")
28
+ # Retrieve data for the dbkey
29
+ data <- get_hydro(dbkey = "{dbkey}", date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
30
+
31
+ # Check if data is empty or contains only the "date" column
32
+ if (ncol(data) <= 1) {{
33
+ cat("No data found for dbkey", "{dbkey}", "Skipping to the next dbkey.\n")
34
+ }}
35
+
36
+ # Give data.frame correct column names so it can be cleaned using the clean_hydro function
37
+ colnames(data) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
38
+
39
+ # Check if the data.frame has any rows
40
+ if (nrow(data) == 0)
41
+ {{
42
+ # No data given back, It's possible that the dbkey has reached its end date.
43
+ print(paste("Empty data.frame returned for dbkey", "{dbkey}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
44
+ return(list(success = FALSE, dbkey = "{dbkey}"))
45
+ }}
46
+
47
+ # Add a type and units column to data so it can be cleaned using the clean_hydro function
48
+ data$type <- "FLOW"
49
+ data$units <- "cfs"
50
+
51
+ # Get the station
52
+ station <- data$station[1]
53
+
54
+ # Clean the data.frame
55
+ data <- clean_hydro(data)
26
56
 
27
- # Check if data is empty or contains only the "date" column
28
- if (ncol(data) <= 1) {{
29
- cat("No data found for dbkey", "{dbkey}", "Skipping to the next dbkey.\n")
30
- }}
31
-
32
- # Multiply all columns except "date" column by 0.0283168466 * 86400 to convert Flow rate from cfs to m³/day
33
- data[, -1] <- data[, -1] * (0.0283168466 * 86400)
34
-
35
- # Extract the column names excluding the date column
36
- column_names <- names(data)[-1]
57
+ # Multiply all columns except "date" column by 0.0283168466 * 86400 to convert Flow rate from cfs to m³/day
58
+ data[, -1] <- data[, -1] * (0.0283168466 * 86400)
59
+
60
+ # Drop the " _FLOW_cfs" column
61
+ data <- data %>% select(-` _FLOW_cfs`)
62
+
63
+ # Sort the data by date
64
+ data <- data[order(data$date), ]
65
+
66
+ # Get the filename for the output CSV file
67
+ filename <- paste0(station, "_FLOW", "_{dbkey}_cmd.csv")
68
+
69
+ # Save data to a CSV file
70
+ write.csv(data, file = paste0("{workspace}/", filename))
37
71
 
38
- # Generate the filename based on the column names
39
- filename <- paste0( gsub(" ", "_", sub("_[^_]*$", "", paste(column_names, collapse = "_"))), "_{dbkey}_cmd.csv")
40
- # Save data to a CSV file
41
- write.csv(data, file = paste0("{workspace}/", filename))
72
+ # Print a message indicating the file has been saved
73
+ cat("CSV file", filename, "has been saved.\n")
42
74
 
43
- # Print a message indicating the file has been saved
44
- cat("CSV file", filename, "has been saved.\n")
45
-
46
- # Add a delay between requests
47
- Sys.sleep(1) # Wait for 1 second before the next iteration
75
+ # Add a delay between requests
76
+ Sys.sleep(1) # Wait for 1 second before the next iteration
77
+
78
+ # Return the station and dbkey to the python code
79
+ list(success = TRUE, station = station, dbkey = "{dbkey}")
80
+ }}
48
81
  """
49
82
 
50
83
  r(r_str)
84
+
85
+ # Call the R function to download the flow data
86
+ result = r.download_flow_data(workspace, dbkey, date_min, date_max)
87
+
88
+ # Check for failure
89
+ success = result.rx2("success")[0]
90
+
91
+ if not success:
92
+ return
93
+
94
+ # Get the station name for _reformat_flow_file()
95
+ station = result.rx2("station")[0]
96
+
97
+ # Reformat the flow data file to the expected layout
98
+ _reformat_flow_file(workspace, station, dbkey)
99
+
100
+ # Check if the station name contains a space
101
+ if " " in station:
102
+ # Replace space with underscore in the station name
103
+ station_previous = station
104
+ station = station.replace(" ", "_")
105
+
106
+ # Rename the file
107
+ os.rename(f"{workspace}/{station_previous}_FLOW_{dbkey}_cmd.csv", f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
51
108
 
52
109
  # column values are converted to cmd in R. This snippet makes sure column names are updated accordingly.
53
110
  file = glob(f'{workspace}/*FLOW*{dbkey}_cmd.csv')[0]
@@ -55,7 +112,44 @@ def get(
55
112
  df.columns = df.columns.astype(str).str.replace("_cfs", "_cmd")
56
113
  df.to_csv(file, index=False)
57
114
 
58
- if __name__ == "__main__":
59
- workspace = sys.argv[1].rstrip("/")
60
- dbkey = sys.argv[2]
61
- get(workspace, dbkey)
115
+
116
+ def _reformat_flow_file(workspace:str, station: str, dbkey: str):
117
+ '''
118
+ Reformat the flow data file to the expected layout.
119
+ Converts the format of the dates in the file to 'YYYY-MM-DD' then sorts the data by date.
120
+ Reads and writes to a .CSV file.
121
+
122
+ Args:
123
+ workspace (str): The path to the workspace directory.
124
+ station (str): The station name.
125
+ dbkey (str): The dbkey for the station.
126
+
127
+ Returns:
128
+ None
129
+ '''
130
+ # Read in the data
131
+ df = pd.read_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
132
+
133
+ # Grab only the columns we need
134
+ df = df[['date', f'{station}_FLOW_cfs']]
135
+
136
+ # Convert date column to datetime
137
+ df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
138
+
139
+ # Sort the data by date
140
+ df.sort_values('date', inplace=True)
141
+
142
+ # Renumber the index
143
+ df.reset_index(drop=True, inplace=True)
144
+
145
+ # Drop rows that are missing values for both the date and value columns
146
+ df = df.drop(df[(df['date'].isna()) & (df[f'{station}_FLOW_cfs'].isna())].index)
147
+
148
+ # Write the updated data back to the file
149
+ df.to_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
150
+
151
+
152
if __name__ == "__main__":
    # Command line usage: <script> <workspace> <dbkey>
    # A trailing "/" on the workspace path is removed before it is used.
    workspace, dbkey = sys.argv[1].rstrip("/"), sys.argv[2]
    get(workspace, dbkey)
loone_data_prep/utils.py CHANGED
@@ -4,6 +4,7 @@ import datetime
4
4
  import math
5
5
  from glob import glob
6
6
  from calendar import monthrange
7
+ import traceback
7
8
  import numpy as np
8
9
  import pandas as pd
9
10
  from retry import retry
@@ -635,8 +636,9 @@ def nutrient_prediction(
635
636
  predicted_column.columns = [column_name]
636
637
 
637
638
  prediction_columns.append(predicted_column)
638
- except RuntimeWarning:
639
- breakpoint()
639
+ except RuntimeWarning as e:
640
+ print(f"Unexpected RuntimeWarning: {str(e)}")
641
+ traceback.print_exc()
640
642
 
641
643
  # Concat individual ensemble columns together into one pandas DataFrame
642
644
  out_dataframe = pd.concat(objs=prediction_columns, axis="columns")
@@ -661,6 +663,75 @@ def nutrient_prediction(
661
663
  out_dataframe.to_csv(os.path.join(input_dir, f"{station}_PHOSPHATE_predicted.csv"))
662
664
 
663
665
 
666
def find_last_date_in_csv(workspace: str, file_name: str) -> "str | None":
    """
    Gets the most recent date from the last non-empty line of a .csv file.
    Assumes the file is formatted as a .csv file, encoded in UTF-8,
    and the rows in the file are sorted by date in ascending order.

    Only the tail of the file is read, so large files are handled cheaply.
    Trailing blank lines, Windows line endings, files consisting of a single
    line, and a date stored in the last column are all supported.

    Args:
        workspace (str): The directory where the file is located.
        file_name (str): The name of the file.

    Returns:
        str | None: The first value on the last line that parses as a
            YYYY-MM-DD date, or None if the file does not exist, cannot be
            read/decoded, or no date is found on its last line.
    """
    # Helper Functions
    def is_valid_date(date_string):
        try:
            datetime.datetime.strptime(date_string, '%Y-%m-%d')
        except ValueError:
            return False
        return True

    # Check that file exists
    file_path = os.path.join(workspace, file_name)
    if not os.path.exists(file_path):
        return None

    # Read backwards in chunks until the last non-empty line is fully buffered
    chunk_size = 4096
    try:
        with open(file_path, 'rb') as file:
            file.seek(0, os.SEEK_END)
            position = file.tell()
            tail = b''

            while position > 0:
                step = min(chunk_size, position)
                position -= step
                file.seek(position)
                tail = file.read(step) + tail

                # A newline left after trimming the line endings at the very
                # end means the whole last line is now in the buffer
                if b'\n' in tail.rstrip(b'\r\n'):
                    break

        # Decode only the complete last line, so a multi-byte character split
        # at a chunk boundary can never be decoded half-way
        last_line = tail.rstrip(b'\r\n').rpartition(b'\n')[2].decode()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file {file_name}: {e}")
        return None

    # Extract the date from the last line
    for value in last_line.split(','):
        # Tolerate surrounding whitespace and quoted values
        candidate = value.strip().strip('"')
        if is_valid_date(candidate):
            return candidate

    return None
718
+
719
+
720
def dbhydro_data_is_latest(date_latest: str) -> bool:
    """
    Checks whether the given date is the most recent date possible to get data from dbhydro.
    Can be used to check whether dbhydro data is up-to-date.

    The most recent date is taken to be yesterday (local time). A date of
    today or later is also reported as up to date, so that a file which
    already contains today's record does not trigger another download.

    Args:
        date_latest (str): The date (YYYY-MM-DD) of the most recent data of the dbhydro data you have

    Returns:
        bool: True if date_latest is yesterday or later, False otherwise

    Raises:
        ValueError: If date_latest is not in YYYY-MM-DD format.
    """
    date_latest_object = datetime.datetime.strptime(date_latest, "%Y-%m-%d").date()
    yesterday = datetime.datetime.now().date() - datetime.timedelta(days=1)
    return date_latest_object >= yesterday
733
+
734
+
664
735
  if __name__ == "__main__":
665
736
  if sys.argv[1] == "get_dbkeys":
666
737
  get_dbkeys(sys.argv[2].strip("[]").replace(" ", "").split(","), *sys.argv[3:])
@@ -1,34 +1,231 @@
1
1
  import sys
2
2
  import os
3
+ import requests
4
+ import uuid
5
+ from datetime import datetime
3
6
  from loone_data_prep.water_level_data import hydro
7
+ from loone_data_prep.flow_data.get_forecast_flows import get_stations_latitude_longitude
8
+ from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
9
+ import pandas as pd
4
10
 
11
# Today's date (YYYY-MM-DD), evaluated once when the module is imported
DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")

# Default download specification: output file name (without ".csv") mapped to
# the keyword arguments that are passed on to hydro.get()
D = {
    "LO_Stage": {
        "dbkeys": ["16022", "12509", "12519", "16265", "15611"],
        "datum": "NGVD29",
    },
    "LO_Stage_2": {
        "dbkeys": ["94832"],
        "date_min": "2024-04-30",
        "datum": "NAVD88",
    },
    "Stg_3ANW": {
        "dbkeys": ["LA369"],
        "date_min": "1972-01-01",
        "date_max": "2023-04-30",
        "datum": "NGVD29",
    },
    "Stg_2A17": {
        "dbkeys": ["16531"],
        "date_min": "1972-01-01",
        "date_max": "2023-04-30",
        "datum": "NGVD29",
    },
    "Stg_3A3": {
        "dbkeys": ["16532"],
        "date_min": "1972-01-01",
        "date_max": "2023-04-30",
        "datum": "NGVD29",
    },
    "Stg_3A4": {
        "dbkeys": ["16537"],
        "date_min": "1972-01-01",
        "date_max": "2023-04-30",
        "datum": "NGVD29",
    },
    "Stg_3A28": {
        "dbkeys": ["16538"],
        "date_min": "1972-01-01",
        "date_max": "2023-04-30",
        "datum": "NGVD29",
    },
}
14
22
 
15
23
 
16
24
def main(workspace: str, d: dict = D) -> dict:
    """
    Download (or incrementally update) the water level files described by `d`,
    then merge the 'L OKEE' data of LO_Stage_2.csv into LO_Stage.csv.

    For every entry of `d`:
      * if `{name}.csv` has no readable last date, all data is downloaded;
      * if its last date is already current, the download is skipped;
      * otherwise only the data after the last date is downloaded and appended.

    Args:
        workspace (str): Directory the .csv files are read from / written to.
        d (dict): Maps an output name to the keyword arguments for hydro.get().
            Each entry needs a "dbkeys" list; "datum" is optional.

    Returns:
        dict: {"success": message} when every file exists and the 'L OKEE'
            merge succeeded, otherwise {"error": message}.
    """
    missing_files = []
    failed_downloads = []  # Files whose update failed (the previous version of the file is kept)

    # Read before any download so only rows newer than this date get merged into LO_Stage.csv
    date_latest_lo_stage_2 = find_last_date_in_csv(workspace, "LO_Stage_2.csv")

    for name, params in d.items():
        file_name = f"{name}.csv"

        # Get the date of the latest data in the csv file
        date_latest = find_last_date_in_csv(workspace, file_name)

        if date_latest is None:
            # File does NOT already exist (or its last date could not be determined)
            print(f"Getting all water level data for {name}.")
            hydro.get(workspace, name, **params)
        elif dbhydro_data_is_latest(date_latest):
            print(f'Downloading of new water level data skipped for {name}. Data is already up to date.')
            continue
        elif not _append_new_water_level_data(workspace, name, params, date_latest):
            failed_downloads.append(file_name)

        if os.path.exists(os.path.join(workspace, file_name)):
            print(f"{name} downloaded successfully.")
        else:
            missing_files.append(file_name)
            print(f"{name} could not be downloaded after various tries.")

    # Merge data from old and new dbkey for station "L OKEE"
    convert_failure = not _merge_l_okee_stage(workspace, date_latest_lo_stage_2)

    # NOTE(review): a failed incremental update alone does not produce an error
    # result (the stale file still exists); it is only reported alongside the
    # other failures. Confirm this is the intended contract.
    if missing_files or convert_failure:
        messages = []

        if missing_files:
            messages.append(f"The following files could not be downloaded: {missing_files}")

        if failed_downloads:
            messages.append(f"Failed to download the latest data for the following files: {failed_downloads}")

        if convert_failure:
            messages.append("Failed to convert NAVD88 to NGVD29 for 'L OKEE' station.")

        return {"error": "\n".join(messages)}

    return {"success": "Completed water level data download."}


def _append_new_water_level_data(workspace: str, name: str, params: dict, date_latest: str) -> bool:
    """Download the data after `date_latest` and append it to `{name}.csv`; return False (file restored) on failure."""
    current_path = os.path.join(workspace, f"{name}.csv")
    backup_path = os.path.join(workspace, f"{name}_{uuid.uuid4()}.csv")

    # Temporarily rename current data file so it isn't over written by the download
    os.rename(current_path, backup_path)

    try:
        # Download only the new data
        print(f'Downloading new water level data for {name} starting from date {date_latest}')
        hydro.get(
            workspace,
            name,
            dbkeys=params['dbkeys'],
            date_min=date_latest,
            date_max=DATE_NOW,
            datum=params.get('datum', ''),  # hydro.get() treats "" as "no datum given"
        )

        # Read in the original data and the newly downloaded data
        df_original = pd.read_csv(backup_path, index_col=0)
        df_new = pd.read_csv(current_path, index_col=0)

        # For get_hydro() calls with multiple dbkeys, remove the row corresponding to the latest date
        # from the downloaded data. When get_hydro() is given multiple keys its returned data starts
        # from the date given instead of the day after like it does when given a single key.
        if len(params['dbkeys']) > 1:
            df_new = df_new[df_new['date'] != date_latest]

        # Merge the new data with the original data and write it out
        df_merged = pd.concat([df_original, df_new], ignore_index=True)
        df_merged.to_csv(current_path)

        # The backup is no longer needed
        os.remove(backup_path)
    except Exception as e:
        # Best effort: report the problem and put the previous file back in place
        print(f"Error occurred while downloading new water level data: {e}")

        if os.path.exists(current_path):
            os.remove(current_path)

        if os.path.exists(backup_path):
            os.rename(backup_path, current_path)

        return False

    return True


def _merge_l_okee_stage(workspace: str, date_latest_lo_stage_2) -> bool:
    """Convert new LO_Stage_2.csv values from NAVD88 to NGVD29 and write them into LO_Stage.csv; return True on success."""
    lo_stage_path = os.path.join(workspace, "LO_Stage.csv")
    lo_stage_2_path = os.path.join(workspace, "LO_Stage_2.csv")
    column = "L OKEE_STG_ft NGVD29"

    if not (os.path.exists(lo_stage_path) and os.path.exists(lo_stage_2_path)):
        print("Error: Missing LO_Stage.csv or LO_Stage_2.csv file, cannot convert and merge.")
        return False

    # Output Progress
    print("\nMerging data for station 'L OKEE'...")

    # Get the latitude and longitude of the "L OKEE" station
    lat_long_map = get_stations_latitude_longitude(["L OKEE"])
    latitude, longitude = lat_long_map["L OKEE"]

    # Load the LO_Stage_2.csv file
    df_lo_stage_2 = pd.read_csv(lo_stage_2_path, index_col="date")
    df_lo_stage_2.index = pd.to_datetime(df_lo_stage_2.index)

    # Output Progress
    print("Converting NAVD88 to NGVD29 for 'L OKEE's new dbkey...\n")

    # Use only the data that is not already in the LO_Stage.csv file
    if date_latest_lo_stage_2 is not None:
        date_start = datetime.strptime(date_latest_lo_stage_2, "%Y-%m-%d") + pd.DateOffset(days=1)
        df_lo_stage_2 = df_lo_stage_2.loc[date_start:]

    # Convert the stage values from NAVD88 to NGVD29; give up on the first failure
    converted = []
    for date, value in zip(df_lo_stage_2.index.tolist(), df_lo_stage_2[column].tolist()):
        try:
            converted.append((date, _convert_navd88_to_ngvd29(latitude, longitude, value, date.year)))
        except Exception as e:
            print(str(e))
            return False

    # Update the LO_Stage.csv file with the converted values
    df_lo_stage = pd.read_csv(lo_stage_path, index_col="date")
    df_lo_stage.index = pd.to_datetime(df_lo_stage.index)

    for date, value in converted:
        df_lo_stage.at[date, column] = value

    # Turn the date index back into a column and drop the stale row-number column (if present)
    df_lo_stage.reset_index(inplace=True)
    df_lo_stage.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

    # Save the updated LO_Stage.csv file
    df_lo_stage.to_csv(lo_stage_path)
    return True
31
172
 
173
+ def _convert_navd88_to_ngvd29(latitude: float, longitude: float, stage: float, year: int) -> float:
174
+ """Converts a stage value from NAVD88 to NGVD29 using NCAT.
175
+
176
+ Args:
177
+ latitude (float): The latitude of the station (in decimal degrees format).
178
+ longitude (float): The longitude of the station (in decimal degrees format).
179
+ stage (float): The stage (water level) value to convert (in feet).
180
+ year (int): The year when the stage value was recorded.
181
+
182
+ Returns:
183
+ float: The converted stage value in feet (NGVD29).
184
+ """
185
+ # Helper functions
186
+ def _feet_to_meters(feet: float) -> float:
187
+ return feet * 0.3048
188
+
189
+ def _meters_to_feet(meters: float) -> float:
190
+ return meters / 0.3048
191
+
192
+ # Check for NA value
193
+ if pd.isna(stage):
194
+ return stage
195
+
196
+ # Convert stage to meters
197
+ stage_meters = _feet_to_meters(stage)
198
+
199
+ # Make request
200
+ base_url = "https://geodesy.noaa.gov/api/ncat/llh"
201
+
202
+ params = {
203
+ "lat": latitude, # latitude
204
+ "lon": longitude, # longitude
205
+ "orthoHt": stage_meters, # orthometric height in NAVD88
206
+ "year": year, # year of observation
207
+ "inDatum": "NAD83(1986)", # Datum used for input latitude and longitude
208
+ "outDatum": "NAD83(1986)", # Datum used for output latitude and longitude
209
+ "inVertDatum": "NAVD88", # vertical datum of input orthometric height
210
+ "outVertDatum": "NGVD29", # vertical datum of output orthometric height (desired vertical datum)
211
+ }
212
+
213
+ try:
214
+ response = requests.get(base_url, params=params)
215
+ except Exception as e:
216
+ raise Exception(f"Error converting NAVD88 to NGVD29: {e}")
217
+
218
+ # Check for failure
219
+ if response.status_code != 200:
220
+ raise Exception(f"Error converting NAVD88 to NGVD29: {response.text}")
221
+
222
+ # Return converted stage in feet
223
+ try:
224
+ value = _meters_to_feet(float(response.json()["destOrthoht"]))
225
+ except Exception as e:
226
+ raise Exception(f"Error converting NAVD88 to NGVD29: {e}")
227
+
228
+ return value
32
229
 
33
230
  if __name__ == "__main__":
34
231
  workspace = sys.argv[1].rstrip("/")
@@ -3,7 +3,7 @@ from datetime import datetime
3
3
  from retry import retry
4
4
  from rpy2.robjects import r
5
5
  from rpy2.rinterface_lib.embedded import RRuntimeError
6
-
6
+ import pandas as pd
7
7
 
8
8
  DEFAULT_DBKEYS = ["16022", "12509", "12519", "16265", "15611"]
9
9
  DATE_NOW = datetime.now().strftime("%Y-%m-%d")
@@ -16,20 +16,88 @@ def get(
16
16
  dbkeys: list = DEFAULT_DBKEYS,
17
17
  date_min: str = "1950-01-01",
18
18
  date_max: str = DATE_NOW,
19
+ datum: str = "",
19
20
  **kwargs: str | list
20
21
  ) -> None:
22
+ # Get the type and units for the station
23
+ data_type = "STG"
24
+ units = "ft NGVD29"
25
+
26
+ if name in ["Stg_3A3", "Stg_2A17", "Stg_3A4", "Stg_3A28"]:
27
+ data_type = "GAGHT"
28
+ units = "feet"
29
+
21
30
  dbkeys_str = "\"" + "\", \"".join(dbkeys) + "\""
22
31
  r(
23
32
  f"""
24
33
  # Load the required libraries
25
34
  library(rio)
26
35
  library(dbhydroR)
27
- #Stage Data
28
- {name} = get_hydro(dbkey = c({dbkeys_str}), date_min = "{date_min}", date_max = "{date_max}")
36
+ library(dplyr)
37
+
38
+ # Stage Data
39
+ if ("{datum}" == "")
40
+ {{
41
+ {name} <- get_hydro(dbkey = c({dbkeys_str}), date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
42
+ }}
43
+
44
+ if (nchar("{datum}") > 0)
45
+ {{
46
+ {name} <- get_hydro(dbkey = c({dbkeys_str}), date_min = "{date_min}", date_max = "{date_max}", raw = TRUE, datum = "{datum}")
47
+ }}
48
+
49
+ # Give data.frame correct column names so it can be cleaned using the clean_hydro function
50
+ colnames({name}) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
51
+
52
+ # Check if the data.frame has any rows
53
+ if (nrow({name}) == 0)
54
+ {{
55
+ # No data given back, It's possible that the dbkey has reached its end date.
56
+ print(paste("Empty data.frame returned for dbkeys", "{dbkeys}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
57
+ return(list(success = FALSE, dbkey = "{dbkeys}"))
58
+ }}
59
+
60
+ # Get the station
61
+ station <- {name}$station[1]
62
+
63
+ # Add a type and units column to data so it can be cleaned using the clean_hydro function
64
+ {name}$type <- "{data_type}"
65
+ {name}$units <- "{units}"
66
+
67
+ # Clean the data.frame
68
+ {name} <- clean_hydro({name})
69
+
70
+ # Drop the " _STG_ft NGVD29" column
71
+ {name} <- {name} %>% select(-` _{data_type}_{units}`)
72
+
73
+ # Write the data to a csv file
29
74
  write.csv({name},file ='{workspace}/{name}.csv')
30
75
  """
31
76
  )
77
+
78
+ _reformat_water_level_file(workspace, name)
32
79
 
80
+ def _reformat_water_level_file(workspace: str, name: str):
81
+ # Read in the data
82
+ df = pd.read_csv(f"{workspace}/{name}.csv")
83
+
84
+ # Drop the "Unnamed: 0" column
85
+ df.drop(columns=['Unnamed: 0'], inplace=True)
86
+
87
+ # Convert date column to datetime
88
+ df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
89
+
90
+ # Sort the data by date
91
+ df.sort_values('date', inplace=True)
92
+
93
+ # Renumber the index
94
+ df.reset_index(drop=True, inplace=True)
95
+
96
+ # Drop rows that are missing all their values
97
+ df.dropna(how='all', inplace=True)
98
+
99
+ # Write the updated data back to the file
100
+ df.to_csv(f"{workspace}/{name}.csv")
33
101
 
34
102
  if __name__ == "__main__":
35
103
  args = [sys.argv[1].rstrip("/"), sys.argv[2]]