loone-data-prep 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,10 @@
 import sys
 import os
+import uuid
+from datetime import datetime, timedelta
+import pandas as pd
 from loone_data_prep.water_quality_data import wq
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
 D = {
@@ -21,18 +25,99 @@ D = {
 }
 
 
+DEFAULT_DATE_MIN = "1950-01-01"
+
+
 def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
+    failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)
     for name, params in d.items():
         print(f"Getting {name} for the following station IDs: {params['station_ids']}.")
-        wq.get(workspace, name, **params)
+
+        # Get the date of the latest data in the csv file for each station id
+        station_date_latest = {}
+        for station_id in params["station_ids"]:
+            station_date_latest[station_id] = find_last_date_in_csv(workspace, f"water_quality_{station_id}_{name}.csv")
+
+        # Get the water quality data
+        for station_id, date_latest in station_date_latest.items():
+            # File with data for this station/name combination does NOT already exist (or possibly some other error occurred)
+            if date_latest is None:
+                # Get all the water quality data for the name/station combination
+                print(f"Getting all {name} data for station ID: {station_id}.")
+                wq.get(workspace, name, [station_id])
+            else:
+                # Check whether we already have the latest data
+                if dbhydro_data_is_latest(date_latest):
+                    # Notify that the data is already up to date
+                    print(f'Downloading of new water quality data for test name: {name} station: {station_id} skipped. Data is already up to date.')
+                    continue
+
+                # Temporarily rename the current data file so it isn't overwritten
+                original_file_name = f"water_quality_{station_id}_{name}.csv"
+                original_file_name_temp = f"water_quality_{station_id}_{name}_{uuid.uuid4()}.csv"
+                os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+                try:
+                    # Get only the water quality data that is newer than the latest data in the csv file
+                    print(f"Downloading new water quality data for test name: {name} station ID: {station_id} starting from date: {date_latest}.")
+                    date_latest = (datetime.strptime(date_latest, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
+                    wq.get(workspace, name, [station_id], date_min=date_latest)
+
+                    # Data failed to download - it's possible the data's end date has been reached
+                    if not os.path.exists(os.path.join(workspace, original_file_name)):
+                        raise Exception(f"It's possible that the data for test name: {name} station ID: {station_id} has reached its end date.")
+
+                    # Read in the original data
+                    df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+
+                    # Calculate the days column for the newly downloaded data
+                    df_original_date_min = df_original['date'].min()
+                    wq._calculate_days_column(workspace, original_file_name, df_original_date_min)
+
+                    # Read in the newly downloaded data
+                    df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                    df_new.reset_index(inplace=True)
+
+                    # Merge the new data with the original data
+                    df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                    # Write out the merged data
+                    df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                    # Remove the renamed original data file
+                    os.remove(os.path.join(workspace, original_file_name_temp))
+                except Exception as e:
+                    # Notify of the error
+                    print(f"Error occurred while downloading new water quality data: {e}")
+
+                    # Remove the newly downloaded data file if it exists
+                    if os.path.exists(os.path.join(workspace, original_file_name)):
+                        os.remove(os.path.join(workspace, original_file_name))
+
+                    # Rename the renamed file back to its original name
+                    if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                        os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                    # Add the file name to the list of failed downloads
+                    failed_downloads.append(original_file_name)
+
+        # Check for missing files
         for station in params["station_ids"]:
             if not os.path.exists(os.path.join(workspace, f"water_quality_{station}_{name}.csv")):
                 missing_files.append(f"water_quality_{station}_{name}.csv")
                 print(f"{name} station ID: {station} could not be downloaded after various tries.")
 
-    if missing_files:
-        return {"error": f"The following files could not be downloaded: {missing_files}"}
+    if missing_files or failed_downloads:
+        error_string = ""
+
+        if missing_files:
+            error_string += f"The following files could not be downloaded: {missing_files}"
+
+        if failed_downloads:
+            error_string += f"\nThe following files could not be updated: {failed_downloads}"
+
+        return {"error": error_string}
 
     return {"success": "Completed water quality data download."}
 
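The incremental-update logic above hinges on two helpers imported from `loone_data_prep.utils` that this diff does not show. As context, here is a minimal sketch of what `find_last_date_in_csv` plausibly does, inferred from how the callers use it (an assumption for illustration, not the package's actual implementation): it returns the newest date in the file's date column as a "YYYY-MM-DD" string, or None when the file is missing or unreadable, which the callers treat as "download the full history".

```python
# Hypothetical sketch of find_last_date_in_csv; the real helper in
# loone_data_prep.utils may differ in details.
import os
from typing import Optional

import pandas as pd


def find_last_date_in_csv(workspace: str, file_name: str) -> Optional[str]:
    """Return the newest date ("YYYY-MM-DD") in the file's date column, or None."""
    path = os.path.join(workspace, file_name)
    if not os.path.exists(path):
        return None  # No file yet: the caller downloads everything
    try:
        dates = pd.to_datetime(pd.read_csv(path)["date"])
        return dates.max().strftime("%Y-%m-%d")
    except Exception:
        return None  # An unreadable or empty file is treated like a missing one
```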
@@ -1,6 +1,10 @@
 import sys
 import os
+import uuid
+from datetime import datetime, timedelta
+import pandas as pd
 from loone_data_prep.water_quality_data import wq
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
 D = {
@@ -28,16 +32,94 @@ D = {
 
 def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
+    failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)
     for name, params in d.items():
         print(f"Getting {name} for the following station IDs: {params['station_ids']}.")
-        wq.get(workspace, name, **params)
+
+        # Get the date of the latest data in the csv file for each station id
+        station_date_latest = {}
+        for station_id in params["station_ids"]:
+            station_date_latest[station_id] = find_last_date_in_csv(workspace, f"water_quality_{station_id}_{name}.csv")
+
+        # Get the water quality data
+        for station_id, date_latest in station_date_latest.items():
+            # File with data for this station/name combination does NOT already exist (or possibly some other error occurred)
+            if date_latest is None:
+                # Get all the water quality data for the name/station combination
+                print(f"Getting all {name} data for station ID: {station_id}.")
+                wq.get(workspace, name, [station_id])
+            else:
+                # Check whether we already have the latest data
+                if dbhydro_data_is_latest(date_latest):
+                    # Notify that the data is already up to date
+                    print(f'Downloading of new water quality data for test name: {name} station: {station_id} skipped. Data is already up to date.')
+                    continue
+
+                # Temporarily rename the current data file so it isn't overwritten
+                original_file_name = f"water_quality_{station_id}_{name}.csv"
+                original_file_name_temp = f"water_quality_{station_id}_{name}_{uuid.uuid4()}.csv"
+                os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+                try:
+                    # Get only the water quality data that is newer than the latest data in the csv file
+                    print(f"Downloading new water quality data for test name: {name} station ID: {station_id} starting from date: {date_latest}.")
+                    date_latest = (datetime.strptime(date_latest, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
+                    wq.get(workspace, name, [station_id], date_min=date_latest)
+
+                    # Data failed to download - it's possible the data's end date has been reached
+                    if not os.path.exists(os.path.join(workspace, original_file_name)):
+                        raise Exception(f"It's possible that the data for test name: {name} station ID: {station_id} has reached its end date.")
+
+                    # Read in the original data
+                    df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+
+                    # Calculate the days column for the newly downloaded data
+                    df_original_date_min = df_original['date'].min()
+                    wq._calculate_days_column(workspace, original_file_name, df_original_date_min)
+
+                    # Read in the newly downloaded data
+                    df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                    df_new.reset_index(inplace=True)
+
+                    # Merge the new data with the original data
+                    df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                    # Write out the merged data
+                    df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                    # Remove the renamed original data file
+                    os.remove(os.path.join(workspace, original_file_name_temp))
+                except Exception as e:
+                    # Notify of the error
+                    print(f"Error occurred while downloading new water quality data: {e}")
+
+                    # Remove the newly downloaded data file if it exists
+                    if os.path.exists(os.path.join(workspace, original_file_name)):
+                        os.remove(os.path.join(workspace, original_file_name))
+
+                    # Rename the renamed file back to its original name
+                    if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                        os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                    # Add the file name to the list of failed downloads
+                    failed_downloads.append(original_file_name)
+
+        # Check for missing files
         for station in params["station_ids"]:
             if not os.path.exists(os.path.join(workspace, f"water_quality_{station}_{name}.csv")):
                 missing_files.append(f"water_quality_{station}_{name}.csv")
                 print(f"{name} station ID: {station} could not be downloaded after various tries.")
 
-    if missing_files:
-        return {"error": f"The following files could not be downloaded: {missing_files}"}
+    if missing_files or failed_downloads:
+        error_string = ""
+
+        if missing_files:
+            error_string += f"The following files could not be downloaded: {missing_files}"
+
+        if failed_downloads:
+            error_string += f"\nThe following files could not be updated: {failed_downloads}"
+
+        return {"error": error_string}
 
     return {"success": "Completed water quality data download."}
 
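For reference, both water quality scripts signal failure through the returned dict rather than by raising: files that never downloaded and files that exist but could not be updated are folded into one error string. A short usage sketch; the workspace path is a placeholder:

```python
# Usage sketch - the workspace path below is a placeholder.
result = main("/path/to/workspace")

if "error" in result:
    # Missing files and failed updates are reported in the same string
    print(result["error"])
else:
    print(result["success"])  # "Completed water quality data download."
```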
@@ -70,6 +70,50 @@ def get(
     )
 
 
+def _calculate_days_column(workspace: str, file_name: str, date_min: str):
+    """
+    Calculates the values that should be in the "days" column of the water quality data CSV file
+    based on the given date_min and writes the updated data frame back to the CSV file.
+
+    Args:
+        workspace (str): The path to the workspace directory.
+        file_name (str): The name of the water quality data CSV file.
+        date_min (str): The minimum date that the "days" column values should be calculated from. Should be in format "YYYY-MM-DD".
+    """
+    r(
+        f"""
+        # Import necessary libraries
+        library(lubridate)
+
+        # Read the CSV file
+        df <- read.csv("{workspace}/{file_name}", check.names = FALSE)
+
+        # Drop the "X" column that R adds when reading CSV files
+        df <- df[,-1]
+
+        # Get date_min as an object with the correct timezone
+        date_min_object <- as.POSIXct("{date_min}", tz = "UTC")
+        date_min_tz <- format(with_tz(date_min_object, tzone = "America/New_York"), "%Z")
+        date_min_object <- as.POSIXct("{date_min}", tz = date_min_tz)
+
+        # Calculate each value in the days column based on the date_min
+        for(i in 1:nrow(df))
+        {{
+            # Get the current row's date as an object with the correct timezone
+            date <- as.POSIXct(df$date[i], tz = "UTC")
+            date_tz <- format(with_tz(date, tzone = "America/New_York"), "%Z")
+            date <- as.POSIXct(df$date[i], tz = date_tz)
+
+            # Days from date_min to the row's date, plus date_min's day of the month
+            df$days[i] <- as.integer(difftime(date, date_min_object, units = "days")) + as.integer(format(date_min_object, "%d"))
+        }}
+
+        # Write the updated data frame back to the CSV file
+        write.csv(df, file = "{workspace}/{file_name}", row.names = FALSE)
+        """ # noqa: E501
+    )
+
+
 if __name__ == "__main__":
     args = [sys.argv[1].rstrip("/"), sys.argv[2]]
     if len(sys.argv) >= 4:
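The "days" arithmetic in that R snippet reduces to: whole days from date_min to each row's date, plus date_min's day of the month. For readers who don't work in R, a pandas equivalent of just that computation (illustrative only; it skips the R code's America/New_York timezone normalization):

```python
import pandas as pd


def calculate_days_column(df: pd.DataFrame, date_min: str) -> pd.DataFrame:
    """Illustrative pandas version of the R 'days' computation above."""
    date_min_ts = pd.Timestamp(date_min)  # date_min is "YYYY-MM-DD"
    dates = pd.to_datetime(df["date"])
    # Whole days from date_min to each row's date, plus date_min's day of the month
    df["days"] = (dates - date_min_ts).dt.days + date_min_ts.day
    return df
```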
@@ -1,6 +1,10 @@
 import sys
 from glob import glob
+import uuid
+import os
+import pandas as pd
 from loone_data_prep.weather_data import weather
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
 D = {
@@ -14,14 +18,133 @@ D = {
 }
 
 
-def main(workspace: str, d: dict = D) -> dict:
+DBKEY_STATIONS = {
+    "16021": "L001",
+    "12515": "L005",
+    "12524": "L006",
+    "13081": "LZ40",
+    "UT736": "L001",
+    "VM675": "L005",
+    "UT743": "L006",
+    "UT748": "LZ40",
+    "16031": "L001",
+    "12518": "L005",
+    "12527": "L006",
+    "16267": "LZ40",
+    "16025": "L001",
+    "12516": "L005",
+    "12525": "L006",
+    "15649": "LZ40",
+    "16024": "L001",
+    "12512": "L005",
+    "12522": "L006",
+    "13080": "LZ40",
+    "16027": "L001",
+    "12514": "L005",
+    "12911": "L006",
+    "13078": "LZ40",
+    "16023": "L001",
+    "12510": "L005",
+    "12520": "L006",
+    "13076": "LZ40",
+}
+
+def main(workspace: str, d: dict = D, dbkey_stations: dict = DBKEY_STATIONS) -> dict:
+    """
+    Retrieves all weather data used by LOONE. When the dbkey_stations argument is provided,
+    the function downloads only the latest data it doesn't already have for the dbkeys in the d and dbkey_stations arguments.
+    Otherwise, it downloads all the data for the dbkeys in the d argument.
+
+    Args:
+        workspace (str): Path to the workspace where data will be downloaded.
+        d (dict): A dictionary of data type keys and dict values that hold keyword arguments to be used with weather_data.weather.get().
+            Valid keys are 'RAIN', 'ETPI', 'H2OT', 'RADP', 'RADT', 'AIRT', and 'WNDS'.
+        dbkey_stations (dict): Dictionary of dbkeys mapped to their station's name.
+    """
     missing_files = []
+    failed_downloads = []  # List of (data type name, file name) tuples that the script failed to get the latest data for (but the files still exist)
+
+    # Get the data for each data type
     for name, params in d.items():
-        print(f"Getting {name} for the following dbkeys: {params['dbkeys']}.")
-        weather.get(workspace, name, **params)
+
+        # Get the data for each dbkey individually for this data type
+        for dbkey in params['dbkeys']:
+            # Get the file name of the current file being downloaded
+            station = dbkey_stations[dbkey]
+            date_units_file, _ = weather._get_file_header_data_units(name)
+            original_file_name = ""
+            if name in ['RADP', 'RADT']:
+                original_file_name = f"{station}_{name}.csv"
+            else:
+                original_file_name = f"{station}_{name}_{date_units_file}.csv"
+
+            # Get the date of the latest data in the csv file
+            date_latest = find_last_date_in_csv(workspace, original_file_name)
+
+            # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+            if date_latest is None:
+                print(f"Getting all {name} data for the following dbkey: {dbkey}.")
+                weather.get(workspace, name, dbkeys=[dbkey])
+                continue
+
+            # Check whether the latest data is already up to date.
+            if dbhydro_data_is_latest(date_latest):
+                # Notify that the data is already up to date
+                print(f'Downloading of new {name} data skipped for dbkey {dbkey}. Data is already up to date.')
+                continue
+
+            # Temporarily rename the current data file so it isn't overwritten
+            original_file_name_temp = original_file_name.replace(".csv", f"_{uuid.uuid4()}.csv")
+            os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+            try:
+                # Download only the new data
+                print(f'Downloading new {name} data for dbkey {dbkey} starting from date {date_latest}.')
+                weather.get(workspace, name, dbkeys=[dbkey], date_min=date_latest)
+
+                # Data failed to download - it's possible the data's end date has been reached
+                if not os.path.exists(os.path.join(workspace, original_file_name)):
+                    raise Exception(f"It's possible that the data for {name} dbkey {dbkey} has reached its end date.")
+
+                # Read in the original data and the newly downloaded data
+                df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+                df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+
+                # Merge the new data with the original data
+                df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                # Write out the merged data
+                df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                # Remove the renamed original data file
+                os.remove(os.path.join(workspace, original_file_name_temp))
+            except Exception as e:
+                # Notify of the error
+                print(f"Error occurred while downloading new weather data: {e}")
+
+                # Remove the newly downloaded data file if it exists
+                if os.path.exists(os.path.join(workspace, original_file_name)):
+                    os.remove(os.path.join(workspace, original_file_name))
+
+                # Rename the renamed file back to its original name
+                if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                    os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                # Add the file name to the list of failed downloads
+                failed_downloads.append((name, original_file_name))
+
+        # Check if all the files were downloaded
         if len(glob(f"{workspace}/*{name}*.csv")) < len(params["dbkeys"]):
             missing_files.append(True)
             print(f"After various tries, files are still missing for {name}.")
+
+    # Check if any files failed to update
+    if len(failed_downloads) > 0:
+        print(f"Failed to update the following files: {failed_downloads}")
+
+    # Create LAKE_RAINFALL_DATA.csv and LOONE_AVERAGE_ETPI_DATA.csv
+    weather.merge_data(workspace, 'RAIN')
+    weather.merge_data(workspace, 'ETPI')
 
     if True in missing_files:
         return {"error": "Missing files."}
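The skip logic in both scripts also depends on `dbhydro_data_is_latest` from `loone_data_prep.utils`, which this diff does not include. A plausible reading of its contract, sketched below as an assumption (the real helper may apply a different cutoff or consult DBHYDRO directly): given the newest date already on disk, report whether there is nothing newer worth fetching.

```python
# Hypothetical sketch of dbhydro_data_is_latest; the actual helper in
# loone_data_prep.utils may use a different cutoff.
from datetime import datetime, timedelta


def dbhydro_data_is_latest(date_latest: str) -> bool:
    """Return True when the newest on-disk date is at or past yesterday."""
    latest = datetime.strptime(date_latest, "%Y-%m-%d").date()
    yesterday = (datetime.now() - timedelta(days=1)).date()
    return latest >= yesterday
```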
@@ -3,6 +3,7 @@ from datetime import datetime
 from retry import retry
 from rpy2.robjects import r
 from rpy2.rinterface_lib.embedded import RRuntimeError
+import pandas as pd
 
 
 DEFAULT_DBKEYS = ["16021", "12515", "12524", "13081"]
@@ -20,40 +21,112 @@ def get(
 ) -> None:
     dbkeys_str = "\"" + "\", \"".join(dbkeys) + "\""
 
-    r(
-        f"""
-        library(dbhydroR)
-        library(dplyr)
+    data_type = param
+    data_units_file = None
+    data_units_header = None
+
+    # Get the units for the file name and column header based on the type of data
+    data_units_file, data_units_header = _get_file_header_data_units(data_type)
+
+    r_str = f"""
+    download_weather_data <- function()  # workspace, dbkeys, date_min, date_max, data_type, data_units_file, data_units_header
+    {{
+        library(dbhydroR)
+        library(dplyr)
 
-        dbkeys <- c({dbkeys_str})
+        dbkeys <- c({dbkeys_str})
+        successful_stations <- list()
+
+        for (i in dbkeys)
+        {{
+            # Retrieve data for the dbkey
+            data <- get_hydro(dbkey = i, date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
+
+            # Give the data.frame correct column names so it can be cleaned using the clean_hydro function
+            column_names <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+            colnames(data) <- column_names
+
+            # Check if the data.frame has any rows
+            if (nrow(data) > 0)
+            {{
+                # Get the station
+                station <- data$station[1]
+
+                # Add type and units columns to data so it can be cleaned using the clean_hydro function
+                data$type <- "{data_type}"
+                data$units <- "{data_units_header}"
+
+                # Clean the data.frame
+                data <- clean_hydro(data)
+
+                # Get the filename of the output file
+                filename <- ""
+
+                if ("{param}" %in% c("RADP", "RADT"))
+                {{
+                    filename <- paste(station, "{data_type}", sep = "_")
+                }}
+                else
+                {{
+                    filename <- paste(station, "{data_type}", "{data_units_file}", sep = "_")
+                }}
+
+                filename <- paste0(filename, ".csv")
+                filename <- paste0("{workspace}/", filename)
 
-    for (i in dbkeys) {{
-        # Retrieve data for the dbkey
-        data <- get_hydro(dbkey = i, date_min = "{date_min}", date_max = "{date_max}")
+                # Save data to a CSV file
+                write.csv(data, file = filename)
 
-        # Extract the column names excluding the date column
-        column_names <- names(data)[-1]
+                # Print a message indicating the file has been saved
+                cat("CSV file", filename, "has been saved.\n")
 
-        # Generate the filename based on the column names
-        if ("{param}" %in% c("RADP", "RADT")) {{
-            filename <- paste0("{workspace}/", gsub(" ", "_", sub("_[^_]*$", "", paste(column_names, collapse = "_"))), ".csv")
-        }} else {{
-            filename <- paste0("{workspace}/", paste(column_names, collapse = "_"), ".csv")
-        }}
-
-        # Save data to a CSV file
-        write.csv(data, file = filename)
-
-        # Print a message indicating the file has been saved
-        cat("CSV file", filename, "has been saved.\n")
+                # Append the station to the list of successful stations
+                successful_stations <- c(successful_stations, station)
+            }}
+            else
+            {{
+                # No data given back; it's possible that the dbkey has reached its end date.
+                print(paste("Empty data.frame returned for dbkey", i, "- it's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
+            }}
 
-        # Add a delay between requests
-        Sys.sleep(2) # Wait for 2 seconds before the next iteration
+            # Add a delay between requests
+            Sys.sleep(2) # Wait for 2 seconds before the next iteration
+        }}
+
+        # Return the successful stations to the Python code
+        return(successful_stations)
     }}
     """ # noqa: E501
-    )
+
+    # Download the weather data
+    r(r_str)
+    result = r.download_weather_data()
+
+    # Get the stations of the dbkeys whose data was successfully downloaded
+    stations = []
+    for value in result:
+        stations.append(value[0])
+
+    # Format files to the expected layout
+    for station in stations:
+        if station in ["L001", "L005", "L006", "LZ40"]:
+            _reformat_weather_file(workspace, station, data_type, data_units_file, data_units_header)
+
+            # Print a message indicating the file has been reformatted
+            print(f"CSV file {workspace}/{station}_{data_type}_{data_units_file}.csv has been reformatted.")
+
 
-    if param == "RAIN":
+def merge_data(workspace: str, data_type: str):
+    """
+    Merge the data files for the different stations to create either the LAKE_RAINFALL_DATA.csv or LOONE_AVERAGE_ETPI_DATA.csv file.
+
+    Args:
+        workspace (str): The path to the workspace directory.
+        data_type (str): The type of data. Either 'RAIN' for LAKE_RAINFALL_DATA.csv or 'ETPI' for LOONE_AVERAGE_ETPI_DATA.csv.
+    """
+
+    # Merge the data files for the different stations (LAKE_RAINFALL_DATA.csv)
+    if data_type == "RAIN":
         r(
             f"""
             L001_RAIN_Inches <- read.csv("{workspace}/L001_RAIN_Inches.csv", colClasses = c("NULL", "character", "numeric"))
@@ -79,7 +152,8 @@ def get(
             """ # noqa: E501
         )
 
-    if param == "ETPI":
+    # Merge the data files for the different stations (LOONE_AVERAGE_ETPI_DATA.csv)
+    if data_type == "ETPI":
         r(
             f"""
             L001_ETPI_Inches <- read.csv("{workspace}/L001_ETPI_Inches.csv", colClasses = c("NULL", "character", "numeric"))
@@ -107,6 +181,90 @@ def get(
         )
 
 
+def _reformat_weather_file(workspace: str, station: str, data_type: str, data_units_file: str, data_units_header: str) -> None:
+    '''
+    Reformats the dbhydro weather file to the layout expected by the rest of the LOONE scripts.
+    This function reads in and writes out a .csv file.
+
+    Args:
+        workspace (str): The path to the workspace directory.
+        station (str): The station name. Ex: L001, L005, L006, LZ40.
+        data_type (str): The type of data. Ex: RAIN, ETPI, H2OT, RADP, RADT, AIRT, WNDS.
+        data_units_file (str): The units for the file name. Ex: Inches, Degrees Celsius, etc.
+        data_units_header (str): The units for the column header. Ex: Inches, Degrees Celsius, etc. Can differ from data_units_file when data_type is either RADP or RADT.
+
+    Returns:
+        None
+    '''
+    # Read in the data
+    df = None
+    if data_type in ['RADP', 'RADT']:
+        df = pd.read_csv(f"{workspace}/{station}_{data_type}.csv")
+    else:
+        df = pd.read_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
+
+    # Remove unneeded columns
+    df.drop(f' _{data_type}_{data_units_header}', axis=1, inplace=True)
+    df.drop('Unnamed: 0', axis=1, inplace=True)
+
+    # Convert the date column to datetime
+    df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+    # Sort the data by date
+    df.sort_values('date', inplace=True)
+
+    # Renumber the index
+    df.reset_index(drop=True, inplace=True)
+
+    # Drop rows that are missing all their values
+    df.dropna(how='all', inplace=True)
+
+    # Write the updated data back to the file
+    if data_type in ['RADP', 'RADT']:
+        df.to_csv(f"{workspace}/{station}_{data_type}.csv")
+    else:
+        df.to_csv(f"{workspace}/{station}_{data_type}_{data_units_file}.csv")
+
+
+def _get_file_header_data_units(data_type: str) -> tuple[str, str]:
+    """
+    Retrieves the units of measurement for a given environmental data type, to be used in file names and column headers.
+
+    This function maps a specified environmental data type to its corresponding units of measurement.
+    These units are used for naming files and for the column headers within those files.
+
+    Args:
+        data_type (str): The type of environmental data for which units are being requested. Supported types are "RAIN", "ETPI", "H2OT", "RADP", "RADT", "AIRT", and "WNDS".
+
+    Returns:
+        tuple[str, str]: A tuple of two strings. The first is the unit of measurement for the file name, and the second is the unit of measurement for the column header in the data file.
+    """
+    # Get the units for the file name and column header based on the type of data
+    if data_type == "RAIN":
+        data_units_file = "Inches"
+        data_units_header = "Inches"
+    elif data_type == "ETPI":
+        data_units_file = "Inches"
+        data_units_header = "Inches"
+    elif data_type == "H2OT":
+        data_units_file = "Degrees Celsius"
+        data_units_header = "Degrees Celsius"
+    elif data_type == "RADP":
+        data_units_file = ""
+        data_units_header = "MICROMOLE/m^2/s"
+    elif data_type == "RADT":
+        data_units_file = ""
+        data_units_header = "kW/m^2"
+    elif data_type == "AIRT":
+        data_units_file = "Degrees Celsius"
+        data_units_header = "Degrees Celsius"
+    elif data_type == "WNDS":
+        data_units_file = "MPH"
+        data_units_header = "MPH"
+
+    return data_units_file, data_units_header
+
+
 if __name__ == "__main__":
     args = [sys.argv[1].rstrip("/"), sys.argv[2]]
     if len(sys.argv) >= 4:
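A closing note on `_get_file_header_data_units`: for an unsupported data type, the if/elif ladder never assigns the two locals, so the final return raises an UnboundLocalError. A table-driven variant, shown here only as a sketch of an alternative (not how the package implements it), makes both the mapping and the failure mode explicit:

```python
# Sketch of a table-driven alternative to _get_file_header_data_units.
# Each entry is (file-name units, column-header units) for a data type.
DATA_UNITS = {
    "RAIN": ("Inches", "Inches"),
    "ETPI": ("Inches", "Inches"),
    "H2OT": ("Degrees Celsius", "Degrees Celsius"),
    "RADP": ("", "MICROMOLE/m^2/s"),
    "RADT": ("", "kW/m^2"),
    "AIRT": ("Degrees Celsius", "Degrees Celsius"),
    "WNDS": ("MPH", "MPH"),
}


def get_file_header_data_units(data_type: str) -> tuple[str, str]:
    """Return (file-name units, column-header units) for a supported data type."""
    try:
        return DATA_UNITS[data_type]
    except KeyError:
        raise ValueError(f"Unsupported data type: {data_type}") from None
```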