loone-data-prep 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes and reflects the changes between the two versions as they appear in the public registry.
- loone_data_prep/LOONE_DATA_PREP.py +0 -2
- loone_data_prep/flow_data/S65E_total.py +71 -6
- loone_data_prep/flow_data/get_inflows.py +130 -41
- loone_data_prep/flow_data/get_outflows.py +110 -26
- loone_data_prep/flow_data/hydro.py +121 -27
- loone_data_prep/utils.py +69 -0
- loone_data_prep/water_level_data/get_all.py +208 -11
- loone_data_prep/water_level_data/hydro.py +71 -3
- loone_data_prep/water_quality_data/get_inflows.py +88 -3
- loone_data_prep/water_quality_data/get_lake_wq.py +85 -3
- loone_data_prep/water_quality_data/wq.py +44 -0
- loone_data_prep/weather_data/get_all.py +126 -3
- loone_data_prep/weather_data/weather.py +185 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/METADATA +1 -1
- loone_data_prep-0.1.7.dist-info/RECORD +27 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/WHEEL +1 -1
- loone_data_prep-0.1.6.dist-info/RECORD +0 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/LICENSE +0 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/top_level.txt +0 -0
loone_data_prep/utils.py
CHANGED
@@ -663,6 +663,75 @@ def nutrient_prediction(
         out_dataframe.to_csv(os.path.join(input_dir, f"{station}_PHOSPHATE_predicted.csv"))
 
 
+def find_last_date_in_csv(workspace: str, file_name: str) -> str:
+    """
+    Gets the most recent date from the last line of a .csv file.
+    Assumes the file is formatted as a .csv file, encoded in UTF-8,
+    and the rows in the file are sorted by date in ascending order.
+
+    Args:
+        workspace (str): The directory where the file is located.
+        file_name (str): The name of the file.
+
+    Returns:
+        str: The most recent date as a string in YYYY-MM-DD format, or None if the file does not exist or the date cannot be found.
+    """
+    # Helper Functions
+    def is_valid_date(date_string):
+        try:
+            datetime.datetime.strptime(date_string, '%Y-%m-%d')
+            return True
+        except ValueError:
+            return False
+
+    # Check that file exists
+    file_path = os.path.join(workspace, file_name)
+    if not os.path.exists(file_path):
+        return None
+
+    # Attempt to extract the date of the last line in the file
+    try:
+        with open(file_path, 'rb') as file:
+            # Go to the end of the file
+            file.seek(-2, os.SEEK_END)
+
+            # Loop backwards until you find the first newline character
+            while file.read(1) != b'\n':
+                file.seek(-2, os.SEEK_CUR)
+
+            # Read the last line
+            last_line = file.readline().decode()
+
+            # Extract the date from the last line
+            date = None
+
+            for value in last_line.split(','):
+                if is_valid_date(value):
+                    date = value
+                    break
+
+            # Return date
+            return date
+    except OSError as e:
+        print(f"Error reading file {file_name}: {e}")
+        return None
+
+
+def dbhydro_data_is_latest(date_latest: str):
+    """
+    Checks whether the given date is the most recent date possible to get data from dbhydro.
+    Can be used to check whether dbhydro data is up-to-date.
+
+    Args:
+        date_latest (str): The date of the most recent data of the dbhydro data you have.
+
+    Returns:
+        bool: True if date_latest is the most recent date possible to get data from dbhydro, False otherwise.
+    """
+    date_latest_object = datetime.datetime.strptime(date_latest, "%Y-%m-%d").date()
+    return date_latest_object == (datetime.datetime.now().date() - datetime.timedelta(days=1))
+
+
 if __name__ == "__main__":
     if sys.argv[1] == "get_dbkeys":
         get_dbkeys(sys.argv[2].strip("[]").replace(" ", "").split(","), *sys.argv[3:])
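The two new helpers are designed to be used together to decide between a full and an incremental download. A minimal usage sketch, assuming a hypothetical workspace path and an existing LO_Stage.csv:

from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest

workspace = "/data/loone"  # hypothetical workspace directory

# None means the file is missing (or unreadable), so a full download is needed.
date_latest = find_last_date_in_csv(workspace, "LO_Stage.csv")

if date_latest is None:
    print("No usable LO_Stage.csv found; download everything.")
elif dbhydro_data_is_latest(date_latest):
    print("LO_Stage.csv already ends yesterday; nothing new to fetch.")
else:
    print(f"Incremental download needed, starting from {date_latest}.")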
loone_data_prep/water_level_data/get_all.py
CHANGED

@@ -1,34 +1,231 @@
 import sys
 import os
+import requests
+import uuid
+from datetime import datetime
 from loone_data_prep.water_level_data import hydro
+from loone_data_prep.flow_data.get_forecast_flows import get_stations_latitude_longitude
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
+import pandas as pd
 
+DATE_NOW = datetime.now().date().strftime("%Y-%m-%d")
 
 D = {
-    "LO_Stage": {"dbkeys": ["16022", "12509", "12519", "16265", "15611"]},
-    "…
-    "…
-    "…
-    "…
-    "…
+    "LO_Stage": {"dbkeys": ["16022", "12509", "12519", "16265", "15611"], "datum": "NGVD29"},
+    "LO_Stage_2": {"dbkeys": ["94832"], "date_min": "2024-04-30", "datum": "NAVD88"},
+    "Stg_3ANW": {"dbkeys": ["LA369"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
+    "Stg_2A17": {"dbkeys": ["16531"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
+    "Stg_3A3": {"dbkeys": ["16532"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
+    "Stg_3A4": {"dbkeys": ["16537"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"},
+    "Stg_3A28": {"dbkeys": ["16538"], "date_min": "1972-01-01", "date_max": "2023-04-30", "datum": "NGVD29"}
 }
 
 
 def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
+    failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)
+
+    # Get the date of the latest data in LO_Stage_2.csv
+    date_latest_lo_stage_2 = find_last_date_in_csv(workspace, "LO_Stage_2.csv")
+
     for name, params in d.items():
-
-
+        # Get the date of the latest data in the csv file
+        date_latest = find_last_date_in_csv(workspace, f"{name}.csv")
+
+        # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+        if date_latest is None:
+            print(f"Getting all water level data for {name}.")
+            hydro.get(workspace, name, **params)
+        else:
+            # Check whether the latest data is already up to date.
+            if dbhydro_data_is_latest(date_latest):
+                # Notify that the data is already up to date
+                print(f'Downloading of new water level data skipped for {name}. Data is already up to date.')
+                continue
+
+            # Temporarily rename the current data file so it isn't overwritten
+            original_file_name = f"{name}.csv"
+            original_file_name_temp = f"{name}_{uuid.uuid4()}.csv"
+            os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+            try:
+                # Download only the new data
+                print(f'Downloading new water level data for {name} starting from date {date_latest}')
+                hydro.get(workspace, name, dbkeys=params['dbkeys'], date_min=date_latest, date_max=DATE_NOW, datum=params['datum'])
+
+                # Read in the original data and the newly downloaded data
+                df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+                df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+
+                # For get_hydro() calls with multiple dbkeys, remove the row corresponding to the latest date from the downloaded data.
+                # When get_hydro() is given multiple keys, its returned data starts from the date given instead of the day after,
+                # as it does when given a single key.
+                if len(params['dbkeys']) > 1:
+                    df_new = df_new[df_new['date'] != date_latest]
+
+                # Merge the new data with the original data
+                df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                # Write out the merged data
+                df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                # Remove the original renamed data file
+                os.remove(os.path.join(workspace, original_file_name_temp))
+            except Exception as e:
+                # Notify of the error
+                print(f"Error occurred while downloading new water level data: {e}")
+
+                # Remove the newly downloaded data file if it exists
+                if os.path.exists(os.path.join(workspace, original_file_name)):
+                    os.remove(os.path.join(workspace, original_file_name))
+
+                # Rename the original renamed file back to its original name
+                if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                    os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                # Add the file name to the list of failed downloads
+                failed_downloads.append(original_file_name)
+
         if os.path.exists(os.path.join(workspace, f"{name}.csv")):
             print(f"{name} downloaded successfully.")
         else:
             missing_files.append(f"{name}.csv")
             print(f"{name} could not be downloaded after various tries.")
 
-
-
-
+    # Merge data from the old and new dbkeys for station "L OKEE"
+    convert_failure = False
+    if os.path.exists(os.path.join(workspace, "LO_Stage.csv")) and os.path.exists(os.path.join(workspace, "LO_Stage_2.csv")):
+        # Output Progress
+        print("\nMerging data for station 'L OKEE'...")
+
+        # Get the latitude and longitude of the "L OKEE" station
+        lat_long_map = get_stations_latitude_longitude(["L OKEE"])
+        latitude, longitude = lat_long_map["L OKEE"]
+
+        # Load the LO_Stage_2.csv file
+        df_lo_stage_2 = pd.read_csv(os.path.join(workspace, "LO_Stage_2.csv"), index_col="date")
+        df_lo_stage_2.index = pd.to_datetime(df_lo_stage_2.index)
+
+        # Output Progress
+        print("Converting NAVD88 to NGVD29 for 'L OKEE's new dbkey...\n")
+
+        # Use only the data that is not already in the LO_Stage.csv file
+        if date_latest_lo_stage_2 is not None:
+            date_start = datetime.strptime(date_latest_lo_stage_2, "%Y-%m-%d") + pd.DateOffset(days=1)
+            df_lo_stage_2 = df_lo_stage_2.loc[date_start:]
+
+        # Convert the stage values from NAVD88 to NGVD29
+        lo_stage_2_dates = df_lo_stage_2.index.tolist()
+        lo_stage_2_values_navd88 = df_lo_stage_2["L OKEE_STG_ft NGVD29"].tolist()
+        lo_stage_2_values_ngvd29 = []
+
+        for i in range(0, len(lo_stage_2_values_navd88)):
+            date = lo_stage_2_dates[i]
+            value = lo_stage_2_values_navd88[i]
+            try:
+                lo_stage_2_values_ngvd29.append(_convert_navd88_to_ngvd29(latitude, longitude, value, date.year))
+            except Exception as e:
+                convert_failure = True
+                print(str(e))
+                break
+
+        # Check for conversion failure
+        if not convert_failure:
+            # Update the LO_Stage.csv file with the converted values
+            df_lo_stage = pd.read_csv(os.path.join(workspace, "LO_Stage.csv"), index_col="date")
+            df_lo_stage.index = pd.to_datetime(df_lo_stage.index)
+
+            for i in range(0, len(lo_stage_2_values_ngvd29)):
+                # Get the current date and value
+                date = lo_stage_2_dates[i]
+                value = lo_stage_2_values_ngvd29[i]
+
+                # Update the value in the LO_Stage dataframe
+                df_lo_stage.at[date, "L OKEE_STG_ft NGVD29"] = value
+
+            # Reset the index
+            df_lo_stage.reset_index(inplace=True)
+            df_lo_stage.drop(columns=["Unnamed: 0"], inplace=True)
+
+            # Save the updated LO_Stage.csv file
+            df_lo_stage.to_csv(os.path.join(workspace, "LO_Stage.csv"))
+    else:
+        # Conversion failed due to missing files
+        convert_failure = True
+        print("Error: Missing LO_Stage.csv or LO_Stage_2.csv file, cannot convert and merge.")
+
+    if missing_files or failed_downloads or convert_failure:
+        error_string = ""
+
+        if missing_files:
+            error_string += f"The following files could not be downloaded: {missing_files}"
+
+        if failed_downloads:
+            error_string += f"\nFailed to download the latest data for the following files: {failed_downloads}"
+
+        if convert_failure:
+            error_string += "\nFailed to convert NAVD88 to NGVD29 for 'L OKEE' station."
+
+        return {"error": error_string}
+
     return {"success": "Completed water level data download."}
 
+def _convert_navd88_to_ngvd29(latitude: float, longitude: float, stage: float, year: int) -> float:
+    """Converts a stage value from NAVD88 to NGVD29 using NCAT.
+
+    Args:
+        latitude (float): The latitude of the station (in decimal degrees format).
+        longitude (float): The longitude of the station (in decimal degrees format).
+        stage (float): The stage (water level) value to convert (in feet).
+        year (int): The year when the stage value was recorded.
+
+    Returns:
+        float: The converted stage value in feet (NGVD29).
+    """
+    # Helper functions
+    def _feet_to_meters(feet: float) -> float:
+        return feet * 0.3048
+
+    def _meters_to_feet(meters: float) -> float:
+        return meters / 0.3048
+
+    # Check for NA value
+    if pd.isna(stage):
+        return stage
+
+    # Convert stage to meters
+    stage_meters = _feet_to_meters(stage)
+
+    # Make request
+    base_url = "https://geodesy.noaa.gov/api/ncat/llh"
+
+    params = {
+        "lat": latitude,  # latitude
+        "lon": longitude,  # longitude
+        "orthoHt": stage_meters,  # orthometric height in NAVD88
+        "year": year,  # year of observation
+        "inDatum": "NAD83(1986)",  # Datum used for input latitude and longitude
+        "outDatum": "NAD83(1986)",  # Datum used for output latitude and longitude
+        "inVertDatum": "NAVD88",  # vertical datum of input orthometric height
+        "outVertDatum": "NGVD29",  # vertical datum of output orthometric height (desired vertical datum)
+    }
+
+    try:
+        response = requests.get(base_url, params=params)
+    except Exception as e:
+        raise Exception(f"Error converting NAVD88 to NGVD29: {e}")
+
+    # Check for failure
+    if response.status_code != 200:
+        raise Exception(f"Error converting NAVD88 to NGVD29: {response.text}")
+
+    # Return converted stage in feet
+    try:
+        value = _meters_to_feet(float(response.json()["destOrthoht"]))
+    except Exception as e:
+        raise Exception(f"Error converting NAVD88 to NGVD29: {e}")
+
+    return value
 
 if __name__ == "__main__":
     workspace = sys.argv[1].rstrip("/")
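The NCAT endpoint that _convert_navd88_to_ngvd29 wraps can be exercised on its own. A standalone sketch of the same request; the URL, parameter names, and the destOrthoht response field come from the code above, while the coordinates and height are illustrative values near Lake Okeechobee:

import requests

response = requests.get(
    "https://geodesy.noaa.gov/api/ncat/llh",
    params={
        "lat": 26.95,  # illustrative latitude (decimal degrees)
        "lon": -80.79,  # illustrative longitude (decimal degrees)
        "orthoHt": 4.0,  # illustrative orthometric height in meters (NAVD88)
        "year": 2024,  # year of observation
        "inDatum": "NAD83(1986)",
        "outDatum": "NAD83(1986)",
        "inVertDatum": "NAVD88",
        "outVertDatum": "NGVD29",
    },
    timeout=30,
)
response.raise_for_status()
print(response.json()["destOrthoht"])  # converted height in meters (NGVD29)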
loone_data_prep/water_level_data/hydro.py
CHANGED

@@ -3,7 +3,7 @@ from datetime import datetime
 from retry import retry
 from rpy2.robjects import r
 from rpy2.rinterface_lib.embedded import RRuntimeError
-
+import pandas as pd
 
 DEFAULT_DBKEYS = ["16022", "12509", "12519", "16265", "15611"]
 DATE_NOW = datetime.now().strftime("%Y-%m-%d")
@@ -16,20 +16,88 @@ def get(
     dbkeys: list = DEFAULT_DBKEYS,
     date_min: str = "1950-01-01",
     date_max: str = DATE_NOW,
+    datum: str = "",
     **kwargs: str | list
 ) -> None:
+    # Get the type and units for the station
+    data_type = "STG"
+    units = "ft NGVD29"
+
+    if name in ["Stg_3A3", "Stg_2A17", "Stg_3A4", "Stg_3A28"]:
+        data_type = "GAGHT"
+        units = "feet"
+
     dbkeys_str = "\"" + "\", \"".join(dbkeys) + "\""
     r(
         f"""
         # Load the required libraries
         library(rio)
         library(dbhydroR)
-
-
+        library(dplyr)
+
+        # Stage Data
+        if ("{datum}" == "")
+        {{
+            {name} <- get_hydro(dbkey = c({dbkeys_str}), date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
+        }}
+
+        if (nchar("{datum}") > 0)
+        {{
+            {name} <- get_hydro(dbkey = c({dbkeys_str}), date_min = "{date_min}", date_max = "{date_max}", raw = TRUE, datum = "{datum}")
+        }}
+
+        # Give the data.frame correct column names so it can be cleaned using the clean_hydro function
+        colnames({name}) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+        # Check if the data.frame has any rows
+        if (nrow({name}) == 0)
+        {{
+            # No data given back. It's possible that the dbkey has reached its end date.
+            print(paste("Empty data.frame returned for dbkeys", "{dbkeys}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
+            return(list(success = FALSE, dbkey = "{dbkeys}"))
+        }}
+
+        # Get the station
+        station <- {name}$station[1]
+
+        # Add type and units columns to the data so it can be cleaned using the clean_hydro function
+        {name}$type <- "{data_type}"
+        {name}$units <- "{units}"
+
+        # Clean the data.frame
+        {name} <- clean_hydro({name})
+
+        # Drop the " _{data_type}_{units}" column
+        {name} <- {name} %>% select(-` _{data_type}_{units}`)
+
+        # Write the data to a csv file
         write.csv({name},file ='{workspace}/{name}.csv')
         """
     )
+
+    _reformat_water_level_file(workspace, name)
 
+def _reformat_water_level_file(workspace: str, name: str):
+    # Read in the data
+    df = pd.read_csv(f"{workspace}/{name}.csv")
+
+    # Drop the "Unnamed: 0" column
+    df.drop(columns=['Unnamed: 0'], inplace=True)
+
+    # Convert the date column to datetime
+    df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+    # Sort the data by date
+    df.sort_values('date', inplace=True)
+
+    # Renumber the index
+    df.reset_index(drop=True, inplace=True)
+
+    # Drop rows that are missing all their values
+    df.dropna(how='all', inplace=True)
+
+    # Write the updated data back to the file
+    df.to_csv(f"{workspace}/{name}.csv")
 
 if __name__ == "__main__":
     args = [sys.argv[1].rstrip("/"), sys.argv[2]]
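With the new datum parameter, callers can request stage data against a specific vertical datum; the empty-string default keeps the old behavior of calling get_hydro() without a datum argument. A hypothetical invocation, reusing the dbkey, start date, and datum from the LO_Stage_2 entry in get_all.py above:

from loone_data_prep.water_level_data import hydro

hydro.get(
    "/data/loone",  # hypothetical workspace directory
    "LO_Stage_2",
    dbkeys=["94832"],
    date_min="2024-04-30",
    datum="NAVD88",  # forwarded to dbhydroR's get_hydro()
)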
loone_data_prep/water_quality_data/get_inflows.py
CHANGED

@@ -1,6 +1,10 @@
 import sys
 import os
+import uuid
+from datetime import datetime, timedelta
+import pandas as pd
 from loone_data_prep.water_quality_data import wq
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
 D = {
@@ -21,18 +25,99 @@ D = {
 }
 
 
+DEFAULT_DATE_MIN = "1950-01-01"
+
+
 def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
+    failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)
     for name, params in d.items():
         print(f"Getting {name} for the following station IDs: {params['station_ids']}.")
-
+
+        # Get the date of the latest data in the csv file for each station id
+        station_date_latest = {}
+        for station_id in params["station_ids"]:
+            station_date_latest[station_id] = find_last_date_in_csv(workspace, f"water_quality_{station_id}_{name}.csv")
+
+        # Get the water quality data
+        for station_id, date_latest in station_date_latest.items():
+            # File with data for this station/name combination does NOT already exist (or possibly some other error occurred)
+            if date_latest is None:
+                # Get all the water quality data for the name/station combination
+                print(f"Getting all {name} data for station ID: {station_id}.")
+                wq.get(workspace, name, [station_id])
+            else:
+                # Check whether we already have the latest data
+                if dbhydro_data_is_latest(date_latest):
+                    # Notify that the data is already up to date
+                    print(f'Downloading of new water quality data for test name: {name} station: {station_id} skipped. Data is already up to date.')
+                    continue
+
+                # Temporarily rename the current data file so it isn't overwritten
+                original_file_name = f"water_quality_{station_id}_{name}.csv"
+                original_file_name_temp = f"water_quality_{station_id}_{name}_{uuid.uuid4()}.csv"
+                os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+                try:
+                    # Get only the water quality data that is newer than the latest data in the csv file
+                    print(f"Downloading new water quality data for test name: {name} station ID: {station_id} starting from date: {date_latest}.")
+                    date_latest = (datetime.strptime(date_latest, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
+                    wq.get(workspace, name, [station_id], date_min=date_latest)
+
+                    # Data failed to download - It's possible the data's end date has been reached
+                    if not os.path.exists(os.path.join(workspace, original_file_name)):
+                        raise Exception(f"It's possible that the data for test name: {name} station ID: {station_id} has reached its end date.")
+
+                    # Read in the original data
+                    df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+
+                    # Calculate the days column for the newly downloaded data
+                    df_original_date_min = df_original['date'].min()
+                    wq._calculate_days_column(workspace, original_file_name, df_original_date_min)
+
+                    # Read in the newly downloaded data
+                    df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                    df_new.reset_index(inplace=True)
+
+                    # Merge the new data with the original data
+                    df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                    # Write out the merged data
+                    df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                    # Remove the original renamed data file
+                    os.remove(os.path.join(workspace, original_file_name_temp))
+                except Exception as e:
+                    # Notify of the error
+                    print(f"Error occurred while downloading new water quality data: {e}")
+
+                    # Remove the newly downloaded data file if it exists
+                    if os.path.exists(os.path.join(workspace, original_file_name)):
+                        os.remove(os.path.join(workspace, original_file_name))
+
+                    # Rename the original renamed file back to its original name
+                    if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                        os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                    # Add the file name to the list of failed downloads
+                    failed_downloads.append(original_file_name)
+
+        # Check for any download failures
         for station in params["station_ids"]:
             if not os.path.exists(os.path.join(workspace, f"water_quality_{station}_{name}.csv")):
                 missing_files.append(f"water_quality_{station}_{name}.csv")
                 print(f"{name} station ID: {station} could not be downloaded after various tries.")
 
-    if missing_files:
-
+    if missing_files or failed_downloads:
+        error_string = ""
+
+        if missing_files:
+            error_string += f"The following files could not be downloaded: {missing_files}"
+
+        if failed_downloads:
+            error_string += f"\nThe following files could not be updated: {failed_downloads}"
+
+        return {"error": error_string}
 
     return {"success": "Completed water quality data download."}
 
loone_data_prep/water_quality_data/get_lake_wq.py
CHANGED

@@ -1,6 +1,10 @@
 import sys
 import os
+import uuid
+from datetime import datetime, timedelta
+import pandas as pd
 from loone_data_prep.water_quality_data import wq
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
 D = {
@@ -28,16 +32,94 @@ D = {
 
 def main(workspace: str, d: dict = D) -> dict:
     missing_files = []
+    failed_downloads = []  # List of file names that the script failed to get the latest data for (but the files still exist)
     for name, params in d.items():
         print(f"Getting {name} for the following station IDs: {params['station_ids']}.")
-
+
+        # Get the date of the latest data in the csv file for each station id
+        station_date_latest = {}
+        for station_id in params["station_ids"]:
+            station_date_latest[station_id] = find_last_date_in_csv(workspace, f"water_quality_{station_id}_{name}.csv")
+
+        # Get the water quality data
+        for station_id, date_latest in station_date_latest.items():
+            # File with data for this station/name combination does NOT already exist (or possibly some other error occurred)
+            if date_latest is None:
+                # Get all the water quality data for the name/station combination
+                print(f"Getting all {name} data for station ID: {station_id}.")
+                wq.get(workspace, name, [station_id])
+            else:
+                # Check whether we already have the latest data
+                if dbhydro_data_is_latest(date_latest):
+                    # Notify that the data is already up to date
+                    print(f'Downloading of new water quality data for test name: {name} station: {station_id} skipped. Data is already up to date.')
+                    continue
+
+                # Temporarily rename the current data file so it isn't overwritten
+                original_file_name = f"water_quality_{station_id}_{name}.csv"
+                original_file_name_temp = f"water_quality_{station_id}_{name}_{uuid.uuid4()}.csv"
+                os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, original_file_name_temp))
+
+                try:
+                    # Get only the water quality data that is newer than the latest data in the csv file
+                    print(f"Downloading new water quality data for test name: {name} station ID: {station_id} starting from date: {date_latest}.")
+                    date_latest = (datetime.strptime(date_latest, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
+                    wq.get(workspace, name, [station_id], date_min=date_latest)
+
+                    # Data failed to download - It's possible the data's end date has been reached
+                    if not os.path.exists(os.path.join(workspace, original_file_name)):
+                        raise Exception(f"It's possible that the data for test name: {name} station ID: {station_id} has reached its end date.")
+
+                    # Read in the original data
+                    df_original = pd.read_csv(os.path.join(workspace, original_file_name_temp), index_col=0)
+
+                    # Calculate the days column for the newly downloaded data
+                    df_original_date_min = df_original['date'].min()
+                    wq._calculate_days_column(workspace, original_file_name, df_original_date_min)
+
+                    # Read in the newly downloaded data
+                    df_new = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                    df_new.reset_index(inplace=True)
+
+                    # Merge the new data with the original data
+                    df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                    # Write out the merged data
+                    df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                    # Remove the original renamed data file
+                    os.remove(os.path.join(workspace, original_file_name_temp))
+                except Exception as e:
+                    # Notify of the error
+                    print(f"Error occurred while downloading new water quality data: {e}")
+
+                    # Remove the newly downloaded data file if it exists
+                    if os.path.exists(os.path.join(workspace, original_file_name)):
+                        os.remove(os.path.join(workspace, original_file_name))
+
+                    # Rename the original renamed file back to its original name
+                    if os.path.exists(os.path.join(workspace, original_file_name_temp)):
+                        os.rename(os.path.join(workspace, original_file_name_temp), os.path.join(workspace, original_file_name))
+
+                    # Add the file name to the list of failed downloads
+                    failed_downloads.append(original_file_name)
+
+        # Check for missing files
         for station in params["station_ids"]:
             if not os.path.exists(os.path.join(workspace, f"water_quality_{station}_{name}.csv")):
                 missing_files.append(f"water_quality_{station}_{name}.csv")
                 print(f"{name} station ID: {station} could not be downloaded after various tries.")
 
-    if missing_files:
-
+    if missing_files or failed_downloads:
+        error_string = ""
+
+        if missing_files:
+            error_string += f"The following files could not be downloaded: {missing_files}"
+
+        if failed_downloads:
+            error_string += f"\nThe following files could not be updated: {failed_downloads}"
+
+        return {"error": error_string}
 
     return {"success": "Completed water quality data download."}
 
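Both water quality scripts repeat the same update pattern: rename the existing file aside under a UUID-suffixed name, download fresh data into the original name, merge the old and new rows, and roll everything back on failure. A distilled sketch of that pattern with hypothetical names (the real scripts inline this logic rather than calling a helper like update_csv):

import os
import uuid
import pandas as pd

def update_csv(workspace: str, file_name: str, download_new_data) -> bool:
    # Set the existing file aside so the download can't overwrite it.
    temp_name = f"{file_name}.{uuid.uuid4()}.bak"  # hypothetical backup naming
    os.rename(os.path.join(workspace, file_name), os.path.join(workspace, temp_name))
    try:
        download_new_data()  # expected to write a fresh csv at file_name
        df_old = pd.read_csv(os.path.join(workspace, temp_name), index_col=0)
        df_new = pd.read_csv(os.path.join(workspace, file_name), index_col=0)
        # Append the new rows to the old ones and persist the merged result.
        pd.concat([df_old, df_new], ignore_index=True).to_csv(os.path.join(workspace, file_name))
        os.remove(os.path.join(workspace, temp_name))
        return True
    except Exception:
        # Roll back: discard any partial download and restore the original file.
        if os.path.exists(os.path.join(workspace, file_name)):
            os.remove(os.path.join(workspace, file_name))
        os.rename(os.path.join(workspace, temp_name), os.path.join(workspace, file_name))
        return False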