loone-data-prep 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loone_data_prep/LOONE_DATA_PREP.py +0 -2
- loone_data_prep/flow_data/S65E_total.py +71 -6
- loone_data_prep/flow_data/get_inflows.py +130 -41
- loone_data_prep/flow_data/get_outflows.py +110 -26
- loone_data_prep/flow_data/hydro.py +121 -27
- loone_data_prep/utils.py +69 -0
- loone_data_prep/water_level_data/get_all.py +208 -11
- loone_data_prep/water_level_data/hydro.py +71 -3
- loone_data_prep/water_quality_data/get_inflows.py +88 -3
- loone_data_prep/water_quality_data/get_lake_wq.py +85 -3
- loone_data_prep/water_quality_data/wq.py +44 -0
- loone_data_prep/weather_data/get_all.py +126 -3
- loone_data_prep/weather_data/weather.py +185 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/METADATA +1 -1
- loone_data_prep-0.1.7.dist-info/RECORD +27 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/WHEEL +1 -1
- loone_data_prep-0.1.6.dist-info/RECORD +0 -27
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/LICENSE +0 -0
- {loone_data_prep-0.1.6.dist-info → loone_data_prep-0.1.7.dist-info}/top_level.txt +0 -0
loone_data_prep/LOONE_DATA_PREP.py

```diff
@@ -383,7 +383,6 @@ def main(input_dir: str, output_dir: str) -> None:
     LOWS['LZ40WS'] = LZ40WS['LZ40_WNDS_MPH']
     LOWS['LO_Avg_WS_MPH'] = LOWS.mean(axis=1, numeric_only=True)
     LOWS.to_csv(f'{output_dir}/LOWS.csv', index=False)
-    LOWS.to_csv(f'{input_dir}/LOWS.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()
 
     # RFVol acft
     # Create File (RF_Volume)
@@ -897,7 +896,6 @@ def main(input_dir: str, output_dir: str) -> None:
     # Write Data into csv files
     # write Avg Stage (ft, m) Storage (acft, m3) SA (acres) to csv
     LO_Stg_Sto_SA_df.to_csv(f'{output_dir}/Average_LO_Storage_3MLag.csv', index=False)
-    LO_Stg_Sto_SA_df.to_csv(f'{input_dir}/Average_LO_Storage_3MLag.csv', index=False)  # Also needed in temporary directory by utils.py's wind_induced_waves()
     # Write S65 TP concentrations (mg/L)
     S65_total_TP.to_csv(f'{output_dir}/S65_TP_3MLag.csv', index=False)
     # TP External Loads 3 Months Lag (mg)
```
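Both hunks drop a duplicate CSV write into `input_dir`; the context lines above them show the computation that survives, a row-wise average wind speed written once to `output_dir`. A minimal pandas sketch of that averaging pattern (the station columns here are illustrative, not the package's full set):

```python
import pandas as pd

# Illustrative wind-speed frame with one column per station (MPH)
LOWS = pd.DataFrame({
    "date": pd.date_range("2023-01-01", periods=3),
    "LZ40WS": [10.2, 9.7, 8.8],
    "L001WS": [9.1, 8.4, 7.9],
})

# Row-wise mean over numeric columns only; numeric_only=True
# keeps the datetime 'date' column out of the average.
LOWS["LO_Avg_WS_MPH"] = LOWS.mean(axis=1, numeric_only=True)

LOWS.to_csv("LOWS.csv", index=False)  # 0.1.7 keeps only the output_dir copy
```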
loone_data_prep/flow_data/S65E_total.py

```diff
@@ -2,22 +2,87 @@ import sys
 from retry import retry
 from rpy2.robjects import r
 from rpy2.rinterface_lib.embedded import RRuntimeError
+import pandas as pd
 
 
 @retry(RRuntimeError, tries=5, delay=15, max_delay=60, backoff=2)
-def get(
+def get(
+    workspace,
+    date_min: str = "1972-01-01",
+    date_max: str = "2023-06-30"
+) -> None:
     r(
         f"""
         # Load the required libraries
         library(dbhydroR)
-
-
-
-
-
+        library(dplyr)
+
+        # Helper Functions
+        retrieve_data <- function(dbkey, date_min, date_max)
+        {{
+            # Get the data from dbhydro
+            df = get_hydro(dbkey = dbkey, date_min = date_min, date_max = date_max, raw = TRUE)
+
+            # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+            colnames(df) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+            # Add a type and units column to data so it can be cleaned using the clean_hydro function
+            df$type <- "FLOW"
+            df$units <- "cfs"
+
+            # Clean the data.frame
+            df <- clean_hydro(df)
+
+            # Drop the " _FLOW_cfs" column
+            df <- df %>% select(-` _FLOW_cfs`)
+
+            # Convert Flow rate from cfs to m³/day
+            df[, -1] <- df[, -1] * (0.0283168466 * 86400)
+
+            # Return resulting data.frame
+            return(df)
+        }}
+
+        # S65E_S
+        S65E_S <- retrieve_data(dbkey = "91656", date_min = "{date_min}", date_max = "{date_max}")
+
+        # Wait five seconds before next request to avoid "too many requests" error
+        Sys.sleep(5)
+
+        # S65EX1_S
+        S65EX1_S <- retrieve_data(dbkey = "AL760", date_min = "{date_min}", date_max = "{date_max}")
+
+        # Merge the data from each dbkey
+        result <- merge(S65E_S, S65EX1_S, by = "date", all = TRUE)
+
+        # Write the data to a file
+        write.csv(result, file = '{workspace}/S65E_total.csv')
         """
     )
+
+    _reformat_s65e_total_file(workspace)
 
+def _reformat_s65e_total_file(workspace: str):
+    # Read in the data
+    df = pd.read_csv(f"{workspace}/S65E_total.csv")
+
+    # Drop unused columns
+    df.drop('Unnamed: 0', axis=1, inplace=True)
+
+    # Convert date column to datetime
+    df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+    # Sort the data by date
+    df.sort_values('date', inplace=True)
+
+    # Renumber the index
+    df.reset_index(drop=True, inplace=True)
+
+    # Drop rows that are missing all their values
+    df.dropna(how='all', inplace=True)
+
+    # Write the updated data back to the file
+    df.to_csv(f"{workspace}/S65E_total.csv")
 
 if __name__ == "__main__":
     workspace = sys.argv[1].rstrip("/")
```
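The factor `0.0283168466 * 86400` in `retrieve_data` converts a daily-mean flow from cubic feet per second to cubic meters per day (1 cfs = 0.0283168466 m³/s, and one day is 86,400 seconds), about 2,446.58 m³/day per cfs. A quick sanity check in Python:

```python
# 1 cubic foot = 0.0283168466 m³; one day = 86,400 seconds
CFS_TO_CMD = 0.0283168466 * 86400   # ≈ 2446.58 m³/day per cfs

flow_cfs = 1200.0                   # example daily-mean flow in cfs
flow_cmd = flow_cfs * CFS_TO_CMD
print(f"{flow_cmd:,.0f} m³/day")    # ≈ 2,935,891 m³/day
```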
loone_data_prep/flow_data/get_inflows.py

```diff
@@ -1,43 +1,128 @@
 import sys
 import os
 from glob import glob
+from datetime import datetime
+import uuid
+import pandas as pd
 from loone_data_prep.flow_data import hydro, S65E_total
+from loone_data_prep.utils import find_last_date_in_csv, dbhydro_data_is_latest
 
 
-# Database keys for needed inflow data
-DBKEYS = [
-    "91370",
-    "91371",
-    "91373",
-    "91377",
-    "91379",
-    "91401",
-    "91429",
-    "91473",
-    "91508",
-    "91510",
-    "91513",
-    "91599",
-    "91608",
-    "91656",
-    "91668",
-    "91675",
-    "91687",
-    "15627",
-    "15640",
-    "15626",
-    "15642",
-    "15638",
-]
+# Database keys for needed inflow data mapped to their stations
+DBKEYS = {
+    "91370": "S127_C",
+    "91371": "S127_P",
+    "91373": "S129_C",
+    "91377": "S133_P",
+    "91379": "S135_C",
+    "91401": "S154_C",
+    "91429": "S191_S",
+    "91473": "S2_P",
+    "91508": "S351_S",
+    "91510": "S352_S",
+    "91513": "S354_S",
+    "91599": "S3_P",
+    "91608": "S4_P",
+    "91656": "S65E_S",
+    "91668": "S71_S",
+    "91675": "S72_S",
+    "91687": "S84_S",
+    "15627": "FISHP",
+    "15640": "L8.441",
+    "15626": "S308.DS",
+    "15642": "S129 PMP_P",
+    "15638": "S135 PMP_P",
+}
 
-
-def main(workspace: str, dbkeys: list = DBKEYS) -> dict:
+def main(workspace: str, dbkeys: dict = DBKEYS) -> dict:
+    """
+    Retrieve the inflows data used by LOONE.
+
+    Args:
+        workspace (str): Path to workspace where data will be downloaded.
+        dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+
+    Returns:
+        dict: Success or error message
+    """
+
     # Retrieve inflow data
-    for dbkey in dbkeys:
-
+    for dbkey, station in dbkeys.copy().items():
+        file_name = f"{station}_FLOW_cmd.csv"
+        date_latest = find_last_date_in_csv(workspace, file_name)
+
+        # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+        if date_latest is None:
+            # Download all the data
+            print(f'Downloading all inflow data for {station}')
+            hydro.get(workspace, dbkey)
+        else:
+            # Check whether the latest data is already up to date.
+            if dbhydro_data_is_latest(date_latest):
+                # Notify that the data is already up to date
+                print(f'Downloading of new inflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                # Remove dbkey from dbkeys so we know it didn't fail
+                del dbkeys[dbkey]
+                continue
+
+            # Download only the new data
+            print(f'Downloading new inflow data for {station} starting from date {date_latest}')
+            hydro.get(workspace, dbkey, date_latest)
+
+            # Make sure both our original data and newly downloaded data exist
+            df_original_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+            df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+            if os.path.exists(df_original_path) and os.path.exists(df_new_path):
+                # Merge the new data with the old data
+                df_original = pd.read_csv(df_original_path, index_col=0)
+                df_new = pd.read_csv(df_new_path, index_col=0)
+                df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                # Write the merged data to the new file
+                df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                # Remove the old file
+                os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
 
-    S65E_total.
+    # Download S65E_total.csv Data
+    date_latest = find_last_date_in_csv(workspace, "S65E_total.csv")
 
+    if date_latest is None:
+        print('Downloading all S65E_total data')
+        S65E_total.get(workspace, date_max=datetime.now().strftime("%Y-%m-%d"))
+    else:
+        # Check whether the latest data is already up to date.
+        if dbhydro_data_is_latest(date_latest):
+            # Notify that the data is already up to date
+            print(f'Downloading of new inflow data skipped for S65E_total. Data is already up to date.')
+        else:
+            # Temporarily rename current data file so it isn't over written
+            original_file_name = F"S65E_total_old_{uuid.uuid4()}.csv"
+            os.rename(os.path.join(workspace, "S65E_total.csv"), os.path.join(workspace, original_file_name))
+
+            try:
+                # Download only the new data
+                print(f'Downloading new S65E_total data starting from date {date_latest}')
+                S65E_total.get(workspace, date_min=date_latest, date_max=datetime.now().strftime("%Y-%m-%d"))
+
+                # Merge the new data with the original data
+                df_original = pd.read_csv(os.path.join(workspace, original_file_name), index_col=0)
+                df_new = pd.read_csv(os.path.join(workspace, "S65E_total.csv"), index_col=0)
+                df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                # Write out the merged data
+                df_merged.to_csv(os.path.join(workspace, original_file_name))
+
+                # Remove the newly downloaded data file
+                os.remove(os.path.join(workspace, "S65E_total.csv"))
+            except Exception as e:
+                print(f"Error occurred while downloading new S65E_total data: {e}")
+            finally:
+                # Rename the original updated file back to its original name
+                os.rename(os.path.join(workspace, original_file_name), os.path.join(workspace, "S65E_total.csv"))
+
     # Check if all files were downloaded
     files = glob(f"{workspace}/*FLOW*_cmd.csv")
 
@@ -50,17 +135,21 @@ def main(workspace: str, dbkeys: list = DBKEYS) -> dict:
             os.rename(file, new_file_name)
 
             # Remove dbkey from dbkeys so we know it successfully downloaded
-            dbkeys
-
-
-
-
-
-
-
-
-
-
+            del dbkeys[file_dbkey]
+
+    # Check for failed downloads
+    if len(dbkeys) > 0 or not os.path.exists(f"{workspace}/S65E_total.csv"):
+        error_message = ""
+
+        # dbkeys
+        if len(dbkeys) > 0:
+            error_message += f"The data from the following dbkeys could not be downloaded: {list(dbkeys.keys())}\n"
+
+        # S65E_total.csv
+        if not os.path.exists(f"{workspace}/S65E_total.csv"):
+            error_message += "S65E_total.csv file could not be downloaded.\n"
+
+        return {"error": error_message}
 
     return {"success": "Completed inflow flow data download."}
 
```
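The rewritten `main` applies one incremental pattern per station: look up the newest date already on disk with `find_last_date_in_csv`, skip the station when `dbhydro_data_is_latest` says it is current, otherwise download only the newer rows and append them to the existing file. A minimal, hypothetical sketch of the append step (the file names are illustrative):

```python
import pandas as pd

# Existing data and a freshly downloaded delta for one station
df_original = pd.read_csv("S127_C_FLOW_cmd.csv", index_col=0)
df_new = pd.read_csv("S127_C_FLOW_91370_cmd.csv", index_col=0)

# Plain row-wise append; ignore_index renumbers the rows 0..n-1.
# Note this is a simple concatenation: a date present in both
# frames would appear twice in the result.
df_merged = pd.concat([df_original, df_new], ignore_index=True)
df_merged.to_csv("S127_C_FLOW_91370_cmd.csv")
```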
loone_data_prep/flow_data/get_outflows.py

```diff
@@ -1,7 +1,8 @@
 import sys
 import os
 from glob import glob
-
+import pandas as pd
+from loone_data_prep.utils import get_dbkeys, find_last_date_in_csv, dbhydro_data_is_latest
 from loone_data_prep.flow_data import hydro
 
 STATION_IDS = [
@@ -24,33 +25,39 @@ STATION_IDS = [
 ]
 
 
-DBKEYS = [
-    "91370",
-    "91373",
-    "91379",
-    "91508",
-    "91510",
-    "91513",
-    "91677",
-    "15628",
-    "15640",
-    "15626",
-    "00865",
-    "JW224",
-    "00436",
-    "15018",
-    "91606",
-    "JW223",
-]
-
+DBKEYS = {
+    "91370": "S127_C",
+    "91373": "S129_C",
+    "91379": "S135_C",
+    "91508": "S351_S",
+    "91510": "S352_S",
+    "91513": "S354_S",
+    "91677": "S77_S",
+    "15628": "INDUST",
+    "15640": "L8.441",
+    "15626": "S308.DS",
+    "00865": "S79_TOT",
+    "JW224": "S80_S",
+    "00436": "S2 NNR",
+    "15018": "S3",
+    "91606": "S48_S",
+    "JW223": "S49_S",
+}
 
-def main(workspace: str, dbkeys: list = DBKEYS, station_ids: list = STATION_IDS) -> dict:
-    # Retrieve outflow data
 
-
-
-
+def _get_outflow_data_from_station_ids(workspace: str, station_ids: list) -> dict:
+    """Attempt to download outflow data from station ids.
+
+    Args:
+        workspace (str): Path to workspace where data will be downloaded.
+        station_ids (list): List of station ids to download data for.
+
+    Returns:
+        dict: Success or error message
+    """
+    # Get dbkeys from station ids
+    dbkeys = list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "PREF", detail_level="dbkey"))
+    dbkeys.extend(list(get_dbkeys(station_ids, "SW", "FLOW", "MEAN", "DRV", detail_level="dbkey")))
 
     for dbkey in dbkeys:
         hydro.get(workspace, dbkey, "2000-01-01")
@@ -75,6 +82,83 @@ def main(workspace: str, dbkeys: list = DBKEYS, station_ids: list = STATION_IDS)
     return {"success": "Completed outflow flow data download."}
 
 
+def main(workspace: str, dbkeys: dict = DBKEYS, station_ids: list = STATION_IDS) -> dict:
+    """
+    Retrieve the outflow data used by LOONE.
+
+    Args:
+        workspace (str): Path to workspace where data will be downloaded.
+        dbkeys (dict): Dictionary of dbkeys and corresponding station names.
+        station_ids (list): List of station ids to download data for if the dbkeys argument is not provided.
+
+    Returns:
+        dict: Success or error message
+    """
+
+    # No dbkeys given, attempt to get data from station ids
+    if dbkeys is None:
+        return _get_outflow_data_from_station_ids(workspace, station_ids)
+
+    # Get outflow data from dbkeys
+    for dbkey, station in dbkeys.copy().items():
+        # Get the date of the latest data in the csv file (if any)
+        date_latest = find_last_date_in_csv(workspace, f"{station}_FLOW_cmd.csv")
+
+        # File with data for this dbkey does NOT already exist (or possibly some other error occurred)
+        if date_latest is None:
+            # Download all data
+            print(f'Downloading all outflow data for {station}')
+            hydro.get(workspace, dbkey, "2000-01-01")
+        else:
+            # Check whether the latest data is already up to date.
+            if dbhydro_data_is_latest(date_latest):
+                # Notify that the data is already up to date
+                print(f'Downloading of new outflow data skipped for Station {station} (dbkey: {dbkey}). Data is already up to date.')
+
+                # Remove dbkey from dbkeys so we know it didn't fail
+                del dbkeys[dbkey]
+                continue
+
+            # Download only the new data
+            print(f'Downloading new outflow data for {station} starting from date {date_latest}')
+            hydro.get(workspace, dbkey, date_latest)
+
+            # Make sure both our original data and newly downloaded data exist
+            df_old_path = os.path.join(workspace, f"{station}_FLOW_cmd.csv")
+            df_new_path = os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv")
+
+            if os.path.exists(df_old_path) and os.path.exists(df_new_path):
+                # Merge the new data with the old data
+                df_original = pd.read_csv(df_old_path, index_col=0)
+                df_new = pd.read_csv(df_new_path, index_col=0)
+                df_merged = pd.concat([df_original, df_new], ignore_index=True)
+
+                # Write the merged data to the new file
+                df_merged.to_csv(os.path.join(workspace, f"{station}_FLOW_{dbkey}_cmd.csv"))
+
+                # Remove the old file
+                os.remove(os.path.join(workspace, f"{station}_FLOW_cmd.csv"))
+
+    # Check if all files were downloaded
+    files = glob(f"{workspace}/*FLOW*_cmd.csv")
+
+    for file in files:
+        file_dbkey = file.split("_")[-2]
+
+        if file_dbkey in dbkeys:
+            # Remove dbkey from file name
+            new_file_name = file.replace(f"_{file_dbkey}", "")
+            os.rename(file, new_file_name)
+
+            # Remove dbkey from dbkeys so we know it successfully downloaded
+            del dbkeys[file_dbkey]
+
+    if len(dbkeys) > 0:
+        return {"error": f"The data from the following dbkeys could not be downloaded: {dbkeys}"}
+
+    return {"success": "Completed outflow flow data download."}
+
+
 if __name__ == "__main__":
     workspace = sys.argv[1].rstrip("/")
     main(workspace)
```
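The cleanup loop at the end of the new `main` recovers each dbkey from the interim file name, which has the shape `{station}_FLOW_{dbkey}_cmd.csv`, so the dbkey is always the second-to-last underscore-separated token:

```python
file = "S127_C_FLOW_91370_cmd.csv"   # interim name produced by hydro.get

# Second-to-last underscore-separated token is the dbkey
file_dbkey = file.split("_")[-2]               # '91370'

# Dropping "_{dbkey}" yields the final, stable file name
new_file_name = file.replace(f"_{file_dbkey}", "")

print(file_dbkey, new_file_name)     # 91370 S127_C_FLOW_cmd.csv
```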
loone_data_prep/flow_data/hydro.py

```diff
@@ -2,6 +2,7 @@ import sys
 from datetime import datetime
 from glob import glob
 from retry import retry
+import os
 import pandas as pd
 from rpy2.robjects import r
 from rpy2.rinterface_lib.embedded import RRuntimeError
@@ -18,36 +19,92 @@ def get(
     date_max: str = DATE_NOW
 ) -> None:
     r_str = f"""
-
-
+    download_flow_data <- function(workspace, dbkey, date_min, date_max)
+    {{
+        # Load the required libraries
+        library(dbhydroR)
+        library(dplyr)
 
-
-
+        # Retrieve data for the dbkey
+        data <- get_hydro(dbkey = "{dbkey}", date_min = "{date_min}", date_max = "{date_max}", raw = TRUE)
+
+        # Check if data is empty or contains only the "date" column
+        if (ncol(data) <= 1) {{
+            cat("No data found for dbkey", "{dbkey}", "Skipping to the next dbkey.\n")
+        }}
+
+        # Give data.frame correct column names so it can be cleaned using the clean_hydro function
+        colnames(data) <- c("station", "dbkey", "date", "data.value", "qualifer", "revision.date")
+
+        # Check if the data.frame has any rows
+        if (nrow(data) == 0)
+        {{
+            # No data given back, It's possible that the dbkey has reached its end date.
+            print(paste("Empty data.frame returned for dbkey", "{dbkey}", "It's possible that the dbkey has reached its end date. Skipping to the next dbkey."))
+            return(list(success = FALSE, dbkey = "{dbkey}"))
+        }}
+
+        # Add a type and units column to data so it can be cleaned using the clean_hydro function
+        data$type <- "FLOW"
+        data$units <- "cfs"
+
+        # Get the station
+        station <- data$station[1]
+
+        # Clean the data.frame
+        data <- clean_hydro(data)
 
-
-
-
-
-
-
-
-
-
-
+        # Multiply all columns except "date" column by 0.0283168466 * 86400 to convert Flow rate from cfs to m³/day
+        data[, -1] <- data[, -1] * (0.0283168466 * 86400)
+
+        # Drop the " _FLOW_cfs" column
+        data <- data %>% select(-` _FLOW_cfs`)
+
+        # Sort the data by date
+        data <- data[order(data$date), ]
+
+        # Get the filename for the output CSV file
+        filename <- paste0(station, "_FLOW", "_{dbkey}_cmd.csv")
+
+        # Save data to a CSV file
+        write.csv(data, file = paste0("{workspace}/", filename))
 
-
-
-    # Save data to a CSV file
-    write.csv(data, file = paste0("{workspace}/", filename))
+        # Print a message indicating the file has been saved
+        cat("CSV file", filename, "has been saved.\n")
 
-
-
-
-
-
+        # Add a delay between requests
+        Sys.sleep(1) # Wait for 1 second before the next iteration
+
+        # Return the station and dbkey to the python code
+        list(success = TRUE, station = station, dbkey = "{dbkey}")
+    }}
     """
 
     r(r_str)
+
+    # Call the R function to download the flow data
+    result = r.download_flow_data(workspace, dbkey, date_min, date_max)
+
+    # Check for failure
+    success = result.rx2("success")[0]
+
+    if not success:
+        return
+
+    # Get the station name for _reformat_flow_file()
+    station = result.rx2("station")[0]
+
+    # Reformat the flow data file to the expected layout
+    _reformat_flow_file(workspace, station, dbkey)
+
+    # Check if the station name contains a space
+    if " " in station:
+        # Replace space with underscore in the station name
+        station_previous = station
+        station = station.replace(" ", "_")
+
+        # Rename the file
+        os.rename(f"{workspace}/{station_previous}_FLOW_{dbkey}_cmd.csv", f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
 
     # column values are converted to cmd in R. This snippet makes sure column names are updated accordingly.
     file = glob(f'{workspace}/*FLOW*{dbkey}_cmd.csv')[0]
@@ -55,7 +112,44 @@ def get(
     df.columns = df.columns.astype(str).str.replace("_cfs", "_cmd")
     df.to_csv(file, index=False)
 
-
-
-
-
+
+def _reformat_flow_file(workspace:str, station: str, dbkey: str):
+    '''
+    Reformat the flow data file to the expected layout.
+    Converts the format of the dates in the file to 'YYYY-MM-DD' then sorts the data by date.
+    Reads and writes to a .CSV file.
+
+    Args:
+        workspace (str): The path to the workspace directory.
+        station (str): The station name.
+        dbkey (str): The dbkey for the station.
+
+    Returns:
+        None
+    '''
+    # Read in the data
+    df = pd.read_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+
+    # Grab only the columns we need
+    df = df[['date', f'{station}_FLOW_cfs']]
+
+    # Convert date column to datetime
+    df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y')
+
+    # Sort the data by date
+    df.sort_values('date', inplace=True)
+
+    # Renumber the index
+    df.reset_index(drop=True, inplace=True)
+
+    # Drop rows that are missing values for both the date and value columns
+    df = df.drop(df[(df['date'].isna()) & (df[f'{station}_FLOW_cfs'].isna())].index)
+
+    # Write the updated data back to the file
+    df.to_csv(f"{workspace}/{station}_FLOW_{dbkey}_cmd.csv")
+
+
+if __name__ == "__main__":
+    workspace = sys.argv[1].rstrip("/")
+    dbkey = sys.argv[2]
+    get(workspace, dbkey)
```
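On the Python side, `get` now reads the R function's return value through rpy2: `rx2("name")` extracts a named element from an R list, and indexing with `[0]` unwraps the length-one R vector into a Python scalar. A self-contained sketch of that round trip (the R function here is a toy stand-in, not the package's `download_flow_data`):

```python
from rpy2.robjects import r

# Toy R function returning a named list, shaped like download_flow_data's result
r('toy <- function() list(success = TRUE, station = "S129 PMP", dbkey = "15642")')

result = r["toy"]()

# rx2("name") pulls a named list element; [0] unwraps the 1-element vector
success = result.rx2("success")[0]   # True
station = result.rx2("station")[0]   # 'S129 PMP'

if success and " " in station:
    station = station.replace(" ", "_")  # mirrors the space-to-underscore rename

print(success, station)              # True S129_PMP
```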