cwms-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cwms_cli-0.1.1.dist-info/METADATA +40 -0
- cwms_cli-0.1.1.dist-info/RECORD +41 -0
- cwms_cli-0.1.1.dist-info/WHEEL +4 -0
- cwms_cli-0.1.1.dist-info/entry_points.txt +3 -0
- cwms_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- cwmscli/__init__.py +12 -0
- cwmscli/__main__.py +15 -0
- cwmscli/callbacks/__init__.py +18 -0
- cwmscli/commands/blob.py +439 -0
- cwmscli/commands/commands_cwms.py +227 -0
- cwmscli/commands/csv2cwms/.gitignore +3 -0
- cwmscli/commands/csv2cwms/README.md +51 -0
- cwmscli/commands/csv2cwms/__init__.py +5 -0
- cwmscli/commands/csv2cwms/__main__.py +265 -0
- cwmscli/commands/csv2cwms/examples/complete_config.json +19 -0
- cwmscli/commands/csv2cwms/examples/hourly.json +243 -0
- cwmscli/commands/csv2cwms/examples/minutes.json +315 -0
- cwmscli/commands/csv2cwms/tests/__init__.py +0 -0
- cwmscli/commands/csv2cwms/tests/data/.gitignore +1 -0
- cwmscli/commands/csv2cwms/tests/data/expected_brok_output.json +278 -0
- cwmscli/commands/csv2cwms/tests/data/sample_brok.csv +9 -0
- cwmscli/commands/csv2cwms/tests/data/sample_config.json +45 -0
- cwmscli/commands/csv2cwms/tests/skip_test_integration_pipeline.py +35 -0
- cwmscli/commands/csv2cwms/tests/test_dateutils.py +68 -0
- cwmscli/commands/csv2cwms/tests/test_expressions.py +49 -0
- cwmscli/commands/csv2cwms/tests/test_fileio.py +43 -0
- cwmscli/commands/csv2cwms/utils/__init__.py +5 -0
- cwmscli/commands/csv2cwms/utils/dateutils.py +105 -0
- cwmscli/commands/csv2cwms/utils/expression.py +39 -0
- cwmscli/commands/csv2cwms/utils/fileio.py +26 -0
- cwmscli/commands/csv2cwms/utils/logging.py +80 -0
- cwmscli/commands/csv2cwms/utils/terminal.py +45 -0
- cwmscli/commands/shef_critfile_import.py +146 -0
- cwmscli/requirements.py +25 -0
- cwmscli/usgs/__init__.py +161 -0
- cwmscli/usgs/getUSGS_ratings_cda.py +346 -0
- cwmscli/usgs/getusgs_cda.py +345 -0
- cwmscli/usgs/getusgs_measurements_cda.py +961 -0
- cwmscli/usgs/rating_ini_file_import.py +130 -0
- cwmscli/utils/__init__.py +68 -0
- cwmscli/utils/deps.py +102 -0
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
from json import loads
|
|
5
|
+
|
|
6
|
+
import cwms
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import requests
|
|
10
|
+
from dataretrieval import nwis
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def getusgs_rating_cda(api_root, office_id, days_back, api_key):
    """Fetch ratings recently updated by the USGS and store them through CDA.

    Args:
        api_root: Root URL of the CDA instance used to initialise the session.
        office_id: Office whose rating specifications are examined.
        days_back: Number of days to look back for USGS rating updates.
        api_key: Bare API key; the ``"apikey "`` prefix is added here.
    """
    cwms.api.init_session(api_root=api_root, api_key="apikey " + api_key)
    logging.info(f"CDA connection: {api_root}")
    logging.info(
        f"Updated Ratings will be obtained from the USGS for the past {days_back} days"
    )
    execution_date = datetime.now()
    logging.info(f"Execution date {execution_date}")

    logging.info("Getting Rating Specification information from CWMS Database")
    specs = get_rating_ids_from_specs(office_id)
    aliased_specs = get_location_aliases(
        specs, "USGS Station Number", "Agency Aliases", "CWMS", None, None
    )

    # Specs with no effective dates have no stored curve yet (new specs); they
    # are set aside here and appended again after the merge below.
    has_curve = aliased_specs["effective-dates"].notna()
    specs_without_curve = aliased_specs[~has_curve]
    specs_with_curve = aliased_specs[has_curve]

    logging.info(f"Getting list of ratings updated by USGS in past {days_back} days")
    usgs_updates = get_usgs_updated_ratings(days_back * 24)

    join_keys = ["USGS_St_Num", "rating-type"]
    updated_ratings = pd.merge(
        specs_with_curve,
        usgs_updates,
        how="inner",
        left_on=join_keys,
        right_on=join_keys,
    )

    def _to_timestamps(dates):
        return [pd.to_datetime(d) for d in dates]

    updated_ratings.loc[:, "effective-dates"] = updated_ratings[
        "effective-dates"
    ].apply(_to_timestamps)
    # The newest effective date stored in CWMS is compared against the USGS one.
    updated_ratings.loc[:, "cwms_max_effective_date"] = updated_ratings[
        "effective-dates"
    ].apply(max)

    if not specs_without_curve.empty:
        updated_ratings = pd.concat(
            [updated_ratings, specs_without_curve], ignore_index=True
        )

    cwms_write_ratings(updated_ratings)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_rating_ids_from_specs(office_id):
    """Return the office's rating specs that are flagged for USGS auto-update.

    A spec is kept when its description contains ``USGS-EXSA``, ``USGS-CORR``
    or ``USGS-BASE`` and both its ``active`` and ``auto-update`` columns are
    truthy. The matched type is written to a ``rating-type`` column.

    Args:
        office_id: Office whose rating specifications are requested.

    Returns:
        The filtered rating-spec DataFrame.

    Raises:
        SystemExit: When no rating specifications are returned for the office.
    """
    usgs_types = ["EXSA", "CORR", "BASE"]
    specs = cwms.get_rating_specs(office_id=office_id).df
    if "effective-dates" not in specs.columns:
        specs["effective-dates"] = np.nan

    # Nothing to process without any specs.
    if specs.empty:
        logging.warning(f"No rating specifications found for office {office_id}")
        sys.exit()

    specs = specs.dropna(subset=["description"])
    for usgs_type in usgs_types:
        tagged = specs["description"].str.contains(f"USGS-{usgs_type}")
        specs.loc[tagged, "rating-type"] = usgs_type

    keep = (
        (specs["rating-type"].isin(usgs_types))
        & (specs["active"])
        & (specs["auto-update"])
    )
    return specs[keep]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def get_location_aliases(
    df, loc_group_id, category_id, office_id, category_office_id, group_office_id
):
    """Attach location-group aliases (USGS station numbers) to ``df``.

    Args:
        df: DataFrame with ``location-id`` and ``office-id`` columns.
        loc_group_id: Location group to read aliases from.
        category_id: Category that owns the location group.
        office_id: Office passed to the location-group request.
        category_office_id: Category office passed to the request.
        group_office_id: Group office passed to the request.

    Returns:
        Inner join of ``df`` with the aliased locations. The alias is exposed
        as ``USGS_St_Num`` (left-padded with zeros to eight characters) and the
        group attribute as ``Loc_attribute``.
    """
    # NOTE: the CDA location-group endpoint has an error with category and
    # group office ids; revisit these arguments once that is fixed.
    group_members = cwms.get_location_group(
        loc_group_id=loc_group_id,
        category_id=category_id,
        office_id=office_id,
        category_office_id=category_office_id,
        group_office_id=group_office_id,
    ).df

    has_alias = group_members["alias-id"].notnull()
    aliased = group_members[has_alias].rename(
        columns={"alias-id": "USGS_St_Num", "attribute": "Loc_attribute"}
    )
    aliased.USGS_St_Num = aliased.USGS_St_Num.str.rjust(8, "0")

    return pd.merge(df, aliased, how="inner", on=["location-id", "office-id"])
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_usgs_updated_ratings(period):
    """List the ratings the USGS reports as updated within the lookback window.

    Requests the NWIS ``get_ratings`` listing in RDB format and keeps only the
    tab-separated data rows, which start with ``USGS``.

    Args:
        period: Value sent as the ``period`` query parameter. The caller in
            this file passes ``days_back * 24``, so presumably hours.

    Returns:
        DataFrame with columns ``org``, ``USGS_St_Num``, ``rating-type``
        (upper-cased), ``date_updated`` and ``url``. It is empty, but carries
        the same columns, when the USGS lists no updated ratings.

    Raises:
        requests.HTTPError: When the USGS service answers with an error status.
    """
    base_url = "https://nwis.waterdata.usgs.gov/nwisweb/get_ratings"
    column_names = ["org", "USGS_St_Num", "rating-type", "date_updated", "url"]

    query_dict = {"period": period, "format": "rdb"}

    r = requests.get(base_url, params=query_dict)
    # Fail with the HTTP error itself rather than a confusing parse error on
    # an error page further down.
    r.raise_for_status()

    temp = pd.DataFrame(r.text.split("\n"))
    temp = temp[temp[0].str.startswith("USGS")]
    if temp.empty:
        # Splitting zero rows yields zero columns, so naming five columns would
        # raise; hand back an empty frame the caller can still merge on.
        logging.info("No updated ratings were listed by the USGS for the period")
        return pd.DataFrame(columns=column_names)

    updated_ratings = temp[0].str.split("\t", expand=True)
    updated_ratings.columns = column_names
    updated_ratings["rating-type"] = updated_ratings["rating-type"].str.upper()
    return updated_ratings
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def convert_tz(tz: str):
    """Map a time-zone abbreviation to a time-zone database identifier.

    Args:
        tz: Abbreviation such as ``"EST"`` or ``"PDT"``.

    Returns:
        The matching zone identifier, or ``tz`` unchanged when the
        abbreviation is not in the table.
    """
    zone_by_abbreviation = {
        "AST": "America/Halifax",
        "ADT": "America/Halifax",
        "EST": "US/Eastern",
        "EDT": "US/Eastern",
        "CST": "US/Central",
        "CDT": "US/Central",
        "MST": "US/Mountain",
        "MDT": "US/Mountain",
        "PST": "US/Pacific",
        "PDT": "US/Pacific",
        "AKST": "America/Anchorage",
        "AKDT": "America/Anchorage",
        "UTC": "UTC",
        "GMT": "UTC",
    }
    return zone_by_abbreviation.get(tz, tz)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_usgs_tz(data):
    """Read the station time zone from the header of a USGS rating file.

    Args:
        data: DataFrame whose column ``0`` holds the lines of the file.

    Returns:
        The ``TIME_ZONE=`` value of the first ``# //STATION AGENCY=`` line,
        passed through ``convert_tz``.
    """
    is_station_line = data[0].str.startswith("# //STATION AGENCY=")
    station_line = data[is_station_line].iloc[0, 0]
    # Take the token after TIME_ZONE= and drop the surrounding quotes.
    abbreviation = station_line.split("TIME_ZONE=")[1].split()[0].replace('"', "")
    return convert_tz(abbreviation)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_begin_with_date(data, str_starts):
    """Return the last all-digit ``BEGIN=`` value among matching header lines.

    Args:
        data: DataFrame whose column ``0`` holds the lines of the file.
        str_starts: Prefix, or tuple of prefixes, a line must start with.

    Returns:
        The ``BEGIN=`` token (quotes removed) of the last matching line whose
        token consists only of digits, or ``None`` when there is none.
    """
    found = None
    matching_lines = data[data[0].str.startswith(str_starts)][0]
    for text in matching_lines:
        token = text.split("BEGIN=")[1].split()[0].strip().replace('"', "")
        # Non-numeric placeholders are skipped; later numeric values win.
        if token.isdigit():
            found = token
    return found
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def get_usgs_effective_date(data, rating_type):
    """Derive the effective date of a USGS rating from its file header.

    The date is taken from the ``RATING SHIFTED`` line for ``EXSA``, the
    ``RATING_DATETIME BEGIN`` lines for ``BASE`` and the ``CORRn_PREV BEGIN``
    lines for ``CORR``. When none of those yields a date, the ``RETRIEVED:``
    line is used instead.

    Args:
        data: DataFrame whose column ``0`` holds the lines of the file.
        rating_type: ``"EXSA"``, ``"BASE"`` or ``"CORR"``.

    Returns:
        A pandas timestamp localised to the station time zone and floored to
        the minute.
    """

    def first_line_starting_with(prefix):
        return data[data[0].str.startswith(prefix)].iloc[0, 0]

    date_string = None
    if rating_type == "EXSA":
        shifted_value = first_line_starting_with("# //RATING SHIFTED=")
        shifted_value = shifted_value.split("=")[1].replace('"', "")
        date_string = shifted_value.split()[0]
    elif rating_type == "BASE":
        date_string = get_begin_with_date(data, "# //RATING_DATETIME BEGIN=")
    elif rating_type == "CORR":
        corr_prefixes = (
            "# //CORR1_PREV BEGIN=",
            "# //CORR2_PREV BEGIN=",
            "# //CORR3_PREV BEGIN=",
        )
        date_string = get_begin_with_date(data, corr_prefixes)

    if date_string is None:
        retrieved_line = first_line_starting_with("# //RETRIEVED:")
        date_string = retrieved_line.split("RETRIEVED: ")[1]

    station_zone = get_usgs_tz(data)
    return pd.to_datetime(date_string).tz_localize(station_zone).floor("Min")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def convert_usgs_rating_df(df, rating_type):
    """Reduce a USGS rating table to ``ind``/``dep`` columns.

    For ``CORR`` ratings only the first and last row of each ``CORR`` group
    are kept, ordered by ``INDEP``. ``INDEP`` becomes ``ind`` and either
    ``CORRINDEP`` or ``DEP`` becomes ``dep``.

    Args:
        df: Rating table as returned by the USGS rating request.
        rating_type: Rating type; ``"CORR"`` triggers the group reduction.

    Returns:
        A new DataFrame holding only the ``ind`` and ``dep`` columns.
    """
    if rating_type == "CORR":
        by_correction = df.groupby("CORR")
        group_ends = [by_correction.first(), by_correction.last()]
        df = pd.concat(group_ends, ignore_index=True, join="inner")
        df = df.sort_values(by=["INDEP"], ignore_index=True)

    column_map = {"INDEP": "ind", "CORRINDEP": "dep", "DEP": "dep"}
    renamed = df.rename(columns=column_map)
    return renamed[["ind", "dep"]].copy()
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def cwms_write_ratings(updated_ratings):
    """Download each listed USGS rating and store it in CWMS when it is new.

    For every row the rating is requested from the USGS, its effective date is
    read from the rating file header, and the rating is stored through CDA
    unless CWMS already holds that effective date. Failures are collected per
    category and reported in the summary logged at the end; a failing row
    never stops the loop.

    Args:
        updated_ratings: DataFrame with one row per rating to check. The
            columns read here are ``rating-id``, ``office-id``,
            ``USGS_St_Num``, ``rating-type``, ``cwms_max_effective_date``,
            ``auto-migrate-extension`` and ``auto-activate``.
    """
    storErr = []
    usgsapiErr = []
    usgsemptyErr = []
    usgseffectiveErr = []
    total_recs = len(updated_ratings.index)
    saved = 0
    saved_ratings = []
    same_effective = 0

    rating_units = {"EXSA": "ft;cfs", "BASE": "ft;cfs", "CORR": "ft;ft"}
    for _, row in updated_ratings.iterrows():
        rating_id = row["rating-id"]
        station = row["USGS_St_Num"]
        rating_type = row["rating-type"]

        logging.info(f"Getting data for rating ID = {rating_id}")
        logging.info(
            f"Getting data from USGS for USGS ID = {station}, Rating Type = {rating_type}"
        )
        try:
            usgs_rating, meta = nwis.get_ratings(
                site=station, file_type=str(rating_type).lower()
            )
            url = meta.url
        except Exception as error:
            usgsapiErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Error collecting rating data from USGS for --> {rating_id},{station}, {rating_type} USGS error = {error}"
            )
            continue

        if usgs_rating.empty:
            # Report the request URL so the empty response can be reproduced.
            logging.warning(
                f"Empty rating obtained from USGS for USGS ID = {station}, Rating Type = {rating_type}, url = {url}"
            )
            usgsemptyErr.append([rating_id, station, rating_type])
            continue

        # The parsed table carries no header metadata, so the raw file is
        # fetched again to read the effective date from its comment lines.
        try:
            response = requests.get(url)
            temp = pd.DataFrame(response.text.split("\n"))
            usgs_effective_date = get_usgs_effective_date(temp, rating_type)
        except Exception as error:
            usgseffectiveErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Error collecting effective date from USGS rating --> {rating_id},{station}, {rating_type} CDA error = {error}"
            )
            continue

        cwms_effective_date = row["cwms_max_effective_date"]
        logging.info(
            f"Effective dates: cwms = {cwms_effective_date}, usgs = {usgs_effective_date}"
        )
        # A stored date exactly one hour later than the USGS date is also
        # treated as the same rating.
        if (cwms_effective_date == usgs_effective_date) or (
            cwms_effective_date == (usgs_effective_date + timedelta(hours=1))
        ):
            logging.info("Effective dates are the same rating curve will not be saved")
            same_effective = same_effective + 1
            continue

        try:
            usgs_store_rating = convert_usgs_rating_df(usgs_rating, rating_type)

            if row["auto-migrate-extension"] and pd.notna(cwms_effective_date):
                # Reuse the currently stored rating as a template and replace
                # only its points, effective date and active flag.
                current_rating = cwms.get_ratings(
                    rating_id=rating_id,
                    office_id=row["office-id"],
                    begin=cwms_effective_date,
                    end=cwms_effective_date,
                    method="EAGER",
                    single_rating_df=True,
                )
                rating_json = current_rating.json
                points_json = loads(usgs_store_rating.to_json(orient="records"))
                simple_rating = rating_json["simple-rating"]
                simple_rating["rating-points"] = {"point": points_json}
                simple_rating["effective-date"] = usgs_effective_date.isoformat()
                del simple_rating["create-date"]
                simple_rating["active"] = row["auto-activate"]
            else:
                rating_json = cwms.rating_simple_df_to_json(
                    data=usgs_store_rating,
                    rating_id=rating_id,
                    office_id=row["office-id"],
                    units=rating_units[rating_type],
                    effective_date=usgs_effective_date,
                    active=row["auto-activate"],
                )
            cwms.update_ratings(data=rating_json, rating_id=rating_id)
            logging.info(
                f"SUCCESS Stored rating for rating id = {rating_id}, effective date = {usgs_effective_date}"
            )
            saved = saved + 1
            saved_ratings.append([rating_id, station, rating_type])
        except Exception as error:
            storErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Data could not be stored to CWMS database for --> {rating_id},{station}, {rating_type} CDA error = {error}"
            )

    logging.info(
        f"A total of {total_recs} ratings were updated by the USGS over the lookback period."
    )
    logging.info(
        f"Of those {total_recs} ratings {same_effective} were already stored in the CWMS database"
    )
    if len(saved_ratings) > 0:
        logging.info(
            f"A total of {saved} ratings were new and saved successfully to the database"
        )
        logging.info(
            f"Rating ids saved successfully to the database were: {saved_ratings}"
        )
    if len(usgsapiErr) > 0:
        logging.info(
            f"The following ratings errored out when accessing the USGS API: {usgsapiErr}"
        )
    if len(usgsemptyErr) > 0:
        logging.info(
            f"The following ratings had an empty rating curve returned from the usgs: {usgsemptyErr}"
        )
    if len(usgseffectiveErr) > 0:
        logging.info(
            f"The following ratings errored when trying to determine the effective date from the USGS: {usgseffectiveErr}"
        )
    if len(storErr) > 0:
        logging.info(
            f"The following ratings errored when trying to store to CDA: {storErr}"
        )
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime, timedelta
|
|
3
|
+
|
|
4
|
+
import cwms
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def getusgs_cda(api_root, office_id, days_back, api_key, backfill_tsids: list = None):
    """Fetch USGS instantaneous values and store them in CWMS through CDA.

    Args:
        api_root: Root URL of the CDA instance used to initialise the session.
        office_id: Office whose USGS time-series group is processed.
        days_back: Number of days before now at which the request window starts.
        api_key: Bare API key; the ``"apikey "`` prefix is added here.
        backfill_tsids: Optional list of time-series ids; when given, only
            those ids are processed.
    """
    api_key = "apikey " + api_key
    cwms.api.init_session(api_root=api_root, api_key=api_key)
    logging.info(f"CDA connection: {api_root}")
    logging.info(
        f"Data will be grabbed and stored from USGS for past {days_back} days for office: {office_id}"
    )
    execution_date = datetime.now()

    USGS_ts = get_CMWS_TS_Loc_Data(office_id)

    if backfill_tsids:
        USGS_ts = USGS_ts[USGS_ts["timeseries-id"].isin(backfill_tsids)]

    if len(USGS_ts) == 0:
        if backfill_tsids:
            logging.error(
                f"The following backload timeseries ids were not present in the USGS timeseries or Locations groups: {backfill_tsids}"
            )
        else:
            # This branch is reached only when nothing was found, so the
            # message has to say so.
            logging.error(
                "No USGS data was present in the timeseries or locations groups"
            )
        return

    # Unique USGS station numbers to send to the USGS API, split by whether
    # the time series names a USGS method id.
    no_method = USGS_ts["USGS_Method_TS"].isna()
    sites = USGS_ts[no_method].USGS_St_Num.unique()
    method_sites = USGS_ts[~no_method].USGS_St_Num.unique()
    logging.info(f"Execution date {execution_date}")

    startDT = execution_date - timedelta(days_back)

    # Airflow only looks at the last period during an execution run, so to
    # ensure the latest data is retrieved, add 2 hours to the end date.
    endDT = execution_date + timedelta(hours=2)

    logging.info(f"Grabing data from USGS between {startDT} and {endDT}")

    USGS_data = pd.DataFrame()
    USGS_data_method = pd.DataFrame()

    if len(sites) > 0:
        USGS_data = getUSGS_ts(sites, startDT, endDT)
    # Sites with a method id are retrieved from a separate database, reached
    # by passing 3 as the access level in the USGS API call.
    if len(method_sites) > 0:
        USGS_data_method = getUSGS_ts(method_sites, startDT, endDT, 3)

    CWMS_writeData(USGS_ts, USGS_data, USGS_data_method, days_back)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_USGS_params():
    """Build the table of default USGS parameter codes per CWMS parameter.

    Returns:
        DataFrame indexed by ``CWMS_PARAMETER`` with the columns
        ``USGS_PARAMETER``, ``USGS_Alias``, ``CWMS_FACTOR``, ``CWMS_UNIT``
        and ``CWMS_TYPE``.
    """
    header = [
        "USGS_PARAMETER",
        "USGS_Alias",
        "CWMS_PARAMETER",
        "CWMS_FACTOR",
        "CWMS_UNIT",
        "CWMS_TYPE",
    ]
    # Codes 00061 (Flow), 00062 (Elevation) and 62614 (Elev-Lake) are
    # deliberately left out of the defaults.
    rows = [
        ("00010", "Water Temp", "Temp-Water", 1, "C", "Inst"),
        ("00021", "Air Temp", "Temp-Air", 1, "F", "Inst"),
        ("00035", "Wind Speed", "Speed-Wind", 1, "mph", "Inst"),
        ("00036", "Wind Dir", "Dir-Wind", 1, "deg", "Inst"),
        ("00045", "Precip", "Precip-Inc", 1, "in", "Total"),
        ("00045", "Precip", "Precip", 1, "in", "Total"),
        ("00052", "RelHumidity", "%-Humidity", 1, "%", "Inst"),
        ("00060", "Flow", "Flow", 1, "cfs", "Inst"),
        ("00065", "Stage", "Stage", 1, "ft", "Inst"),
        ("00095", "Sp Cond", "Cond", 1, "umho/cm", "Inst"),
        ("00096", "Salinity", "Conc-Salinity", 0.001, "mg/l", "Inst"),
        ("72036", "Res Storage", "Stor", 1000, "ac-ft", "Inst"),
        ("62608", "Sol Rad", "Irrad-Solar", 1, "W/m2", "Inst"),
        ("63160", "Elev-NAVD88", "Elev", 1, "ft", "Inst"),
    ]
    table = pd.DataFrame(rows, columns=header)
    return table.set_index("CWMS_PARAMETER")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_CMWS_TS_Loc_Data(office):
    """Combine the USGS time-series group with USGS station aliases.

    Reads the ``USGS TS Data Acquisition`` time-series group and the
    ``USGS Station Number`` location group, keeps the rows of ``office`` and
    joins them so every time series carries its USGS station number and USGS
    parameter code.

    Args:
        office: Office id used both for the request and for row filtering.

    Returns:
        DataFrame with one row per time series, including ``USGS_St_Num``,
        ``USGS_Method_TS`` and ``USGS_PARAMETER`` columns.
    """
    USGS_Params = get_USGS_params()

    def find_usgsparam(attribute, param):
        # A positive group attribute is taken as the USGS code itself;
        # otherwise fall back to the default for the CWMS parameter.
        if attribute > 0:
            return str(attribute).split(".")[0]
        if param in USGS_Params.index:
            return USGS_Params.at[param, "USGS_PARAMETER"]
        return "Not Found"

    ts_group = cwms.get_timeseries_group(
        group_id="USGS TS Data Acquisition",
        category_id="Data Acquisition",
        office_id=office,
        category_office_id="CWMS",
        group_office_id="CWMS",
    ).df
    id_parts = ["location-id", "param", "type", "int", "dur", "ver"]
    ts_group[id_parts] = ts_group["timeseries-id"].str.split(".", expand=True)

    ts_group = ts_group[ts_group["office-id"] == office]
    ts_group["base-loc"] = ts_group["location-id"].str.split("-", expand=True)[0]
    if "alias-id" not in ts_group.columns:
        ts_group["alias-id"] = np.nan
    if "attribute" not in ts_group.columns:
        ts_group["attribute"] = np.nan
    ts_group = ts_group.rename(columns={"alias-id": "USGS_Method_TS"})

    # NOTE: CDA has an error with category_office_id and group_office_id, so
    # they are not passed here; fix once CDA is updated.
    loc_group = cwms.get_location_group(
        loc_group_id="USGS Station Number",
        category_id="Agency Aliases",
        office_id="CWMS",
    ).df.set_index("location-id")

    loc_group = loc_group[loc_group["office-id"] == office]
    if "attribute" not in loc_group.columns:
        loc_group["attribute"] = np.nan

    # Locations that have a USGS station number assigned.
    USGS_alias = loc_group[loc_group["alias-id"].notnull()].rename(
        columns={"alias-id": "USGS_St_Num", "attribute": "Loc_attribute"}
    )
    # Pad station numbers shorter than 8 characters with leading zeros.
    USGS_alias.USGS_St_Num = USGS_alias.USGS_St_Num.str.rjust(8, "0")

    # Left join on location id and office, so time series without a direct
    # alias are kept for the base-location lookup below.
    USGS_ts = pd.merge(
        ts_group, USGS_alias, how="left", on=["location-id", "office-id"]
    )
    if USGS_ts.USGS_St_Num.isnull().any():
        unmatched = USGS_ts[USGS_ts.USGS_St_Num.isnull()].drop(
            ["USGS_St_Num", "Loc_attribute"], axis=1
        )
        # Retry the unmatched rows using the base location's station number.
        matched_by_base = pd.merge(
            unmatched,
            USGS_alias,
            left_on=["base-loc", "office-id"],
            right_on=["location-id", "office-id"],
        )
        USGS_ts = pd.concat(
            [USGS_ts[USGS_ts["USGS_St_Num"].notnull()], matched_by_base], axis=0
        )

    # Resolve the USGS parameter code per row and pad it to 5 characters.
    USGS_ts.attribute = USGS_ts.apply(
        lambda x: find_usgsparam(x.attribute, x.param), axis=1
    ).astype("string")
    USGS_ts.attribute = USGS_ts.attribute.str.rjust(5, "0")
    USGS_ts = USGS_ts.rename(columns={"attribute": "USGS_PARAMETER"})

    logging.info("CWMS TS Groups and Location Data Obtained")
    return USGS_ts
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def getUSGS_ts(sites, startDT, endDT, access=None):
    """Request instantaneous values from the USGS for a set of sites.

    Args:
        sites: Iterable of USGS station numbers (strings).
        startDT: Start of the request window; sent as ``startDT`` in ISO format.
        endDT: End of the request window; sent as ``endDT`` in ISO format.
        access: Optional value for the ``access`` query parameter.

    Returns:
        DataFrame with one row per returned time series, indexed by
        ``Id.param`` (``<station>.<parameter code>``, taken from the series
        name). It is empty when the USGS returns no time series.

    Raises:
        requests.HTTPError: When the USGS service answers with an error status.
    """
    base_url = "https://waterservices.usgs.gov/nwis/iv/?"

    query_dict = {
        "format": "json",
        "sites": ",".join(sites),
        "startDT": startDT.isoformat(),
        "endDT": endDT.isoformat(),
        "access": access,
        "siteStatus": "active",
    }

    response = requests.get(base_url, params=query_dict)
    # Surface HTTP errors directly instead of a JSON decode error on the body.
    response.raise_for_status()
    time_series = response.json()["value"]["timeSeries"]

    if not time_series:
        # An empty list gives a frame without a "name" column, so building the
        # index from it would fail; return an empty frame with the same index.
        logging.warning("No time series were returned by the USGS API")
        return pd.DataFrame(index=pd.Index([], name="Id.param"))

    USGS_data = pd.DataFrame(time_series)
    name_parts = USGS_data["name"].str.split(":")
    USGS_data["Id.param"] = name_parts.str[1] + "." + name_parts.str[2]
    USGS_data = USGS_data.set_index("Id.param")

    logging.info("Data obtained from USGS")
    return USGS_data
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def CWMS_writeData(USGS_ts, USGS_data, USGS_data_method, days_back):
    """Store the retrieved USGS values for every configured time series.

    Args:
        USGS_ts: One row per time series, with ``timeseries-id``,
            ``office-id``, ``USGS_St_Num``, ``USGS_PARAMETER`` and
            ``USGS_Method_TS`` columns.
        USGS_data: USGS response indexed by ``<station>.<parameter>``, used
            for rows without a method id.
        USGS_data_method: USGS response for the method sites, same index.
        days_back: Lookback in days; 365 or more switches the store call to
            its chunked form.
    """
    # Failure buckets reported in the summary at the end:
    #   noData   -> series was in the USGS response but held no usable values
    #   NotinAPI -> station/parameter was not in the USGS response
    #   storErr  -> storing to the CWMS database raised
    #   mult_ids -> several USGS series matched and no valid method id selects one
    noData, NotinAPI, storErr, mult_ids = [], [], [], []
    total_recs = len(USGS_ts.index)
    saved = 0

    for _, row in USGS_ts.iterrows():
        ts_id = row["timeseries-id"]
        USGS_Id_param = f"{row.USGS_St_Num}.{row.USGS_PARAMETER}"
        logging.info(
            f"Attempting to write values for ts_id --> {ts_id},{USGS_Id_param}"
        )
        values = pd.DataFrame()

        if (USGS_Id_param in USGS_data.index) and pd.isna(row.USGS_Method_TS):
            usgs_record = USGS_data.loc[USGS_Id_param]
        elif USGS_Id_param in USGS_data_method.index:
            usgs_record = USGS_data_method.loc[USGS_Id_param]
        else:
            NotinAPI.append([ts_id, USGS_Id_param])
            logging.warning(
                f"FAIL USGS ID and parameter were not present in USGS API for--> {ts_id},{USGS_Id_param}"
            )
            continue

        try:
            # Each entry of "values" is one USGS series for this parameter.
            values_df = pd.DataFrame(usgs_record["values"])
            if values_df.shape[0] <= 1:
                values = pd.DataFrame(values_df.loc[0, "value"])
            elif pd.isna(row.USGS_Method_TS):
                logging.warning(
                    f"FAIL there are multiple time series for {USGS_Id_param} need to specify the USGS method TSID for {ts_id}"
                )
                mult_ids.append([ts_id, USGS_Id_param])
            else:
                temp = values_df.method.apply(pd.Series)
                temp = values_df.join(pd.json_normalize(temp.pop(0)))
                try:
                    values = pd.DataFrame(
                        temp.query(f"methodID == {row.USGS_Method_TS}")[
                            "value"
                        ].item()
                    )
                except Exception as error:
                    mult_ids.append([ts_id, USGS_Id_param])
                    logging.error(
                        f"The USGS method ID defined could not be found from the USGS API check that it is correct for --> {ts_id},{USGS_Id_param},{row.USGS_Method_TS}"
                    )

            if values.empty:
                noData.append([ts_id, USGS_Id_param])
                logging.warning(
                    f"FAIL No Data obtained from USGS for ts_id: Values array is empty in USGS API output--> {ts_id},{USGS_Id_param}"
                )
                continue

            # Drop rows holding the no-data marker (e.g. -999999).
            nodata_val = usgs_record["variable"]["noDataValue"]
            values = values[values.value != str(int(nodata_val))]
            if values.empty:
                noData.append([ts_id, USGS_Id_param])
                logging.warning(
                    f"FAIL No Data obtained from USGS for ts_id: Values array is empty after removing -999999 values--> {ts_id},{USGS_Id_param}"
                )
                continue

            # Shape the frame to the column names cwms-python expects.
            values = values.reindex(columns=["dateTime", "value", "qualifiers"])
            values = values.rename(
                columns={"dateTime": "date-time", "qualifiers": "quality-code"}
            )
            units = usgs_record["variable"]["unit"]["unitCode"]
            office = row["office-id"]
            values["quality-code"] = 0

            try:
                data = cwms.timeseries_df_to_json(
                    data=values, ts_id=ts_id, units=units, office_id=office
                )
                if days_back < 365:
                    cwms.store_timeseries(data)
                else:
                    cwms.store_timeseries(
                        data, max_workers=30, chunk_size=30 * 24 * 4
                    )
                logging.info(
                    f"SUCCESS Data stored in CWMS database for --> {ts_id},{USGS_Id_param}"
                )
                saved = saved + 1
            except Exception as error:
                storErr.append([ts_id, USGS_Id_param, error])
                logging.error(
                    f"FAIL Data could not be stored to CWMS database for --> {ts_id},{USGS_Id_param} CDA error = {error}"
                )
        except Exception as error:
            logging.error(
                f"FAIL Unspecified Error when trying to save USGS data --> {ts_id},{USGS_Id_param} error = {error}"
            )

    logging.info(
        f"A total of {saved} records were successfully saved out of {total_recs}"
    )
    logging.info(
        f"The following ts_ids errored due to no data received from USGS for the time period requested: {noData}"
    )
    logging.info(
        f"The following ts_ids errored because the USGS ID and parameter were not found in USGS API {NotinAPI}"
    )
    logging.info(f"The following ts_ids errored when storing into CDA {storErr}")
    logging.info(
        f"The following ts_ids errored because multiple method TSID were present for the USGS station. A USGS method TSID needs to be defined in the time series group in CWMS or an incorrect TSID is defined. {mult_ids}"
    )
|