cwms-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. cwms_cli-0.1.1.dist-info/METADATA +40 -0
  2. cwms_cli-0.1.1.dist-info/RECORD +41 -0
  3. cwms_cli-0.1.1.dist-info/WHEEL +4 -0
  4. cwms_cli-0.1.1.dist-info/entry_points.txt +3 -0
  5. cwms_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  6. cwmscli/__init__.py +12 -0
  7. cwmscli/__main__.py +15 -0
  8. cwmscli/callbacks/__init__.py +18 -0
  9. cwmscli/commands/blob.py +439 -0
  10. cwmscli/commands/commands_cwms.py +227 -0
  11. cwmscli/commands/csv2cwms/.gitignore +3 -0
  12. cwmscli/commands/csv2cwms/README.md +51 -0
  13. cwmscli/commands/csv2cwms/__init__.py +5 -0
  14. cwmscli/commands/csv2cwms/__main__.py +265 -0
  15. cwmscli/commands/csv2cwms/examples/complete_config.json +19 -0
  16. cwmscli/commands/csv2cwms/examples/hourly.json +243 -0
  17. cwmscli/commands/csv2cwms/examples/minutes.json +315 -0
  18. cwmscli/commands/csv2cwms/tests/__init__.py +0 -0
  19. cwmscli/commands/csv2cwms/tests/data/.gitignore +1 -0
  20. cwmscli/commands/csv2cwms/tests/data/expected_brok_output.json +278 -0
  21. cwmscli/commands/csv2cwms/tests/data/sample_brok.csv +9 -0
  22. cwmscli/commands/csv2cwms/tests/data/sample_config.json +45 -0
  23. cwmscli/commands/csv2cwms/tests/skip_test_integration_pipeline.py +35 -0
  24. cwmscli/commands/csv2cwms/tests/test_dateutils.py +68 -0
  25. cwmscli/commands/csv2cwms/tests/test_expressions.py +49 -0
  26. cwmscli/commands/csv2cwms/tests/test_fileio.py +43 -0
  27. cwmscli/commands/csv2cwms/utils/__init__.py +5 -0
  28. cwmscli/commands/csv2cwms/utils/dateutils.py +105 -0
  29. cwmscli/commands/csv2cwms/utils/expression.py +39 -0
  30. cwmscli/commands/csv2cwms/utils/fileio.py +26 -0
  31. cwmscli/commands/csv2cwms/utils/logging.py +80 -0
  32. cwmscli/commands/csv2cwms/utils/terminal.py +45 -0
  33. cwmscli/commands/shef_critfile_import.py +146 -0
  34. cwmscli/requirements.py +25 -0
  35. cwmscli/usgs/__init__.py +161 -0
  36. cwmscli/usgs/getUSGS_ratings_cda.py +346 -0
  37. cwmscli/usgs/getusgs_cda.py +345 -0
  38. cwmscli/usgs/getusgs_measurements_cda.py +961 -0
  39. cwmscli/usgs/rating_ini_file_import.py +130 -0
  40. cwmscli/utils/__init__.py +68 -0
  41. cwmscli/utils/deps.py +102 -0
@@ -0,0 +1,346 @@
1
+ import logging
2
+ import sys
3
+ from datetime import datetime, timedelta
4
+ from json import loads
5
+
6
+ import cwms
7
+ import numpy as np
8
+ import pandas as pd
9
+ import requests
10
+ from dataretrieval import nwis
11
+
12
+
13
def getusgs_rating_cda(api_root, office_id, days_back, api_key):
    """Pull ratings recently updated by the USGS and store them through CDA.

    The function opens a CDA session, collects the office's rating specs that
    are linked to a USGS station number, keeps the ones the USGS reports as
    updated within the look-back window (plus every spec that has no
    effective dates yet), and passes the result to ``cwms_write_ratings``.

    Args:
        api_root: CDA root URL handed to ``cwms.api.init_session``.
        office_id: office whose rating specs are processed.
        days_back: look-back window in days; converted to hours for the
            USGS "updated ratings" query.
        api_key: raw API key; the ``"apikey "`` prefix is added here.
    """
    cwms.api.init_session(api_root=api_root, api_key="apikey " + api_key)
    logging.info(f"CDA connection: {api_root}")
    logging.info(
        f"Updated Ratings will be obtained from the USGS for the past {days_back} days"
    )
    run_started = datetime.now()
    logging.info(f"Execution date {run_started}")

    logging.info("Getting Rating Specification information from CWMS Database")
    specs_with_station = get_location_aliases(
        get_rating_ids_from_specs(office_id),
        "USGS Station Number",
        "Agency Aliases",
        "CWMS",
        None,
        None,
    )

    # Specs without any effective date have no stored curve yet (new specs);
    # they bypass the "recently updated" filter and are appended again below.
    has_curve = specs_with_station["effective-dates"].notna()
    specs_without_curve = specs_with_station[~has_curve]
    specs_with_curve = specs_with_station[has_curve]

    logging.info(f"Getting list of ratings updated by USGS in past {days_back} days")
    usgs_changes = get_usgs_updated_ratings(days_back * 24)

    join_keys = ["USGS_St_Num", "rating-type"]
    updated_ratings = pd.merge(
        specs_with_curve,
        usgs_changes,
        how="inner",
        left_on=join_keys,
        right_on=join_keys,
    )

    def _as_timestamps(dates):
        return [pd.to_datetime(d) for d in dates]

    # Parse every stored effective date, then remember the most recent one
    # per spec for the comparison done while writing.
    updated_ratings.loc[:, "effective-dates"] = updated_ratings[
        "effective-dates"
    ].apply(_as_timestamps)
    updated_ratings.loc[:, "cwms_max_effective_date"] = updated_ratings[
        "effective-dates"
    ].apply(max)

    if not specs_without_curve.empty:
        updated_ratings = pd.concat(
            [updated_ratings, specs_without_curve], ignore_index=True
        )

    cwms_write_ratings(updated_ratings)
58
+
59
+
60
def get_rating_ids_from_specs(office_id):
    """Return the office's active, auto-updating rating specs tagged for USGS.

    A spec is tagged when its description contains ``USGS-EXSA``,
    ``USGS-CORR`` or ``USGS-BASE``; the matching code is written to a
    ``rating-type`` column. If a description mentions several codes, the one
    checked last wins. Specs without a description are dropped.

    Exits the process via ``sys.exit()`` when the office has no rating specs.
    """
    usgs_types = ["EXSA", "CORR", "BASE"]
    specs = cwms.get_rating_specs(office_id=office_id).df
    if "effective-dates" not in specs.columns:
        specs["effective-dates"] = np.nan

    # Nothing to do for an office without any rating specification.
    if specs.empty:
        logging.warning(f"No rating specifications found for office {office_id}")
        sys.exit()

    specs = specs.dropna(subset=["description"])
    for usgs_type in usgs_types:
        mentions_type = specs["description"].str.contains(f"USGS-{usgs_type}")
        specs.loc[mentions_type, "rating-type"] = usgs_type

    is_usgs_type = specs["rating-type"].isin(usgs_types)
    keep = is_usgs_type & specs["active"] & specs["auto-update"]
    return specs[keep]
81
+
82
+
83
def get_location_aliases(
    df, loc_group_id, category_id, office_id, category_office_id, group_office_id
):
    """Attach USGS station numbers from a location group to the rows of ``df``.

    Fetches the location group through CDA, keeps the members that have an
    alias, left-pads the alias to 8 characters with zeros, and inner-joins
    the result to ``df`` on ``location-id`` and ``office-id``. The alias is
    returned as ``USGS_St_Num`` and the group attribute as ``Loc_attribute``.
    """
    # CDA get location group endpoint has an error with category and group
    # office ids. Needs updating once that error is fixed.
    group = cwms.get_location_group(
        loc_group_id=loc_group_id,
        category_id=category_id,
        office_id=office_id,
        category_office_id=category_office_id,
        group_office_id=group_office_id,
    )
    members = group.df
    aliased = members[members["alias-id"].notnull()].rename(
        columns={"alias-id": "USGS_St_Num", "attribute": "Loc_attribute"}
    )
    aliased["USGS_St_Num"] = aliased["USGS_St_Num"].str.rjust(8, "0")
    return pd.merge(df, aliased, how="inner", on=["location-id", "office-id"])
103
+
104
+
105
def get_usgs_updated_ratings(period):
    """
    List the ratings the USGS reports as updated within a look-back period.

    Queries the NWIS ``get_ratings`` service in RDB format and keeps the
    tab-separated rows that start with ``USGS``.

    Args:
        period: look-back period sent as the ``period`` query parameter
            (the caller passes hours).

    Returns:
        DataFrame with the columns ``org``, ``USGS_St_Num``, ``rating-type``
        (upper-cased), ``date_updated`` and ``url``. The frame is empty, but
        still carries these columns, when no rating was updated.

    Raises:
        requests.HTTPError: when the service answers with an error status.
        requests.Timeout: when the service does not respond in time.
    """
    base_url = "https://nwis.waterdata.usgs.gov/nwisweb/get_ratings"
    column_names = ["org", "USGS_St_Num", "rating-type", "date_updated", "url"]
    query_dict = {"period": period, "format": "rdb"}

    # Without a timeout a stalled connection would block the run forever.
    r = requests.get(base_url, params=query_dict, timeout=120)
    r.raise_for_status()

    temp = pd.DataFrame(r.text.split("\n"))
    temp = temp[temp[0].str.startswith("USGS")]
    if temp.empty:
        # Splitting an empty selection yields a frame without columns, so the
        # column assignment below would raise a length-mismatch error.
        logging.info("USGS reported no updated ratings for the requested period")
        return pd.DataFrame(columns=column_names)

    updated_ratings = temp[0].str.split("\t", expand=True)
    updated_ratings.columns = column_names
    updated_ratings["rating-type"] = updated_ratings["rating-type"].str.upper()
    return updated_ratings
127
+
128
+
129
def convert_tz(tz: str):
    """Map a time-zone abbreviation to a time-zone name.

    Known standard/daylight abbreviations (AST/ADT, EST/EDT, CST/CDT,
    MST/MDT, PST/PDT, AKST/AKDT, UTC/GMT) are translated; any other value is
    returned unchanged.
    """
    zone_by_abbreviation = {
        "AST": "America/Halifax",
        "ADT": "America/Halifax",
        "EST": "US/Eastern",
        "EDT": "US/Eastern",
        "CST": "US/Central",
        "CDT": "US/Central",
        "MST": "US/Mountain",
        "MDT": "US/Mountain",
        "PST": "US/Pacific",
        "PDT": "US/Pacific",
        "AKST": "America/Anchorage",
        "AKDT": "America/Anchorage",
        "UTC": "UTC",
        "GMT": "UTC",
    }
    return zone_by_abbreviation.get(tz, tz)
147
+
148
+
149
def get_usgs_tz(data):
    """Read the station time zone from the header of a USGS rating file.

    ``data`` is a one-column DataFrame holding the file's lines. The token
    following ``TIME_ZONE=`` on the first ``# //STATION AGENCY=`` line is
    stripped of quotes and translated with ``convert_tz``.
    """
    is_station_line = data[0].str.startswith("# //STATION AGENCY=")
    station_line = data[is_station_line].iloc[0, 0]
    after_key = station_line.split("TIME_ZONE=")[1]
    abbreviation = after_key.split()[0].replace('"', "")
    return convert_tz(abbreviation)
154
+
155
+
156
def get_begin_with_date(data, str_starts):
    """Return the last all-digit ``BEGIN=`` value among the matching lines.

    ``data`` is a one-column DataFrame of text lines. Lines starting with
    ``str_starts`` are scanned in order; for each one the token after
    ``BEGIN=`` is stripped of quotes and kept only if it consists of digits.
    Returns ``None`` when no line yields such a value.
    """
    latest = None
    matching = data[data[0].str.startswith(str_starts)]
    for text in matching[0]:
        candidate = text.split("BEGIN=")[1].split()[0].strip().replace('"', "")
        if candidate.isdigit():
            latest = candidate
    return latest
164
+
165
+
166
def get_usgs_effective_date(data, rating_type):
    """Derive the effective date of a USGS rating from its RDB header.

    ``data`` is a one-column DataFrame holding the file's lines. The source
    of the date depends on ``rating_type``:

    * ``EXSA``: first token after ``=`` on the ``# //RATING SHIFTED=`` line.
    * ``BASE``: last numeric ``BEGIN=`` of the ``# //RATING_DATETIME`` lines.
    * ``CORR``: last numeric ``BEGIN=`` of the ``# //CORRn_PREV`` lines.

    When none of these yields a value, the text after ``RETRIEVED: `` is
    used. The result is localized to the station time zone and floored to
    the minute.
    """

    def first_line_starting(prefix):
        return data[data[0].str.startswith(prefix)].iloc[0, 0]

    stamp = None
    if rating_type == "EXSA":
        shifted = first_line_starting("# //RATING SHIFTED=")
        stamp = shifted.split("=")[1].replace('"', "").split()[0]
    elif rating_type == "BASE":
        stamp = get_begin_with_date(data, "# //RATING_DATETIME BEGIN=")
    elif rating_type == "CORR":
        correction_prefixes = (
            "# //CORR1_PREV BEGIN=",
            "# //CORR2_PREV BEGIN=",
            "# //CORR3_PREV BEGIN=",
        )
        stamp = get_begin_with_date(data, correction_prefixes)

    if stamp is None:
        stamp = first_line_starting("# //RETRIEVED:").split("RETRIEVED: ")[1]

    zone = get_usgs_tz(data)
    return pd.to_datetime(stamp).tz_localize(zone).floor("Min")
190
+
191
+
192
def convert_usgs_rating_df(df, rating_type):
    """Reduce a USGS rating table to the ``ind``/``dep`` columns.

    For ``CORR`` ratings the rows are grouped by the ``CORR`` value and only
    the first and last row of every group are kept, sorted by ``INDEP``.
    ``INDEP`` is returned as ``ind``; ``CORRINDEP`` or ``DEP`` as ``dep``.
    """
    points = df
    if rating_type == "CORR":
        by_correction = points.groupby("CORR")
        endpoints = [by_correction.first(), by_correction.last()]
        points = pd.concat(endpoints, ignore_index=True, join="inner")
        points = points.sort_values(by=["INDEP"], ignore_index=True)
    renamed = points.rename(
        columns={"INDEP": "ind", "CORRINDEP": "dep", "DEP": "dep"}
    )
    return renamed[["ind", "dep"]].copy()
200
+
201
+
202
def cwms_write_ratings(updated_ratings):
    """Download each listed USGS rating and store it in CWMS when it is new.

    For every row the rating table is fetched with ``nwis.get_ratings``, its
    effective date is parsed from the raw RDB file, and the rating is stored
    through CDA unless CWMS already holds that effective date. Failures are
    collected per stage and summarised in the log; one bad rating never
    stops the loop.

    Args:
        updated_ratings: DataFrame with one row per rating spec. The columns
            read here are ``rating-id``, ``office-id``, ``USGS_St_Num``,
            ``rating-type``, ``cwms_max_effective_date``,
            ``auto-migrate-extension`` and ``auto-activate``.
    """
    storErr = []
    usgsapiErr = []
    usgsemptyErr = []
    usgseffectiveErr = []
    total_recs = len(updated_ratings.index)
    saved = 0
    saved_ratings = []
    same_effective = 0

    rating_units = {"EXSA": "ft;cfs", "BASE": "ft;cfs", "CORR": "ft;ft"}
    # Seconds to wait on the USGS server before giving up on one rating file.
    usgs_timeout = 120

    for _, row in updated_ratings.iterrows():
        rating_id = row["rating-id"]
        station = row["USGS_St_Num"]
        rating_type = row["rating-type"]
        logging.info(f"Getting data for rating ID = {rating_id}")
        logging.info(
            f"Getting data from USGS for USGS ID = {station}, Rating Type = {rating_type}"
        )
        try:
            usgs_rating, meta = nwis.get_ratings(
                site=station, file_type=str(rating_type).lower()
            )
            url = meta.url
        except Exception as error:
            usgsapiErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Error collecting rating data from USGS for --> {rating_id},{station}, {rating_type} USGS error = {error}"
            )
            continue

        if usgs_rating.empty:
            # The message used to end with the literal text "url"; report the
            # address that returned the empty table instead.
            logging.warning(
                f"Empty rating obtained from USGS for USGS ID = {station}, Rating Type = {rating_type}, url = {url}"
            )
            usgsemptyErr.append([rating_id, station, rating_type])
            continue

        # The effective date is only present in the header comments of the
        # raw RDB file, so the file is downloaded again as plain text.
        try:
            response = requests.get(url, timeout=usgs_timeout)
            response.raise_for_status()
            temp = pd.DataFrame(response.text.split("\n"))
            usgs_effective_date = get_usgs_effective_date(temp, rating_type)
        except Exception as error:
            usgseffectiveErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Error collecting effective date from USGS rating --> {rating_id},{station}, {rating_type} CDA error = {error}"
            )
            continue

        cwms_effective_date = row["cwms_max_effective_date"]
        logging.info(
            f"Effective dates: cwms = {cwms_effective_date}, usgs = {usgs_effective_date}"
        )
        # A stored date exactly one hour later than the USGS date is also
        # treated as the same rating.
        if (cwms_effective_date == usgs_effective_date) or (
            cwms_effective_date == (usgs_effective_date + timedelta(hours=1))
        ):
            logging.info("Effective dates are the same rating curve will not be saved")
            same_effective = same_effective + 1
            continue

        try:
            usgs_store_rating = convert_usgs_rating_df(usgs_rating, rating_type)

            if row["auto-migrate-extension"] and pd.notna(cwms_effective_date):
                # Start from the rating currently stored in CWMS and swap in
                # the new points, effective date and active flag.
                current_rating = cwms.get_ratings(
                    rating_id=rating_id,
                    office_id=row["office-id"],
                    begin=cwms_effective_date,
                    end=cwms_effective_date,
                    method="EAGER",
                    single_rating_df=True,
                )
                rating_json = current_rating.json
                points_json = loads(usgs_store_rating.to_json(orient="records"))
                simple_rating = rating_json["simple-rating"]
                simple_rating["rating-points"] = {"point": points_json}
                simple_rating["effective-date"] = usgs_effective_date.isoformat()
                # A missing create-date is not an error; there is simply
                # nothing to strip.
                simple_rating.pop("create-date", None)
                simple_rating["active"] = row["auto-activate"]
            else:
                rating_json = cwms.rating_simple_df_to_json(
                    data=usgs_store_rating,
                    rating_id=rating_id,
                    office_id=row["office-id"],
                    units=rating_units[rating_type],
                    effective_date=usgs_effective_date,
                    active=row["auto-activate"],
                )
            cwms.update_ratings(data=rating_json, rating_id=rating_id)
            logging.info(
                f"SUCCESS Stored rating for rating id = {rating_id}, effective date = {usgs_effective_date}"
            )
            saved = saved + 1
            saved_ratings.append([rating_id, station, rating_type])
        except Exception as error:
            storErr.append([rating_id, station, rating_type, error])
            logging.error(
                f"FAIL Data could not be stored to CWMS database for --> {rating_id},{station}, {rating_type} CDA error = {error}"
            )

    logging.info(
        f"A total of {total_recs} ratings were updated by the USGS over the lookback period."
    )
    logging.info(
        f"Of those {total_recs} ratings {same_effective} were already stored in the CWMS database"
    )
    if len(saved_ratings) > 0:
        logging.info(
            f"A total of {saved} ratings were new and saved successfully to the database"
        )
        logging.info(
            f"Rating ids saved successfully to the database were: {saved_ratings}"
        )
    if len(usgsapiErr) > 0:
        logging.info(
            f"The following ratings errored out when accessing the USGS API: {usgsapiErr}"
        )
    if len(usgsemptyErr) > 0:
        logging.info(
            f"The following ratings had an empty rating curve returned from the usgs: {usgsemptyErr}"
        )
    if len(usgseffectiveErr) > 0:
        logging.info(
            f"The following ratings errored when trying to determine the effective date from the USGS: {usgseffectiveErr}"
        )
    if len(storErr) > 0:
        logging.info(
            f"The following ratings errored when trying to store to CDA: {storErr}"
        )
@@ -0,0 +1,345 @@
1
+ import logging
2
+ from datetime import datetime, timedelta
3
+
4
+ import cwms
5
+ import numpy as np
6
+ import pandas as pd
7
+ import requests
8
+
9
+
10
def getusgs_cda(api_root, office_id, days_back, api_key, backfill_tsids: list = None):
    """Fetch USGS instantaneous values and store them in CWMS through CDA.

    Opens a CDA session, loads the office's USGS time series configuration,
    requests the matching sites from the USGS for the window
    ``now - days_back`` to ``now + 2 hours`` and hands everything to
    ``CWMS_writeData``.

    Args:
        api_root: CDA root URL handed to ``cwms.api.init_session``.
        office_id: office whose time series are processed.
        days_back: number of days of data to request.
        api_key: raw API key; the ``"apikey "`` prefix is added here.
        backfill_tsids: optional list of CWMS time series ids; when given,
            only those time series are processed.
    """
    api_key = "apikey " + api_key
    cwms.api.init_session(api_root=api_root, api_key=api_key)
    logging.info(f"CDA connection: {api_root}")
    logging.info(
        f"Data will be grabbed and stored from USGS for past {days_back} days for office: {office_id}"
    )
    execution_date = datetime.now()

    USGS_ts = get_CMWS_TS_Loc_Data(office_id)

    if backfill_tsids:
        USGS_ts = USGS_ts[USGS_ts["timeseries-id"].isin(backfill_tsids)]

    if len(USGS_ts) == 0:
        if backfill_tsids:
            logging.error(
                f"The following backload timeseries ids were not present in the USGS timeseries or Locations groups: {backfill_tsids}"
            )
        else:
            # This branch means nothing was configured; the message used to
            # claim the opposite.
            logging.error(
                "No USGS data was present in the timeseries or locations groups"
            )
        return

    # grab all of the unique USGS station numbers to be sent to the USGS api
    sites = USGS_ts[USGS_ts["USGS_Method_TS"].isna()].USGS_St_Num.unique()
    method_sites = USGS_ts[USGS_ts["USGS_Method_TS"].notna()].USGS_St_Num.unique()
    logging.info(f"Execution date {execution_date}")

    # This is added to the 'startDT'
    tw_delta = -timedelta(days_back)

    # Set the execution date and time window for URL
    startDT = execution_date + tw_delta

    # Airflow only looks at the last period during an execution run,
    # so to ensure the latest data is retrieved, add 2 hours to end date
    endDT = execution_date + timedelta(hours=2)

    logging.info(f"Grabing data from USGS between {startDT} and {endDT}")

    USGS_data = pd.DataFrame()
    USGS_data_method = pd.DataFrame()

    if len(sites) > 0:
        USGS_data = getUSGS_ts(sites, startDT, endDT)
    # sites with a method id or usgs tsid are retrieved from a separate
    # database, which is reached by passing 3 as "access" in the USGS API call.
    if len(method_sites) > 0:
        USGS_data_method = getUSGS_ts(method_sites, startDT, endDT, 3)

    CWMS_writeData(USGS_ts, USGS_data, USGS_data_method, days_back)
62
+
63
+
64
def get_USGS_params():
    """Build the default USGS-to-CWMS parameter table.

    Returns:
        DataFrame indexed by ``CWMS_PARAMETER`` with the columns
        ``USGS_PARAMETER``, ``USGS_Alias``, ``CWMS_FACTOR``, ``CWMS_UNIT``
        and ``CWMS_TYPE``.
    """
    field_names = [
        "USGS_PARAMETER",
        "USGS_Alias",
        "CWMS_PARAMETER",
        "CWMS_FACTOR",
        "CWMS_UNIT",
        "CWMS_TYPE",
    ]
    # One tuple per mapping, in the order given by field_names.
    mappings = [
        ("00010", "Water Temp", "Temp-Water", 1, "C", "Inst"),
        ("00021", "Air Temp", "Temp-Air", 1, "F", "Inst"),
        ("00035", "Wind Speed", "Speed-Wind", 1, "mph", "Inst"),
        ("00036", "Wind Dir", "Dir-Wind", 1, "deg", "Inst"),
        ("00045", "Precip", "Precip-Inc", 1, "in", "Total"),
        ("00045", "Precip", "Precip", 1, "in", "Total"),
        ("00052", "RelHumidity", "%-Humidity", 1, "%", "Inst"),
        ("00060", "Flow", "Flow", 1, "cfs", "Inst"),
        # ['00061','Flow',1,'cfs','Inst'],
        ("00065", "Stage", "Stage", 1, "ft", "Inst"),
        ("00095", "Sp Cond", "Cond", 1, "umho/cm", "Inst"),
        ("00096", "Salinity", "Conc-Salinity", 0.001, "mg/l", "Inst"),
        # ['00062','Elevation','Elev',1,'ft','Inst'],
        ("72036", "Res Storage", "Stor", 1000, "ac-ft", "Inst"),
        ("62608", "Sol Rad", "Irrad-Solar", 1, "W/m2", "Inst"),
        # ['62614','Elev-Lake','Elev',1,'ft','Inst'],
        ("63160", "Elev-NAVD88", "Elev", 1, "ft", "Inst"),
    ]
    table = pd.DataFrame(mappings, columns=field_names)
    return table.set_index("CWMS_PARAMETER")
95
+
96
+
97
def get_CMWS_TS_Loc_Data(office):
    """
    Combine the USGS time series group with the USGS station aliases.

    Reads the "USGS TS Data Acquisition" time series group and the
    "USGS Station Number" location group through CDA, keeps the entries of
    ``office``, and joins them on location and office. A time series whose
    location has no station number falls back to the station number of its
    base location (the part of the location id before the first "-").

    Args:
        office: office id used to query and filter both groups.

    Returns:
        DataFrame with one row per time series. Besides the group columns it
        holds the split time series id parts (``location-id``, ``param``,
        ``type``, ``int``, ``dur``, ``ver``), ``base-loc``,
        ``USGS_Method_TS`` (the time series group alias), ``USGS_St_Num``
        and ``USGS_PARAMETER`` (5 characters, zero padded, or "Not Found").
        The frame is empty when the group has no entries for ``office``.
    """
    USGS_Params = get_USGS_params()

    def find_usgsparam(attribute, param):
        # A positive group attribute overrides the default mapping; any
        # decimal part is dropped (presumably it is stored as a float).
        if attribute > 0:
            return str(attribute).split(".")[0]
        if param in USGS_Params.index:
            return USGS_Params.at[param, "USGS_PARAMETER"]
        return "Not Found"

    df = cwms.get_timeseries_group(
        group_id="USGS TS Data Acquisition",
        category_id="Data Acquisition",
        office_id=office,
        category_office_id="CWMS",
        group_office_id="CWMS",
    ).df
    df[["location-id", "param", "type", "int", "dur", "ver"]] = df[
        "timeseries-id"
    ].str.split(".", expand=True)

    # Copy so the column assignments below act on an independent frame and
    # not on a slice of the CDA result.
    df = df[df["office-id"] == office].copy()
    # ``.str[0]`` also works when no row is left for this office, whereas
    # ``expand=True`` yields a frame without columns and ``[0]`` fails.
    df["base-loc"] = df["location-id"].str.split("-").str[0]
    if "alias-id" not in df.columns:
        df["alias-id"] = np.nan
    if "attribute" not in df.columns:
        df["attribute"] = np.nan
    df = df.rename(columns={"alias-id": "USGS_Method_TS"})

    # error in CDA with category_office_id and group_office_id. need to fix once CDA is updated
    Locdf = cwms.get_location_group(
        loc_group_id="USGS Station Number",
        category_id="Agency Aliases",
        office_id="CWMS",
    ).df.set_index("location-id")

    Locdf = Locdf[Locdf["office-id"] == office].copy()
    if "attribute" not in Locdf.columns:
        Locdf["attribute"] = np.nan
    # Grab all of the locations that have a USGS station number assigned to them
    USGS_alias = Locdf[Locdf["alias-id"].notnull()]
    # rename the columns
    USGS_alias = USGS_alias.rename(
        columns={"alias-id": "USGS_St_Num", "attribute": "Loc_attribute"}
    )
    # pad the USGS id with 0s if they are not 8 digits long
    USGS_alias.USGS_St_Num = USGS_alias.USGS_St_Num.str.rjust(8, "0")

    # left join: keep every time series of the group and attach the station
    # number where the location has one. Join on location id and office id.
    USGS_ts = pd.merge(df, USGS_alias, how="left", on=["location-id", "office-id"])
    # grab time series with missing USGS_St_Num and check to see if the base location has an assigned USGS station.
    if USGS_ts.USGS_St_Num.isnull().any():
        USGS_ts_base = pd.merge(
            USGS_ts[USGS_ts.USGS_St_Num.isnull()].drop(
                ["USGS_St_Num", "Loc_attribute"], axis=1
            ),
            USGS_alias,
            left_on=["base-loc", "office-id"],
            right_on=["location-id", "office-id"],
        )
        # merge with existing dataframe
        USGS_ts = pd.concat(
            [USGS_ts[USGS_ts["USGS_St_Num"].notnull()], USGS_ts_base], axis=0
        )

    # Fill the attribute with the USGS parameter code: the time series group
    # attribute when it is set, otherwise the default USGS parameter of the
    # corresponding CWMS parameter. Built row by row into a list because
    # ``DataFrame.apply(axis=1)`` returns a whole frame for an empty input.
    USGS_ts["attribute"] = [
        find_usgsparam(attribute, param)
        for attribute, param in zip(USGS_ts["attribute"], USGS_ts["param"])
    ]
    USGS_ts["attribute"] = USGS_ts["attribute"].astype("string").str.rjust(5, "0")
    # renames the attribute column to USGS_PARAMETER
    USGS_ts = USGS_ts.rename(columns={"attribute": "USGS_PARAMETER"})

    logging.info("CWMS TS Groups and Location Data Obtained")
    return USGS_ts
179
+
180
+
181
def getUSGS_ts(sites, startDT, endDT, access=None):
    """
    Request instantaneous values for the given sites from the USGS.

    Args:
        sites: iterable of USGS station numbers (strings).
        startDT: start of the requested window; sent via ``isoformat()``.
        endDT: end of the requested window; sent via ``isoformat()``.
        access: optional value for the service's ``access`` query parameter.

    Returns:
        DataFrame with one row per returned time series, indexed by
        ``"<station>.<parameter code>"`` taken from the second and third
        ":"-separated fields of the series name. An empty DataFrame is
        returned when the service reports no time series.

    Raises:
        requests.HTTPError: when the service answers with an error status.
        requests.Timeout: when the service does not respond in time.
    """

    # Get USGS data
    base_url = "https://waterservices.usgs.gov/nwis/iv/?"

    query_dict = {
        "format": "json",
        "sites": ",".join(sites),
        "startDT": startDT.isoformat(),
        "endDT": endDT.isoformat(),
        "access": access,
        # "parameterCd": ",".join(unique_param_codes),
        # 'period': 'P1D',
        # "modifiedSince": "PT6H",
        "siteStatus": "active",
    }

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(base_url, params=query_dict, timeout=300)
    response.raise_for_status()
    time_series = response.json()["value"]["timeSeries"]

    if not time_series:
        # An empty list builds a frame without a "name" column, so the index
        # cannot be derived; hand back an empty frame instead of failing.
        logging.warning("No time series were returned by the USGS for the request")
        return pd.DataFrame()

    # format the response from USGS API into dataframe
    USGS_data = pd.DataFrame(time_series)
    name_parts = USGS_data["name"].str.split(":")
    USGS_data["Id.param"] = name_parts.str[1] + "." + name_parts.str[2]
    USGS_data = USGS_data.set_index("Id.param")

    logging.info("Data obtained from USGS")
    return USGS_data
214
+
215
+
216
def CWMS_writeData(USGS_ts, USGS_data, USGS_data_method, days_back):
    """Store the USGS values of every configured time series in CWMS.

    For each row of ``USGS_ts`` the key ``"<USGS_St_Num>.<USGS_PARAMETER>"``
    is looked up in ``USGS_data`` (rows without ``USGS_Method_TS``) or in
    ``USGS_data_method``. The matching values are stripped of the no-data
    marker, given a quality code of 0 and stored with
    ``cwms.store_timeseries``. Failures are logged and collected per cause;
    the collected lists are written to the log at the end.

    Args:
        USGS_ts: one row per CWMS time series; the columns read here are
            ``timeseries-id``, ``office-id``, ``USGS_St_Num``,
            ``USGS_PARAMETER`` and ``USGS_Method_TS``.
        USGS_data: USGS response frame indexed by "<station>.<parameter>".
        USGS_data_method: same layout, used when the key is not taken from
            ``USGS_data``.
        days_back: requested look-back in days; from 365 on the store call
            is made with ``max_workers`` and ``chunk_size``.
    """
    # lists to hold time series that fail
    # noData -> usgs location and parameter were present in USGS api but the values were empty
    # NotinAPI -> usgs location and parameter were not retrieved from USGS api
    # storErr -> an error occurred when saving data to CWMS database
    # mult_ids -> several value sets were returned and no (valid) method id selects one
    noData = []
    NotinAPI = []
    storErr = []
    mult_ids = []
    total_recs = len(USGS_ts.index)
    saved = 0

    # loop through all rows in the USGS_ts dataframe
    for index, row in USGS_ts.iterrows():
        # grab the CWMS time series id and the USGS station number plus USGS parameter code
        ts_id = row["timeseries-id"]
        USGS_Id_param = f"{row.USGS_St_Num}.{row.USGS_PARAMETER}"
        # check if the USGS station number and parameter code are in the data obtained from the USGS api
        logging.info(
            f"Attempting to write values for ts_id --> {ts_id},{USGS_Id_param}"
        )
        values = pd.DataFrame()
        USGS_data_row = None
        # rows without a method id are served from USGS_data; everything else
        # is looked up in USGS_data_method
        if (USGS_Id_param in USGS_data.index) and pd.isna(row.USGS_Method_TS):
            USGS_data_row = USGS_data.loc[USGS_Id_param]
        elif USGS_Id_param in USGS_data_method.index:
            USGS_data_row = USGS_data_method.loc[USGS_Id_param]
        if USGS_data_row is not None:
            try:

                # grab the time series values obtained from USGS API.
                values_df = pd.DataFrame(USGS_data_row["values"])
                # more than one entry means several value sets share this
                # station/parameter; a method id is needed to pick one
                if values_df.shape[0] > 1:
                    if pd.isna(row.USGS_Method_TS):
                        logging.warning(
                            f"FAIL there are multiple time series for {USGS_Id_param} need to specify the USGS method TSID for {ts_id}"
                        )
                        mult_ids.append([ts_id, USGS_Id_param])
                    else:
                        # flatten the "method" entries so the value set can be
                        # selected by its methodID
                        temp = values_df.method.apply(pd.Series)
                        temp = values_df.join(pd.json_normalize(temp.pop(0)))
                        try:
                            values = pd.DataFrame(
                                temp.query(f"methodID == {row.USGS_Method_TS}")[
                                    "value"
                                ].item()
                            )
                        except Exception as error:
                            mult_ids.append([ts_id, USGS_Id_param])
                            logging.error(
                                f"The USGS method ID defined could not be found from the USGS API check that it is correct for --> {ts_id},{USGS_Id_param},{row.USGS_Method_TS}"
                            )
                else:
                    values = pd.DataFrame(values_df.loc[0, "value"])
                # if values array is empty then append info to noData list
                # NOTE(review): rows added to mult_ids above still have an
                # empty ``values`` here, so they are reported as noData too.
                if values.empty:
                    noData.append([ts_id, USGS_Id_param])
                    logging.warning(
                        f"FAIL No Data obtained from USGS for ts_id: Values array is empty in USGS API output--> {ts_id},{USGS_Id_param}"
                    )
                else:
                    # grab the no-data marker (ie -999999) and remove those rows from the dataset
                    nodata_val = USGS_data_row["variable"]["noDataValue"]
                    values = values[values.value != str(int(nodata_val))]
                    # check again if values dataframe is empty after removing nodata_vals
                    if values.empty:
                        noData.append([ts_id, USGS_Id_param])
                        logging.warning(
                            f"FAIL No Data obtained from USGS for ts_id: Values array is empty after removing -999999 values--> {ts_id},{USGS_Id_param}"
                        )
                    # if values are present grab information needed to save to CWMS database using CDA
                    else:
                        values = values.reindex(
                            columns=["dateTime", "value", "qualifiers"]
                        )

                        # adjust column names to fit cwms-python format.
                        values = values.rename(
                            columns={
                                "dateTime": "date-time",
                                "qualifiers": "quality-code",
                            }
                        )
                        units = USGS_data_row["variable"]["unit"]["unitCode"]
                        office = row["office-id"]
                        # the USGS qualifiers are discarded; every value is stored with quality code 0
                        values["quality-code"] = 0

                        # write values to CWMS database
                        try:
                            data = cwms.timeseries_df_to_json(
                                data=values, ts_id=ts_id, units=units, office_id=office
                            )
                            if days_back < 365:
                                cwms.store_timeseries(data)
                            else:
                                # long look-back periods are stored with explicit
                                # max_workers and chunk_size arguments
                                cwms.store_timeseries(
                                    data, max_workers=30, chunk_size=30 * 24 * 4
                                )
                            logging.info(
                                f"SUCCESS Data stored in CWMS database for --> {ts_id},{USGS_Id_param}"
                            )
                            saved = saved + 1
                        except Exception as error:
                            storErr.append([ts_id, USGS_Id_param, error])
                            logging.error(
                                f"FAIL Data could not be stored to CWMS database for --> {ts_id},{USGS_Id_param} CDA error = {error}"
                            )
            except Exception as error:
                # NOTE(review): this failure is only logged; it is not added to
                # any of the lists reported in the summary below.
                logging.error(
                    f"FAIL Unspecified Error when trying to save USGS data --> {ts_id},{USGS_Id_param} error = {error}"
                )
        else:
            NotinAPI.append([ts_id, USGS_Id_param])
            logging.warning(
                f"FAIL USGS ID and parameter were not present in USGS API for--> {ts_id},{USGS_Id_param}"
            )

    # summary of the run
    logging.info(
        f"A total of {saved} records were successfully saved out of {total_recs}"
    )
    logging.info(
        f"The following ts_ids errored due to no data received from USGS for the time period requested: {noData}"
    )
    logging.info(
        f"The following ts_ids errored because the USGS ID and parameter were not found in USGS API {NotinAPI}"
    )
    logging.info(f"The following ts_ids errored when storing into CDA {storErr}")
    logging.info(
        f"The following ts_ids errored because multiple method TSID were present for the USGS station. A USGS method TSID needs to be defined in the time series group in CWMS or an incorrect TSID is defined. {mult_ids}"
    )