cwms-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cwms_cli-0.1.1.dist-info/METADATA +40 -0
- cwms_cli-0.1.1.dist-info/RECORD +41 -0
- cwms_cli-0.1.1.dist-info/WHEEL +4 -0
- cwms_cli-0.1.1.dist-info/entry_points.txt +3 -0
- cwms_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
- cwmscli/__init__.py +12 -0
- cwmscli/__main__.py +15 -0
- cwmscli/callbacks/__init__.py +18 -0
- cwmscli/commands/blob.py +439 -0
- cwmscli/commands/commands_cwms.py +227 -0
- cwmscli/commands/csv2cwms/.gitignore +3 -0
- cwmscli/commands/csv2cwms/README.md +51 -0
- cwmscli/commands/csv2cwms/__init__.py +5 -0
- cwmscli/commands/csv2cwms/__main__.py +265 -0
- cwmscli/commands/csv2cwms/examples/complete_config.json +19 -0
- cwmscli/commands/csv2cwms/examples/hourly.json +243 -0
- cwmscli/commands/csv2cwms/examples/minutes.json +315 -0
- cwmscli/commands/csv2cwms/tests/__init__.py +0 -0
- cwmscli/commands/csv2cwms/tests/data/.gitignore +1 -0
- cwmscli/commands/csv2cwms/tests/data/expected_brok_output.json +278 -0
- cwmscli/commands/csv2cwms/tests/data/sample_brok.csv +9 -0
- cwmscli/commands/csv2cwms/tests/data/sample_config.json +45 -0
- cwmscli/commands/csv2cwms/tests/skip_test_integration_pipeline.py +35 -0
- cwmscli/commands/csv2cwms/tests/test_dateutils.py +68 -0
- cwmscli/commands/csv2cwms/tests/test_expressions.py +49 -0
- cwmscli/commands/csv2cwms/tests/test_fileio.py +43 -0
- cwmscli/commands/csv2cwms/utils/__init__.py +5 -0
- cwmscli/commands/csv2cwms/utils/dateutils.py +105 -0
- cwmscli/commands/csv2cwms/utils/expression.py +39 -0
- cwmscli/commands/csv2cwms/utils/fileio.py +26 -0
- cwmscli/commands/csv2cwms/utils/logging.py +80 -0
- cwmscli/commands/csv2cwms/utils/terminal.py +45 -0
- cwmscli/commands/shef_critfile_import.py +146 -0
- cwmscli/requirements.py +25 -0
- cwmscli/usgs/__init__.py +161 -0
- cwmscli/usgs/getUSGS_ratings_cda.py +346 -0
- cwmscli/usgs/getusgs_cda.py +345 -0
- cwmscli/usgs/getusgs_measurements_cda.py +961 -0
- cwmscli/usgs/rating_ini_file_import.py +130 -0
- cwmscli/utils/__init__.py +68 -0
- cwmscli/utils/deps.py +102 -0
|
@@ -0,0 +1,961 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import math
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from datetime import datetime, timedelta, timezone
|
|
5
|
+
|
|
6
|
+
import cwms
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import pytz
|
|
10
|
+
import requests
|
|
11
|
+
from dataretrieval import nwis
|
|
12
|
+
|
|
13
|
+
# --- Constants ---
# Placeholder written into the measurement payload when a gage height or flow
# value is missing. Its magnitude equals the largest finite IEEE-754
# single-precision float (about 3.4028235e38), negated.
CWMS_MISSING_VALUE = -340282346638528859811704183484516925440

# Maps the timezone abbreviations found in the USGS 'tz_cd' column to IANA zone
# names that pytz can resolve. The standard and daylight abbreviations of a
# region share one zone name. Codes missing from this table are logged by
# convert_to_utc and produce NaT.
TZ_MAPPING = {
    "AST": "America/Puerto_Rico",
    "EST": "America/New_York",
    "EDT": "America/New_York",
    "CST": "America/Chicago",
    "CDT": "America/Chicago",
    "MST": "America/Denver",
    "MDT": "America/Denver",
    "PST": "America/Los_Angeles",
    "PDT": "America/Los_Angeles",
    "AKST": "America/Anchorage",
    "AKDT": "America/Anchorage",
    "HST": "Pacific/Honolulu",
    "GST": "Pacific/Guam",
}

# Maps source column names to the names used when building the measurement
# payload. A target of None means rename_and_drop_columns drops that column.
COLUMN_MAPPING = {
    "agency_cd": "usgs_agency_cd",
    "site_no": "usgs_site_no",
    "measurement_nu": "number",
    "measurement_dt": "usgs_measurement_dt",
    "tz_cd": "usgs_tz_cd",
    "q_meas_used_fg": "used",
    "party_nm": "party",
    "site_visit_coll_agency_cd": "agency",
    "discharge_va": "flow",
    "gage_height_va": "gage-height",
    "gage_va_change": "delta-height",
    "gage_va_time": "delta-time",
    "measured_rating_diff": "quality",
    "control_type_cd": "control-condition",
    "discharge_cd": "flow-adjustment",
    "chan_nu": None,
    "chan_name": None,
    "meas_type": None,
    "streamflow_method": None,
    "velocity_method": None,
    "chan_discharge": "channel-flow",
    "chan_width": "top-width",
    "chan_velocity": "avg-velocity",
    "chan_area": "effective-flow-area",
    "chan_stability": None,
    "chan_material": None,
    "chan_evenness": None,
    "long_vel_desc": None,
    "horz_vel_desc": None,
    "vert_vel_desc": None,
    "chan_loc_cd": None,
    "chan_loc_dist": None,
    # Added by this module rather than by USGS: the CWMS location id and the
    # UTC timestamp computed by convert_to_utc.
    "location-id": "name",
    "utc_time": "instant",
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def getusgs_measurement_cda(
    api_root,
    office_id,
    api_key,
    days_back_modified=2,
    days_back_collected=365,
    backfill_list=None,
    backfill_group=None,
):
    """
    Entry point: load USGS discharge measurements into CWMS.

    Initializes the CWMS session, fetches the "USGS Station Number" alias
    group and the "USGS Measurements" data-acquisition group, inner-joins them
    on 'location-id', and hands the resulting site table to either
    backfill_mode or realtime_mode.

    Args:
        api_root: Root URL passed to cwms.api.init_session.
        office_id: Office used as group_office_id / category_office_id when
            fetching the location groups.
        api_key: Raw API key; it is sent prefixed with "apikey ".
        days_back_modified: Passed to realtime_mode.
        days_back_collected: Passed to realtime_mode.
        backfill_list: USGS site numbers to backfill. When truthy,
            backfill_mode runs instead of realtime_mode.
        backfill_group: When truthy, every USGS alias in the merged site
            table is backfilled (overrides backfill_list).

    Raises:
        SystemExit: With code 1 if the location groups cannot be fetched, or
            with code 0 if no location has a USGS alias.
    """
    cwms.api.init_session(api_root=api_root, api_key="apikey " + api_key)

    logging.info("Fetching CWMS location groups...")
    try:
        usgs_alias_group = cwms.get_location_group(
            loc_group_id="USGS Station Number",
            category_id="Agency Aliases",
            office_id="CWMS",
            group_office_id=office_id,
            category_office_id=office_id,
        )
        usgs_measurement_locs = cwms.get_location_group(
            loc_group_id="USGS Measurements",
            category_id="Data Acquisition",
            office_id="CWMS",
            group_office_id=office_id,
            category_office_id=office_id,
        )
    except requests.exceptions.RequestException as e:
        logging.critical(f"Failed to fetch CWMS location groups: {e}. Exiting.")
        # SystemExit is raised directly: the exit() helper is injected by the
        # site module and is not available in every interpreter setup.
        raise SystemExit(1) from e
    except Exception as e:
        logging.critical(
            f"An unexpected error occurred fetching CWMS location groups: {e}. Exiting."
        )
        raise SystemExit(1) from e

    # Keep only locations present in both groups. Columns that exist in both
    # frames receive the pandas default _x/_y suffixes, which later code
    # relies on ('office-id_x', 'attribute_x').
    measurement_site_df = pd.merge(
        usgs_measurement_locs.df,
        usgs_alias_group.df,
        on="location-id",
        how="inner",
    )
    # Drop any location that has no USGS id.
    measurement_site_df = measurement_site_df[measurement_site_df["alias-id"].notnull()]

    if measurement_site_df.empty:
        logging.warning(
            "No valid USGS measurement locations found in CWMS after de-duplication. Exiting."
        )
        raise SystemExit(0)

    # Backfilling the entire group: use every USGS id in the site table.
    if backfill_group:
        backfill_list = list(measurement_site_df["alias-id"].values)

    if backfill_list:
        backfill_mode(backfill_list, measurement_site_df)
    else:
        realtime_mode(days_back_collected, days_back_modified, measurement_site_df)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def convert_to_utc(df):
    """
    Return a copy of *df* with a 'utc_time' column derived from
    'measurement_dt' and 'tz_cd'.

    'measurement_dt' is parsed with pandas (unparseable values become NaT) and
    each row is interpreted in the zone that TZ_MAPPING gives for its 'tz_cd'
    code, then converted to UTC.

    Args:
        df: pandas DataFrame with columns 'measurement_dt' (datetime-like) and
            'tz_cd' (timezone abbreviation).

    Returns:
        A new DataFrame. If a required column is missing, or the datetime
        parsing raises, the copy is returned without a 'utc_time' column.
        Rows with a missing timestamp or an unknown timezone code get NaT;
        rows with a missing timezone code are treated as UTC.
    """
    result = df.copy()

    has_required = {"measurement_dt", "tz_cd"}.issubset(result.columns)
    if not has_required:
        logging.error(
            "Error: 'measurement_dt' or 'tz_cd' column not found in DataFrame for UTC conversion."
        )
        return result

    try:
        parsed = pd.to_datetime(
            result["measurement_dt"], errors="coerce", format="ISO8601"
        )
        result["measurement_dt"] = parsed
    except Exception as e:
        logging.error(f"Error converting 'measurement_dt' to datetime: {e}")
        return result

    def lacks_offset(stamp):
        # True when the timestamp carries no usable timezone information.
        info = stamp.tzinfo
        return info is None or info.utcoffset(stamp) is None

    def row_to_utc(record):
        stamp = record["measurement_dt"]
        code = record["tz_cd"]

        if pd.isna(stamp):
            return pd.NaT

        # No timezone code: treat a naive timestamp as already being UTC.
        if pd.isna(code):
            if lacks_offset(stamp):
                return pytz.timezone("UTC").localize(stamp).astimezone(pytz.utc)
            return stamp.astimezone(pytz.utc)

        try:
            iana_tz_name = TZ_MAPPING.get(code)
            if iana_tz_name is None:
                logging.warning(
                    f"Unknown timezone code: '{code}'. Check TZ_MAPPING. Returning NaT for this row."
                )
                return pd.NaT

            zone = pytz.timezone(iana_tz_name)
            localized = (
                zone.localize(stamp) if lacks_offset(stamp) else stamp.astimezone(zone)
            )
            return localized.astimezone(pytz.utc)
        except pytz.exceptions.UnknownTimeZoneError:
            logging.warning(
                f"Unknown IANA timezone: '{iana_tz_name}' derived from '{code}'. Returning NaT for this row."
            )
            return pd.NaT
        except Exception as e:
            logging.error(
                f"An unexpected error occurred during UTC conversion for '{code}': {e}. Returning NaT for this row."
            )
            return pd.NaT

    result["utc_time"] = result.apply(row_to_utc, axis=1)
    return result
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def rename_and_drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply COLUMN_MAPPING to a DataFrame.

    Columns whose mapping target is None are removed; columns with a target
    name are renamed to it. Mapping entries for columns that are not present
    are skipped, and columns absent from the mapping are left untouched.

    Args:
        df: The input pandas DataFrame (not modified).

    Returns:
        A new pandas DataFrame with the columns dropped and renamed.
    """
    unwanted = []
    new_names = {}
    for source_name, target_name in COLUMN_MAPPING.items():
        if source_name not in df.columns:
            continue
        if target_name is None:
            unwanted.append(source_name)
        else:
            new_names[source_name] = target_name

    trimmed = df.copy().drop(columns=unwanted, errors="ignore")
    return trimmed.rename(columns=new_names, errors="ignore")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def clean_data(df):
    """
    Normalize a renamed USGS measurement DataFrame.

    - Maps 'Yes'/'No' in 'used' to True/False using the nullable boolean
      dtype; any other value becomes <NA>.
    - Converts object columns to the pandas string dtype and replaces missing
      values with empty strings.
    - Calls fillna(pd.NA) on numeric columns.
    - Drops rows where both 'flow' and 'gage-height' are NaN.

    Args:
        df (pd.DataFrame): The input DataFrame to clean (not modified).

    Returns:
        tuple: (cleaned DataFrame, number of rows whose 'used' flag is False).
        The count is informational only: rows are never removed because of
        the flag, and it is taken before the flow/gage-height row drop.
    """
    df_cleaned = df.copy()
    not_used_count = 0

    if "used" in df_cleaned.columns:
        # Replace the column outright. Assigning through .loc[:, "used"] sets
        # values in place on pandas >= 2, leaving the column as object dtype;
        # the string conversion below would then turn the flags into the
        # strings "True"/"False".
        df_cleaned["used"] = (
            df_cleaned["used"].map({"Yes": True, "No": False}).astype(pd.BooleanDtype())
        )
        # Count after the mapping; before it the column holds 'Yes'/'No'
        # strings and a comparison against a boolean never matches.
        not_used_count = int(df_cleaned["used"].eq(False).sum())
        logging.info(f"Found {not_used_count} measurements flagged as not used")

    string_cols = df_cleaned.select_dtypes(include="object").columns
    numeric_cols = df_cleaned.select_dtypes(include=np.number).columns

    if not string_cols.empty:
        df_cleaned[string_cols] = df_cleaned[string_cols].astype("string").fillna("")
    if not numeric_cols.empty:
        df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(pd.NA)

    if "flow" in df_cleaned.columns and "gage-height" in df_cleaned.columns:
        # A measurement with neither a flow nor a gage height carries no data.
        both_missing = df_cleaned[["flow", "gage-height"]].isna().all(axis=1)
        df_cleaned = df_cleaned[~both_missing].copy()
    elif "flow" in df_cleaned.columns or "gage-height" in df_cleaned.columns:
        logging.warning(
            "Only one of 'flow' or 'gage-height' columns exists. Cannot perform combined NaN drop."
        )

    return df_cleaned, not_used_count
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def process_usgs_data(df: pd.DataFrame) -> "tuple[pd.DataFrame, int]":
    """
    Run the full USGS measurement processing pipeline:
    1. Converts 'measurement_dt' to UTC and adds the 'utc_time' column.
    2. Renames and drops columns according to the predefined mapping.
    3. Performs general data cleaning (boolean mapping, NaN handling, row dropping).

    Args:
        df (pd.DataFrame): The input DataFrame containing USGS data. It is
            not modified; convert_to_utc works on its own copy.

    Returns:
        tuple: (processed DataFrame, the count returned by clean_data as its
        second value).
    """
    df_processed = convert_to_utc(df)
    df_processed = rename_and_drop_columns(df_processed)
    df_processed, dropped = clean_data(df_processed)

    return df_processed, dropped
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def remove_nan_values(data):
    """
    Recursively strip empty entries from nested dicts and lists.

    Dictionary entries whose value is None, a float NaN, or an empty string
    are removed. List elements that are None are removed. Anything that is
    neither a dict nor a list is returned unchanged.
    """

    def is_blank(value):
        if value is None:
            return True
        if isinstance(value, float):
            return math.isnan(value)
        return isinstance(value, str) and value == ""

    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if is_blank(value):
                continue
            cleaned[key] = remove_nan_values(value)
        return cleaned

    if isinstance(data, list):
        kept = []
        for item in data:
            if item is not None:
                kept.append(remove_nan_values(item))
        return kept

    return data
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def check_single_row_for_duplicates(row_to_check, df_existing):
    """
    Check one incoming measurement against existing measurements.

    The row is rejected when its "number" already exists in df_existing, when
    its "instant" lies within five minutes of an existing instant, or when its
    "number" cannot be parsed as a number. Neither input is modified.

    Args:
        row_to_check: A pandas Series or a DataFrame with a single row
            representing the item to check.
        df_existing: The DataFrame to compare against. Its 'number' values
            are compared as strings and its 'instant' values as UTC
            timestamps (naive instants are interpreted as UTC).

    Returns:
        A tuple containing:
            - original_row_passed_in: The object passed as row_to_check.
            - is_rejected: True if the row was rejected, False otherwise.
            - df_differences: A DataFrame with columns
              ['Column Name', 'Incoming Value', 'Existing Value'] listing the
              values that differ between the incoming row and the closest
              existing row. It is populated only when the row was rejected
              because of a close instant and differences exist; otherwise it
              is empty.

    Raises:
        ValueError: If row_to_check is neither a Series nor a one-row
            DataFrame.
    """
    if isinstance(row_to_check, pd.Series):
        incoming = row_to_check.to_frame().T
    elif isinstance(row_to_check, pd.DataFrame) and len(row_to_check) == 1:
        incoming = row_to_check
    else:
        raise ValueError(
            "row_to_check must be a pandas Series or a DataFrame with a single row."
        )

    no_differences = pd.DataFrame(
        columns=["Column Name", "Incoming Value", "Existing Value"]
    )

    if df_existing.empty:
        return row_to_check, False, no_differences

    # USGS measurement numbers sometimes do not resolve to a number. Reject
    # such a row here instead of indexing into an emptied frame further down.
    numeric_number = pd.to_numeric(incoming["number"], errors="coerce")
    if numeric_number.isna().any():
        logging.info(
            f"Can't resolve measurement numbers {incoming['number'].values} to number. Won't store those measurements"
        )
        return row_to_check, True, no_differences

    current_number = str(numeric_number.iloc[0])
    if (df_existing["number"].astype(str) == current_number).any():
        return row_to_check, True, no_differences

    # utc=True lets naive and timezone-aware instants be compared without a
    # TypeError.
    current_instant = pd.to_datetime(incoming["instant"], utc=True).iloc[0]
    existing_instants = pd.to_datetime(df_existing["instant"], utc=True)

    # Work positionally so a non-unique index on df_existing cannot make the
    # row lookup below ambiguous.
    time_diffs = (existing_instants - current_instant).abs().reset_index(drop=True)
    within_window = time_diffs[time_diffs <= pd.Timedelta(minutes=5)]
    if within_window.empty:
        return row_to_check, False, no_differences

    closest_existing_row = df_existing.iloc[within_window.idxmin()]

    def is_real_number(value):
        # Booleans are excluded: numpy booleans do not support subtraction.
        return isinstance(
            value, (int, float, np.integer, np.floating)
        ) and not isinstance(value, bool)

    diff_records = []
    for col in incoming.columns:
        if col in ("number", "instant"):
            continue

        current_val = incoming[col].iloc[0]
        existing_val = closest_existing_row.get(col)
        current_missing = pd.isna(current_val)
        existing_missing = pd.isna(existing_val)

        if current_missing and existing_missing:
            continue
        if current_missing != existing_missing:
            differs = True
        elif is_real_number(current_val) and is_real_number(existing_val):
            # Compare the scalar values with a small tolerance; column dtypes
            # are object here and cannot be used to detect numbers.
            differs = abs(current_val - existing_val) > 1e-6
        else:
            differs = current_val != existing_val

        if differs:
            diff_records.append(
                {
                    "Column Name": col,
                    "Incoming Value": current_val,
                    "Existing Value": existing_val,
                }
            )

    df_differences = pd.DataFrame(diff_records) if diff_records else no_differences
    return row_to_check, True, df_differences
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def check_and_drop_duplicates(df_store, df_existing):
    """
    Remove rows from df_store that duplicate an existing measurement.

    A row is a duplicate when its "number" matches an existing number, or
    when its "instant" lies within 5 minutes of an existing instant. Rows
    whose "number" cannot be parsed as a number are discarded. Neither input
    DataFrame is modified.

    Args:
        df_store: The DataFrame of candidate measurements.
        df_existing: The DataFrame to compare against.

    Returns:
        A tuple containing:
            - The remaining candidate rows, with "number" as a string and
              "instant" converted to datetime.
            - df_rejected_number: rows rejected due to a duplicate "number".
            - df_rejected_instant: rows rejected because "instant" is within
              5 minutes of an existing instant.
        If df_existing is empty, df_store is returned unchanged together with
        two empty DataFrames.
    """
    if df_existing.empty:
        return df_store, pd.DataFrame(), pd.DataFrame()

    def normalize_number(value):
        # Render whole numbers without a trailing ".0" so that 12, 12.0 and
        # "12" all compare equal as strings.
        try:
            as_float = float(value)
        except (TypeError, ValueError):
            return str(value)
        if math.isfinite(as_float) and as_float.is_integer():
            return str(int(as_float))
        return str(value)

    # USGS measurement numbers sometimes do not resolve to a number; those
    # rows are not stored.
    numeric_numbers = pd.to_numeric(df_store["number"], errors="coerce")
    invalid = numeric_numbers.isna().to_numpy()
    if invalid.any():
        logging.info(
            f"Can't resolve measurement numbers {df_store['number'].values[invalid]} to number. Won't store those measurements"
        )

    candidates = df_store[~invalid].copy()
    candidates["number"] = [normalize_number(v) for v in numeric_numbers[~invalid]]
    candidates["instant"] = pd.to_datetime(candidates["instant"])

    # Duplicate measurement numbers.
    existing_numbers = {normalize_number(v) for v in df_existing["number"]}
    duplicate_number = candidates["number"].isin(existing_numbers).to_numpy()
    df_rejected_number = candidates[duplicate_number].copy()
    candidates = candidates[~duplicate_number]

    # Instants within 5 minutes of an existing measurement. The comparison is
    # done in UTC so naive and timezone-aware values can be mixed; boolean
    # arrays are used so rows are selected by position, not by index label.
    store_instants = pd.to_datetime(candidates["instant"], utc=True)
    existing_instants = pd.to_datetime(df_existing["instant"], utc=True)
    five_minutes = pd.Timedelta(minutes=5)
    too_close = np.array(
        [
            bool(((existing_instants - instant).abs() <= five_minutes).any())
            for instant in store_instants
        ],
        dtype=bool,
    )
    df_rejected_instant = candidates[too_close].copy()

    return candidates[~too_close], df_rejected_number, df_rejected_instant
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def create_json_from_row(row):
    """
    Build the measurement payload dictionary for one processed row.

    Args:
        row: A pandas Series with the renamed measurement columns plus
            'name' (CWMS location id) and 'office'.

    Returns:
        dict: The payload after remove_nan_values has stripped entries that
        are None, NaN or empty strings. Missing text fields and a missing
        instant are therefore omitted. Missing 'gage-height' / 'flow' are
        sent as CWMS_MISSING_VALUE.
    """

    def text_or_none(value):
        # str(NaN) would produce the literal "nan", which is not stripped.
        return None if pd.isna(value) else str(value)

    def float_or(value, fallback):
        return fallback if pd.isna(value) else float(value)

    def as_bool(value):
        if pd.isna(value):
            return False
        if isinstance(value, str):
            # bool() of any non-empty string is True, including "No"/"False".
            return value.strip().lower() in ("yes", "true")
        return bool(value)

    instant_value = None
    try:
        instant = pd.to_datetime(row["instant"])
        if pd.isna(instant):
            # NaT.isoformat() returns the string "NaT"; send nothing instead.
            logging.warning(
                f"Instant '{row.get('instant')}' is missing. Setting to None."
            )
        else:
            instant_value = instant.isoformat()
    except Exception as e:
        logging.warning(
            f"Could not convert instant '{row.get('instant')}' to ISO format: {e}. Setting to None."
        )

    agency = text_or_none(row["agency"])
    if agency is not None and "unsp" in agency.lower():
        agency = "USGS"

    control_condition = text_or_none(row["control-condition"])
    if control_condition is not None and "unsp" in control_condition.lower():
        control_condition = "Unspecified"

    json_data = {
        "height-unit": "ft",
        "flow-unit": "cfs",
        "used": as_bool(row["used"]),
        "agency": agency,
        "party": text_or_none(row["party"]),
        "wm-comments": f"imported from get_USGS_measurements.py {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%SZ')}",
        "instant": instant_value,
        "id": {"office-id": str(row["office"]), "name": str(row["name"])},
        "number": str(row["number"]),
        "streamflow-measurement": {
            "gage-height": float_or(row["gage-height"], CWMS_MISSING_VALUE),
            "flow": float_or(row["flow"], CWMS_MISSING_VALUE),
            "quality": text_or_none(row["quality"]),
        },
        "usgs-measurement": {
            "control-condition": control_condition,
            "flow-adjustment": text_or_none(row["flow-adjustment"]),
            # Deltas are omitted (None) when missing.
            "delta-height": float_or(row["delta-height"], None),
            "delta-time": float_or(row["delta-time"], None),
        },
    }

    # Apply the recursive NaN remover once at the end.
    return remove_nan_values(json_data)
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def realtime_mode(DAYS_BACK_COLLECTED, DAYS_BACK_MODIFIED, measurement_site_df):
    """
    Fetch recent USGS discharge measurements and store them in CWMS.

    Queries NWIS once for all sites, processes the result, and handles each
    measurement whose USGS site number appears in measurement_site_df:
    - if the site's 'attribute_x' equals 1, the measurement is stored with
      fail_if_exists=False;
    - otherwise it is stored only when check_single_row_for_duplicates does
      not reject it.
    A per-office summary is logged at the end.

    Args:
        DAYS_BACK_COLLECTED: Length in days of the NWIS query period.
        DAYS_BACK_MODIFIED: Value sent as the NWIS 'sv_md' parameter
            (presumably the "modified within N days" filter; confirm against
            the NWIS measurements service).
        measurement_site_df: Site table with 'alias-id', 'location-id',
            'office-id_x' and 'attribute_x' columns.

    Raises:
        SystemExit: With code 1 if the NWIS request fails, or code 0 if NWIS
            returns no measurements.
    """
    execution_date = datetime.now()
    startDT = execution_date - timedelta(DAYS_BACK_COLLECTED)

    logging.info(
        f"Fetching USGS discharge measurements from {startDT.isoformat()} (modified in last {DAYS_BACK_MODIFIED} days)..."
    )
    try:
        df_meas_usgs, meta = nwis.get_discharge_measurements(
            period=f"P{DAYS_BACK_COLLECTED}D",
            channel_rdb_info="1",
            sv_md_interval="DAY",
            sv_md=f"{DAYS_BACK_MODIFIED}",
            sv_md_minutes="2",
        )
        logging.info(f"Queried {meta}")
    except Exception as e:
        logging.critical(f"Failed to fetch USGS measurements: {e}. Exiting.")
        # SystemExit is raised directly: the exit() helper comes from the
        # site module and is not available in every interpreter setup.
        raise SystemExit(1) from e

    if df_meas_usgs.empty:
        logging.info("No new USGS measurements found to process.")
        raise SystemExit(0)

    # Remember the raw count: processing may drop rows.
    total_fetched = len(df_meas_usgs)
    logging.info(f"Processing {total_fetched} USGS measurements...")
    df_meas_usgs, _ = process_usgs_data(df_meas_usgs)
    total_usgs_measurements_processed = 0
    total_usgs_measurements_skipped_no_cwms_mapping = 0

    # Stores stats like: {'MVP': {'attempted': X, 'successful': Y, 'rejected': Z}}
    office_store_stats = defaultdict(lambda: defaultdict(int))
    for _, usgs_row in df_meas_usgs.iterrows():
        total_usgs_measurements_processed += 1
        site_no = usgs_row.usgs_site_no

        site_filter_df = measurement_site_df[measurement_site_df["alias-id"] == site_no]
        # Skip if the site isn't in the measurement group.
        if len(site_filter_df) == 0:
            total_usgs_measurements_skipped_no_cwms_mapping += 1
            continue

        cwms_loc = site_filter_df["location-id"].values[0]
        office_id = site_filter_df["office-id_x"].values[0]
        # 1 means overwrite; any other value means don't overwrite.
        overwrite_flag = site_filter_df["attribute_x"].values[0]

        # Work on a copy of the row so the CWMS identifiers can be attached.
        usgs_row_for_json = usgs_row.copy()
        usgs_row_for_json["name"] = cwms_loc
        usgs_row_for_json["office"] = office_id

        data = create_json_from_row(usgs_row_for_json)
        office_store_stats[office_id]["attempted"] += 1

        log_prefix = f"USGS site {site_no} -> CWMS loc {cwms_loc} ({office_id}) measurement collected at {usgs_row.instant}"

        # Existing measurements at the site; empty if they cannot be fetched.
        df_existing = pd.DataFrame()
        try:
            existing_measurements = cwms.get_measurements(
                location_id_mask=cwms_loc, office_id=office_id
            )
            if existing_measurements and existing_measurements.df is not None:
                df_existing = existing_measurements.df
        except Exception as e:
            logging.error(
                f"An unexpected error occurred while getting existing measurements for {cwms_loc} ({office_id}): {e}. Assuming no existing measurements."
            )

        try:
            _, is_rejected, df_differences = check_single_row_for_duplicates(
                usgs_row_for_json, df_existing
            )
        except Exception as e:
            # One malformed measurement must not abort the whole run.
            logging.error(f"Duplicate check failed for {log_prefix}: {e}. Not storing.")
            office_store_stats[office_id]["rejected"] += 1
            continue

        if overwrite_flag == 1:
            try:
                logging.info(f"{log_prefix} (overwrite enabled). Storing.")
                cwms.store_measurements(data=[data], fail_if_exists=False)
                office_store_stats[office_id]["successful"] += 1
                if not df_differences.empty:
                    logging.info(
                        f"Differences found between stored data and new data for {log_prefix}:\n{df_differences.to_string()}"
                    )
            except requests.exceptions.RequestException as e:
                # With overwrite enabled a failure is an error, not a
                # 'rejection' due to existing data.
                logging.error(f"CWMS API network error storing {log_prefix}: {e}")
            except Exception as e:
                logging.error(f"Unexpected error storing {log_prefix}: {e}")
        elif not is_rejected:
            try:
                logging.info(f"{log_prefix}. Storing.")
                # fail_if_exists is left at the library default here.
                cwms.store_measurements(data=[data])
                office_store_stats[office_id]["successful"] += 1
                if not df_differences.empty:
                    logging.info(
                        f"Differences found between stored data and new data for {log_prefix}:\n{df_differences.to_string()}"
                    )
            except requests.exceptions.RequestException as e:
                logging.warning(
                    f"CWMS API network error (likely duplicate or conflict) storing {log_prefix}: {e}"
                )
                office_store_stats[office_id]["rejected"] += 1
            except Exception as e:
                logging.error(f"Unexpected error storing {log_prefix}: {e}")
        else:
            logging.warning(
                f"{log_prefix} has same number field ({usgs_row.number}) or similar collection time as existing measurement. Not storing."
            )
            office_store_stats[office_id]["rejected"] += 1

    logging.info("-" * 50)
    logging.info("Processing Summary:")
    logging.info(f"Total USGS measurements fetched: {total_fetched}")
    logging.info(
        f"Total unique USGS measurements processed for CWMS: {total_usgs_measurements_processed}"
    )
    logging.info(
        f"Total USGS measurements skipped (no CWMS mapping): {total_usgs_measurements_skipped_no_cwms_mapping}"
    )

    logging.info("\nCWMS Store Statistics Per Office:")
    # Global totals are derived from the per-office stats for consistency.
    global_attempted = sum(stats["attempted"] for stats in office_store_stats.values())
    global_successful = sum(
        stats["successful"] for stats in office_store_stats.values()
    )
    global_rejected = sum(stats["rejected"] for stats in office_store_stats.values())

    for office, stats in sorted(office_store_stats.items()):
        logging.info(f"  Office: {office}")
        logging.info(f"    Attempted: {stats['attempted']}")
        logging.info(f"    Successful: {stats['successful']}")
        logging.info(f"    Rejected (Duplicate/Conflict): {stats['rejected']}")

    logging.info("\nOverall CWMS Store Statistics:")
    logging.info(
        f"Total CWMS store attempts (across all configurations): {global_attempted}"
    )
    logging.info(f"Total CWMS stores successful: {global_successful}")
    logging.info(f"Total CWMS stores rejected (duplicate/conflict): {global_rejected}")
    logging.info("-" * 50)
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
def _new_backfill_failure(site_label, data, error):
    """Build the record kept for one measurement that could not be stored.

    Args:
        site_label: Display label of the site, "<usgs_site> (<cwms_loc>)".
        data: The measurement payload (dict) that failed to store.
        error: Human-readable description of the failure.

    Returns:
        A dict with the keys "site", "measurement_number", "instant" and
        "error".
    """
    return {
        "site": site_label,
        "measurement_number": data.get("number", "Unknown"),
        "instant": data.get("instant", "Unknown"),
        "error": error,
    }


def _store_backfill_measurements(json_list, site_label, cwms_loc, log_prefix):
    """Store one site's measurement payloads and report the outcome.

    A single bulk store is attempted first. A network error
    (``requests.exceptions.RequestException``) fails the whole batch. Any
    other exception triggers a retry that stores the measurements one at a
    time so that a single bad payload does not block the rest.

    Args:
        json_list: List of measurement payloads (dicts) to store.
        site_label: Display label of the site used in failure records.
        cwms_loc: CWMS location id, used in per-measurement error logs.
        log_prefix: Prefix identifying the site in log messages.

    Returns:
        A tuple ``(saved_count, failures)`` where ``failures`` is a list of
        failure records as produced by ``_new_backfill_failure``.
    """
    try:
        logging.info(f"{log_prefix} Storing.")
        cwms.store_measurements(data=json_list, fail_if_exists=False)
        logging.info("-" * 50)
        return len(json_list), []
    except requests.exceptions.RequestException as e:
        logging.error(f"CWMS API network error storing {log_prefix}: {e}")
        # The whole batch is treated as failed; no per-measurement retry.
        network_error = f"Network error: {e}"
        return 0, [
            _new_backfill_failure(site_label, data, network_error)
            for data in json_list
        ]
    except Exception as e:
        logging.error(f"Unexpected error storing {log_prefix}: {e}")
        logging.info("Storing one measurement at a time")

    saved = 0
    failures = []
    for data in json_list:
        try:
            cwms.store_measurements(data=[data], fail_if_exists=False)
            saved += 1
        except Exception as individual_error:
            failure = _new_backfill_failure(
                site_label, data, str(individual_error)
            )
            logging.error(
                f"Could not store measurement {failure['measurement_number']} collected at {failure['instant']} at {cwms_loc}"
            )
            failures.append(failure)
    return saved, failures


def _log_backfill_summary(site_summary, overall_failed_stores):
    """Log the per-site totals and the failed-store digest for a backfill run.

    Args:
        site_summary: Mapping of site label to its stats dict (keys
            "measurements_fetched", "measurements_saved",
            "measurements_failed").
        overall_failed_stores: List of failure records from every site.
    """
    logging.info("=" * 60)
    logging.info("OVERALL PROCESSING SUMMARY")
    logging.info("=" * 60)

    # Summary by site
    logging.info("MEASUREMENTS SAVED BY SITE:")
    logging.info("-" * 40)
    total_saved_all_sites = 0
    total_failed_all_sites = 0

    for site_name, stats in site_summary.items():
        logging.info(f"{site_name}:")
        logging.info(f" - Fetched: {stats['measurements_fetched']}")
        logging.info(f" - Saved: {stats['measurements_saved']}")
        logging.info(f" - Failed: {stats['measurements_failed']}")
        total_saved_all_sites += stats["measurements_saved"]
        total_failed_all_sites += stats["measurements_failed"]
        logging.info("")

    logging.info(f"TOTAL MEASUREMENTS SAVED ACROSS ALL SITES: {total_saved_all_sites}")
    logging.info(
        f"TOTAL MEASUREMENTS FAILED ACROSS ALL SITES: {total_failed_all_sites}"
    )

    # Summary of failed measurements
    if overall_failed_stores:
        logging.info("")
        logging.info("FAILED MEASUREMENT STORES SUMMARY:")
        logging.info("-" * 40)
        logging.info(f"Total failed measurements: {len(overall_failed_stores)}")

        # Group failures by site
        failures_by_site = defaultdict(list)
        for failure in overall_failed_stores:
            failures_by_site[failure["site"]].append(failure)

        for site, failures in failures_by_site.items():
            logging.info(f"\n{site} - {len(failures)} failed measurements:")
            for failure in failures[:5]:  # Show first 5 failures per site
                logging.info(
                    f" - Measurement {failure['measurement_number']} at {failure['instant']}"
                )
            if len(failures) > 5:
                logging.info(f" - ... and {len(failures) - 5} more failures")
    else:
        logging.info("")
        logging.info("No failed measurement stores!")

    logging.info("=" * 60)


def backfill_mode(BACKFILL_LIST, measurement_site_df):
    """Backfill CWMS with the USGS discharge measurements of each listed site.

    For every site the USGS measurements are fetched, processed with
    ``process_usgs_data``, optionally filtered against the measurements that
    already exist in CWMS, converted to payloads and stored. A per-site
    summary is logged after each site and an overall summary at the end.

    Args:
        BACKFILL_LIST: Iterable of USGS site identifiers. Each one is matched
            against the "alias-id" column of ``measurement_site_df``.
        measurement_site_df: DataFrame holding at least the columns
            "alias-id", "location-id", "office-id_x" and "attribute_x". The
            first matching row supplies the CWMS location, the office and
            the overwrite flag; a flag of 1 stores every measurement without
            duplicate filtering.

    Returns:
        None. Results are reported through ``logging`` only.
    """
    # Initialize summary tracking structures
    site_summary = {}  # Will store stats for each site
    overall_failed_stores = []  # Will store all failed measurement details

    for usgs_site in BACKFILL_LIST:
        # Initialize site-specific counters
        site_stats = {
            "measurements_fetched": 0,
            "measurements_saved": 0,
            "measurements_failed": 0,
            "failed_details": [],
        }

        # Filter the mapping table once instead of once per column lookup.
        site_rows = measurement_site_df[measurement_site_df["alias-id"] == usgs_site]
        if site_rows.empty:
            # Without this guard ``.values[0]`` raises IndexError and aborts
            # the whole backfill because of a single unmapped site.
            logging.error(
                f"USGS site {usgs_site} has no CWMS location mapping. Skipping site."
            )
            site_summary[f"{usgs_site} (no CWMS mapping)"] = site_stats
            continue

        cwms_loc = site_rows["location-id"].values[0]
        OFFICE = site_rows["office-id_x"].values[0]
        overwrite_code = int(site_rows["attribute_x"].values[0])
        site_label = f"{usgs_site} ({cwms_loc})"

        logging.info(
            f"Fetching USGS POR discharge measurements for {usgs_site} ({cwms_loc})..."
        )
        try:
            df_meas_usgs, meta = nwis.get_discharge_measurements(
                sites=[usgs_site],
                channel_rdb_info="1",
            )
            logging.info(f"Queried {meta}")
            site_stats["measurements_fetched"] = len(df_meas_usgs)
        except Exception as e:
            # The run is not aborted: an empty frame routes this site to the
            # "nothing to process" branch below and the loop moves on.
            logging.critical(
                f"Failed to fetch USGS measurements: {e}. Skipping site."
            )
            df_meas_usgs = pd.DataFrame()

        if df_meas_usgs.empty:
            logging.info("No new USGS measurements found to process.")
            site_summary[site_label] = site_stats
            continue  # Continue to next site instead of exiting

        logging.info(f"Processing {len(df_meas_usgs)} USGS measurements...")
        df_meas_usgs, _dropped = process_usgs_data(df_meas_usgs)

        df_meas_usgs["location-id"] = df_meas_usgs["name"] = cwms_loc
        df_meas_usgs["office"] = OFFICE

        log_prefix = (
            f"USGS site {usgs_site} -> CWMS loc {cwms_loc} ({OFFICE}) POR measurements"
        )

        # get existing measurements at site
        df_existing = pd.DataFrame()  # Initialize as empty
        try:
            existing_measurements = cwms.get_measurements(
                location_id_mask=cwms_loc, office_id=OFFICE
            )
            if existing_measurements and existing_measurements.df is not None:
                df_existing = existing_measurements.df
        except Exception as e:
            logging.error(
                f"An unexpected error occurred while getting existing measurements for {cwms_loc} ({OFFICE}): {e}. Assuming no existing measurements."
            )

        if overwrite_code != 1:
            logging.info(
                "Overwrite flag is off. Filtering out any conflicting measurements"
            )
            df_store, df_rejected_number, df_rejected_instant = (
                check_and_drop_duplicates(df_meas_usgs, df_existing)
            )

            if not df_rejected_number.empty:
                logging.info(
                    f"The following measurements were rejected because of duplicate measurement numbers {df_rejected_number}"
                )
            if not df_rejected_instant.empty:
                logging.info(
                    f"The following measurements were rejected because of similar collection times {df_rejected_instant}"
                )
        else:
            df_store = df_meas_usgs.copy()

        json_list = [
            create_json_from_row(usgs_row) for _, usgs_row in df_store.iterrows()
        ]

        if json_list:
            # store the measurements
            saved, failures = _store_backfill_measurements(
                json_list, site_label, cwms_loc, log_prefix
            )
            site_stats["measurements_saved"] = saved
            site_stats["measurements_failed"] = len(failures)
            site_stats["failed_details"].extend(failures)
            overall_failed_stores.extend(failures)
        else:
            # Every fetched measurement was filtered out; skip the API call.
            logging.info(f"{log_prefix} Nothing left to store after filtering.")

        # Store site summary
        site_summary[site_label] = site_stats

        logging.info("Processing Summary for this site:")
        logging.info(
            f"Total USGS measurements fetched: {site_stats['measurements_fetched']}"
        )
        logging.info(f"Total measurements saved: {site_stats['measurements_saved']}")
        logging.info(f"Total measurements failed: {site_stats['measurements_failed']}")

    # Print overall processing summary
    _log_backfill_summary(site_summary, overall_failed_stores)
|