cwms-cli 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. cwms_cli-0.1.1.dist-info/METADATA +40 -0
  2. cwms_cli-0.1.1.dist-info/RECORD +41 -0
  3. cwms_cli-0.1.1.dist-info/WHEEL +4 -0
  4. cwms_cli-0.1.1.dist-info/entry_points.txt +3 -0
  5. cwms_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
  6. cwmscli/__init__.py +12 -0
  7. cwmscli/__main__.py +15 -0
  8. cwmscli/callbacks/__init__.py +18 -0
  9. cwmscli/commands/blob.py +439 -0
  10. cwmscli/commands/commands_cwms.py +227 -0
  11. cwmscli/commands/csv2cwms/.gitignore +3 -0
  12. cwmscli/commands/csv2cwms/README.md +51 -0
  13. cwmscli/commands/csv2cwms/__init__.py +5 -0
  14. cwmscli/commands/csv2cwms/__main__.py +265 -0
  15. cwmscli/commands/csv2cwms/examples/complete_config.json +19 -0
  16. cwmscli/commands/csv2cwms/examples/hourly.json +243 -0
  17. cwmscli/commands/csv2cwms/examples/minutes.json +315 -0
  18. cwmscli/commands/csv2cwms/tests/__init__.py +0 -0
  19. cwmscli/commands/csv2cwms/tests/data/.gitignore +1 -0
  20. cwmscli/commands/csv2cwms/tests/data/expected_brok_output.json +278 -0
  21. cwmscli/commands/csv2cwms/tests/data/sample_brok.csv +9 -0
  22. cwmscli/commands/csv2cwms/tests/data/sample_config.json +45 -0
  23. cwmscli/commands/csv2cwms/tests/skip_test_integration_pipeline.py +35 -0
  24. cwmscli/commands/csv2cwms/tests/test_dateutils.py +68 -0
  25. cwmscli/commands/csv2cwms/tests/test_expressions.py +49 -0
  26. cwmscli/commands/csv2cwms/tests/test_fileio.py +43 -0
  27. cwmscli/commands/csv2cwms/utils/__init__.py +5 -0
  28. cwmscli/commands/csv2cwms/utils/dateutils.py +105 -0
  29. cwmscli/commands/csv2cwms/utils/expression.py +39 -0
  30. cwmscli/commands/csv2cwms/utils/fileio.py +26 -0
  31. cwmscli/commands/csv2cwms/utils/logging.py +80 -0
  32. cwmscli/commands/csv2cwms/utils/terminal.py +45 -0
  33. cwmscli/commands/shef_critfile_import.py +146 -0
  34. cwmscli/requirements.py +25 -0
  35. cwmscli/usgs/__init__.py +161 -0
  36. cwmscli/usgs/getUSGS_ratings_cda.py +346 -0
  37. cwmscli/usgs/getusgs_cda.py +345 -0
  38. cwmscli/usgs/getusgs_measurements_cda.py +961 -0
  39. cwmscli/usgs/rating_ini_file_import.py +130 -0
  40. cwmscli/utils/__init__.py +68 -0
  41. cwmscli/utils/deps.py +102 -0
@@ -0,0 +1,961 @@
1
+ import logging
2
+ import math
3
+ from collections import defaultdict
4
+ from datetime import datetime, timedelta, timezone
5
+
6
+ import cwms
7
+ import numpy as np
8
+ import pandas as pd
9
+ import pytz
10
+ import requests
11
+ from dataretrieval import nwis
12
+
13
# --- Constants ---
# Placeholder written into the CWMS payload when a gage height or flow value is
# missing (see create_json_from_row). Numerically it equals the most negative
# finite IEEE-754 single-precision value, expressed as an integer.
CWMS_MISSING_VALUE = -340282346638528859811704183484516925440

# Maps the time-zone abbreviations found in the USGS ``tz_cd`` column to IANA
# zone names understood by pytz. Standard and daylight abbreviations of the
# same region point at the same zone; codes missing from this table are
# reported and converted to NaT by convert_to_utc.
TZ_MAPPING = {
    "AST": "America/Puerto_Rico",
    "EST": "America/New_York",
    "EDT": "America/New_York",
    "CST": "America/Chicago",
    "CDT": "America/Chicago",
    "MST": "America/Denver",
    "MDT": "America/Denver",
    "PST": "America/Los_Angeles",
    "PDT": "America/Los_Angeles",
    "AKST": "America/Anchorage",
    "AKDT": "America/Anchorage",
    "HST": "Pacific/Honolulu",
    "GST": "Pacific/Guam",
}

# Maps source column names to the names used when building the CWMS payload.
# A target of None means the column is dropped by rename_and_drop_columns.
COLUMN_MAPPING = {
    "agency_cd": "usgs_agency_cd",
    "site_no": "usgs_site_no",
    "measurement_nu": "number",
    "measurement_dt": "usgs_measurement_dt",
    "tz_cd": "usgs_tz_cd",
    "q_meas_used_fg": "used",
    "party_nm": "party",
    "site_visit_coll_agency_cd": "agency",
    "discharge_va": "flow",
    "gage_height_va": "gage-height",
    "gage_va_change": "delta-height",
    "gage_va_time": "delta-time",
    "measured_rating_diff": "quality",
    "control_type_cd": "control-condition",
    "discharge_cd": "flow-adjustment",
    "chan_nu": None,
    "chan_name": None,
    "meas_type": None,
    "streamflow_method": None,
    "velocity_method": None,
    "chan_discharge": "channel-flow",
    "chan_width": "top-width",
    "chan_velocity": "avg-velocity",
    "chan_area": "effective-flow-area",
    "chan_stability": None,
    "chan_material": None,
    "chan_evenness": None,
    "long_vel_desc": None,
    "horz_vel_desc": None,
    "vert_vel_desc": None,
    "chan_loc_cd": None,
    "chan_loc_dist": None,
    # The two entries below rename columns produced by this module itself:
    # ``utc_time`` is added by convert_to_utc.
    "location-id": "name",
    "utc_time": "instant",
}
68
+
69
+
70
def getusgs_measurement_cda(
    api_root,
    office_id,
    api_key,
    days_back_modified=2,
    days_back_collected=365,
    backfill_list=None,
    backfill_group=None,
):
    """
    Entry point: load USGS discharge measurements into CWMS.

    Looks up the CWMS locations that belong to both the "USGS Measurements"
    and "USGS Station Number" location groups, then runs either backfill
    mode (for an explicit list of sites, or for the whole group) or
    realtime mode (recently collected/modified measurements).

    Args:
        api_root: Root URL passed to ``cwms.api.init_session``.
        office_id: Office used as group and category office for the
            location-group lookups.
        api_key: Raw API key; it is prefixed with ``"apikey "`` before use.
        days_back_modified: Passed to ``realtime_mode``.
        days_back_collected: Passed to ``realtime_mode``.
        backfill_list: Optional list of USGS site ids to backfill.
        backfill_group: When truthy, every USGS id in the measurement group
            is backfilled and ``backfill_list`` is replaced by that list.

    Raises:
        SystemExit: With code 1 when the location groups cannot be fetched,
            or code 0 when no usable measurement location exists.
    """
    # The return value is not needed here; the later cwms.* calls take no
    # session argument, so they presumably use what this call configures.
    cwms.api.init_session(api_root=api_root, api_key="apikey " + api_key)

    logging.info("Fetching CWMS location groups...")
    try:
        usgs_alias_group = cwms.get_location_group(
            loc_group_id="USGS Station Number",
            category_id="Agency Aliases",
            office_id="CWMS",
            group_office_id=office_id,
            category_office_id=office_id,
        )
        usgs_measurement_locs = cwms.get_location_group(
            loc_group_id="USGS Measurements",
            category_id="Data Acquisition",
            office_id="CWMS",
            group_office_id=office_id,
            category_office_id=office_id,
        )
    except requests.exceptions.RequestException as e:
        logging.critical(f"Failed to fetch CWMS location groups: {e}. Exiting.")
        # SystemExit is raised directly: the ``exit`` builtin is injected by
        # the ``site`` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)
    except Exception as e:
        logging.critical(
            f"An unexpected error occurred fetching CWMS location groups: {e}. Exiting."
        )
        raise SystemExit(1)

    # Keep only locations present in both groups. Columns that exist in both
    # frames receive the pandas default ``_x`` / ``_y`` suffixes.
    measurement_site_df = pd.merge(
        usgs_measurement_locs.df,
        usgs_alias_group.df,
        on="location-id",
        how="inner",
    )
    # Drop any location that does not have a USGS id.
    measurement_site_df = measurement_site_df[measurement_site_df["alias-id"].notnull()]

    if measurement_site_df.empty:
        logging.warning(
            "No valid USGS measurement locations found in CWMS after de-duplication. Exiting."
        )
        raise SystemExit(0)

    # Backfilling the entire group: collect every USGS id in the group.
    if backfill_group:
        backfill_list = list(measurement_site_df["alias-id"].values)

    if backfill_list:
        backfill_mode(backfill_list, measurement_site_df)
    else:
        realtime_mode(days_back_collected, days_back_modified, measurement_site_df)
133
+
134
+
135
def convert_to_utc(df):
    """
    Add a ``utc_time`` column derived from ``measurement_dt`` and ``tz_cd``.

    Args:
        df: pandas DataFrame with columns 'measurement_dt' (datetime-like) and
            'tz_cd' (time-zone abbreviation, looked up in TZ_MAPPING).

    Returns:
        A copy of ``df``. On success, 'measurement_dt' is parsed to datetimes
        and a 'utc_time' column is added. Rows whose timestamp cannot be
        parsed, or whose time-zone code is unknown, get NaT. Rows without a
        time-zone code are treated as UTC when naive. If a required column is
        missing or parsing fails, the copy is returned without 'utc_time'.
    """
    df_copy = df.copy()

    if "measurement_dt" not in df_copy.columns or "tz_cd" not in df_copy.columns:
        logging.error(
            "Error: 'measurement_dt' or 'tz_cd' column not found in DataFrame for UTC conversion."
        )
        return df_copy

    try:
        df_copy["measurement_dt"] = pd.to_datetime(
            df_copy["measurement_dt"], errors="coerce", format="ISO8601"
        )
    except Exception as e:
        logging.error(f"Error converting 'measurement_dt' to datetime: {e}")
        return df_copy

    if df_copy.empty:
        # Row-wise apply on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column; add an empty UTC column explicitly.
        df_copy["utc_time"] = pd.Series(
            index=df_copy.index, dtype="datetime64[ns, UTC]"
        )
        return df_copy

    def is_naive(dt):
        """Return True when ``dt`` carries no usable UTC offset."""
        return dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None

    def to_utc_single_row(row):
        dt = row["measurement_dt"]
        tz_str = row["tz_cd"]

        if pd.isna(dt):
            return pd.NaT

        if pd.isna(tz_str):
            # No zone code: a naive timestamp is taken to already be UTC.
            if is_naive(dt):
                return pytz.utc.localize(dt)
            return dt.astimezone(pytz.utc)

        try:
            iana_tz_name = TZ_MAPPING.get(tz_str)
            if iana_tz_name is None:
                logging.warning(
                    f"Unknown timezone code: '{tz_str}'. Check TZ_MAPPING. Returning NaT for this row."
                )
                return pd.NaT

            # The abbreviation only selects the zone; the offset applied is
            # whatever pytz assigns to that wall-clock time in the zone.
            tz = pytz.timezone(iana_tz_name)
            dt_aware = tz.localize(dt) if is_naive(dt) else dt.astimezone(tz)
            return dt_aware.astimezone(pytz.utc)
        except pytz.exceptions.UnknownTimeZoneError:
            logging.warning(
                f"Unknown IANA timezone: '{iana_tz_name}' derived from '{tz_str}'. Returning NaT for this row."
            )
            return pd.NaT
        except Exception as e:
            logging.error(
                f"An unexpected error occurred during UTC conversion for '{tz_str}': {e}. Returning NaT for this row."
            )
            return pd.NaT

    df_copy["utc_time"] = df_copy.apply(to_utc_single_row, axis=1)
    return df_copy
203
+
204
+
205
def rename_and_drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply COLUMN_MAPPING to a DataFrame.

    Columns mapped to ``None`` are removed; columns mapped to a name are
    renamed to it. Mapping entries for columns that are not present in the
    frame are skipped.

    Args:
        df: The input pandas DataFrame (left unmodified).

    Returns:
        A new pandas DataFrame with the mapping applied.
    """
    unwanted = []
    renames = {}
    for source_name, target_name in COLUMN_MAPPING.items():
        if source_name not in df.columns:
            continue
        if target_name is None:
            unwanted.append(source_name)
        else:
            renames[source_name] = target_name

    result = df.copy()
    result = result.drop(columns=unwanted, errors="ignore")
    result = result.rename(columns=renames, errors="ignore")
    return result
234
+
235
+
236
def clean_data(df):
    """
    Performs several data cleaning operations on a pandas DataFrame.

    - Converts 'Yes'/'No' in 'used' to a nullable boolean column (any other
      value becomes <NA>).
    - Fills NaN values in string (object) columns with empty strings.
    - Fills NaN values in numeric columns with pandas.NA.
    - Drops rows where both 'flow' and 'gage-height' are NaN.

    Args:
        df (pd.DataFrame): The input DataFrame to clean (left unmodified).

    Returns:
        tuple: ``(cleaned, dropped)`` where ``cleaned`` is the cleaned
        DataFrame and ``dropped`` is the number of measurements flagged as
        not used. Those rows are counted only; they stay in ``cleaned``.
    """
    df_cleaned = df.copy()
    dropped = 0

    if "used" in df_cleaned.columns:
        # Replace the column outright. Assigning through ``.loc[:, "used"]``
        # writes into the existing object column, which the string conversion
        # below would then turn into the truthy text "True"/"False".
        df_cleaned["used"] = (
            df_cleaned["used"].map({"Yes": True, "No": False}).astype(pd.BooleanDtype())
        )
        # Count after mapping; before it the values are still 'Yes'/'No'.
        dropped = int(df_cleaned["used"].eq(False).sum())
        logging.info(f"Found {dropped} measurements flagged as not used")

    string_cols = df_cleaned.select_dtypes(include="object").columns
    numeric_cols = df_cleaned.select_dtypes(include=np.number).columns

    if not string_cols.empty:
        df_cleaned[string_cols] = df_cleaned[string_cols].astype("string").fillna("")
    if not numeric_cols.empty:
        df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(pd.NA)

    if "flow" in df_cleaned.columns and "gage-height" in df_cleaned.columns:
        both_missing = df_cleaned[["flow", "gage-height"]].isna().all(axis=1)
        df_cleaned = df_cleaned[~both_missing].copy()
    elif "flow" in df_cleaned.columns or "gage-height" in df_cleaned.columns:
        logging.warning(
            "Only one of 'flow' or 'gage-height' columns exists. Cannot perform combined NaN drop."
        )

    return df_cleaned, dropped
278
+
279
+
280
def process_usgs_data(df: pd.DataFrame) -> "tuple[pd.DataFrame, int]":
    """
    Orchestrates the processing of USGS data by applying a series of transformations:
    1. Converts 'measurement_dt' to UTC and adds 'utc_time' column.
    2. Renames and drops columns according to a predefined mapping.
    3. Performs general data cleaning (boolean mapping, NaN handling, row dropping).

    Args:
        df (pd.DataFrame): The input DataFrame containing USGS data. It is not
            modified; convert_to_utc works on its own copy.

    Returns:
        tuple: ``(processed, dropped)`` - the fully processed DataFrame and
        the count returned by clean_data.
    """
    df_processed = convert_to_utc(df)
    df_processed = rename_and_drop_columns(df_processed)
    df_processed, dropped = clean_data(df_processed)

    return df_processed, dropped
300
+
301
+
302
def remove_nan_values(data):
    """
    Recursively strip empty entries from nested dicts and lists.

    Dictionary entries whose value is None, a float NaN, or an empty string
    are removed. List elements are removed only when they are None. Any other
    value is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if value is None:
                continue
            if isinstance(value, float) and math.isnan(value):
                continue
            if isinstance(value, str) and value == "":
                continue
            cleaned[key] = remove_nan_values(value)
        return cleaned

    if isinstance(data, list):
        kept = []
        for item in data:
            if item is not None:
                kept.append(remove_nan_values(item))
        return kept

    return data
317
+
318
+
319
def check_single_row_for_duplicates(row_to_check, df_existing):
    """
    Checks a single row for duplicates based on "number" and "instant"
    against df_existing, and identifies differences if an instant-based
    duplicate is found.

    Args:
        row_to_check: A pandas Series or a DataFrame with a single row
            representing the item to check.
        df_existing: The DataFrame to compare against. It is not modified;
            'number' and 'instant' are converted on a copy.

    Returns:
        A tuple containing:
        - original_row_passed_in: The object that was passed in, unchanged.
        - is_rejected: True if the measurement number is not numeric, if the
          number already exists, or if an existing instant lies within five
          minutes of the incoming one; False otherwise.
        - df_differences: A DataFrame with columns
          ['Column Name', 'Incoming Value', 'Existing Value'] listing value
          differences against the closest existing row. Populated only for
          an instant-based rejection that has differences; empty otherwise.

    Raises:
        ValueError: If row_to_check is neither a Series nor a one-row DataFrame.
    """
    if isinstance(row_to_check, pd.Series):
        incoming = row_to_check.to_frame().T
    elif isinstance(row_to_check, pd.DataFrame) and len(row_to_check) == 1:
        incoming = row_to_check.copy()
    else:
        raise ValueError(
            "row_to_check must be a pandas Series or a DataFrame with a single row."
        )

    df_differences = pd.DataFrame(
        columns=["Column Name", "Incoming Value", "Existing Value"]
    )

    if df_existing.empty:
        return row_to_check, False, df_differences

    # Sometimes USGS measurement numbers won't resolve to a number. Such a row
    # is rejected here; indexing into the emptied frame would otherwise raise.
    incoming_number = pd.to_numeric(incoming["number"], errors="coerce")
    if incoming_number.isna().any():
        logging.info(
            f"Can't resolve measurement numbers {incoming['number'].values} to number. Won't store those measurements"
        )
        return row_to_check, True, df_differences

    existing = df_existing.copy()
    existing["number"] = existing["number"].astype(str)
    existing["instant"] = pd.to_datetime(existing["instant"])

    current_number = incoming_number.astype(str).iloc[0]
    # NOTE(review): the subtraction below needs both sides to agree on
    # timezone awareness - confirm against what cwms.get_measurements returns.
    current_instant = pd.to_datetime(incoming["instant"]).iloc[0]

    if current_number in existing["number"].values:
        return row_to_check, True, df_differences

    time_diffs = (existing["instant"] - current_instant).abs()
    five_minutes = pd.Timedelta(minutes=5)

    # A NaT minimum compares as False, so the row is accepted in that case.
    if not (time_diffs.min() <= five_minutes):
        return row_to_check, False, df_differences

    closest_existing_row = existing.loc[time_diffs.idxmin()]

    def is_plain_number(value):
        """Return True for numeric scalars, excluding booleans."""
        return pd.api.types.is_number(value) and not pd.api.types.is_bool(value)

    diff_records = []
    for col in incoming.columns:
        if col in ("number", "instant"):
            continue

        current_val = incoming[col].iloc[0]
        # .get() yields None for columns the existing data does not have.
        existing_val = closest_existing_row.get(col)
        current_missing = pd.isna(current_val)
        existing_missing = pd.isna(existing_val)

        if current_missing and existing_missing:
            continue

        if current_missing != existing_missing:
            differs = True
        elif is_plain_number(current_val) and is_plain_number(existing_val):
            # Compare the scalar values themselves with a small tolerance.
            differs = abs(current_val - existing_val) > 1e-6
        else:
            differs = current_val != existing_val

        if differs:
            diff_records.append(
                {
                    "Column Name": col,
                    "Incoming Value": current_val,
                    "Existing Value": existing_val,
                }
            )

    if diff_records:
        df_differences = pd.DataFrame(diff_records)

    return row_to_check, True, df_differences
463
+
464
+
465
def check_and_drop_duplicates(df_store, df_existing):
    """
    Checks for duplicates based on "number" and "instant" columns and drops them.

    Neither input frame is modified. When df_existing is empty, df_store is
    returned as passed in, together with two empty DataFrames.

    Args:
        df_store: The DataFrame of incoming measurements to filter.
        df_existing: The DataFrame of existing measurements to compare against.

    Returns:
        A tuple containing:
        - df_store: The incoming rows that remain. Rows whose "number" is not
          numeric are removed as well; "number" is returned as str.
        - df_rejected_number: DataFrame containing rows rejected due to duplicate "number".
        - df_rejected_instant: DataFrame containing rows rejected due to "instant" within 5 minutes of existing.
    """
    if df_existing.empty:
        return df_store, pd.DataFrame(), pd.DataFrame()

    # Sometimes USGS measurement numbers won't resolve to a number; drop those rows.
    invalid = pd.to_numeric(df_store["number"], errors="coerce").isna()
    if invalid.any():
        logging.info(
            f"Can't resolve measurement numbers {df_store.loc[invalid, 'number'].values} to number. Won't store those measurements"
        )

    # Work on a copy so the caller's frame is left untouched.
    df_store = df_store.loc[~invalid].copy()

    # Convert after dropping the invalid rows: a coerced NaN would turn the
    # whole column into floats and integers would then render as "12.0".
    df_store["number"] = pd.to_numeric(df_store["number"]).astype(str)
    df_store["instant"] = pd.to_datetime(df_store["instant"])

    # Compare numbers as strings on both sides, like check_single_row_for_duplicates.
    existing_numbers = df_existing["number"].astype(str)
    existing_instants = pd.to_datetime(df_existing["instant"])

    # Check for duplicate numbers.
    mask_number = df_store["number"].isin(existing_numbers)
    df_rejected_number = df_store[mask_number].copy()
    df_store = df_store[~mask_number]

    # Check for instants within 5 minutes of any existing measurement.
    # A NaT on either side compares as False, so such rows are kept.
    five_minutes = pd.Timedelta(minutes=5)
    too_close = np.array(
        [
            (existing_instants - instant).abs().min() <= five_minutes
            for instant in df_store["instant"]
        ],
        dtype=bool,
    )
    df_rejected_instant = df_store[too_close].copy()
    df_store = df_store[~too_close].copy()

    return df_store, df_rejected_number, df_rejected_instant
529
+
530
+
531
def create_json_from_row(row):
    """
    Transforms a DataFrame row into the measurement JSON structure.

    Args:
        row: A mapping-like row (for example a pandas Series) providing the
            keys 'instant', 'used', 'agency', 'party', 'office', 'name',
            'number', 'gage-height', 'flow', 'quality', 'control-condition',
            'flow-adjustment', 'delta-height' and 'delta-time'.

    Returns:
        dict: The measurement payload. Missing gage height and flow are
        written as CWMS_MISSING_VALUE; every other missing value (None, NaN,
        NA, NaT or empty string) is left out of the result.

    Raises:
        KeyError: If a required key other than 'instant' is absent.
    """

    def text_or_none(value):
        """Return str(value), or None so a missing value is not sent as 'nan'."""
        return None if pd.isna(value) else str(value)

    def float_or(value, default=None):
        """Return float(value), or ``default`` when the value is missing."""
        return float(value) if pd.notna(value) else default

    instant_value = None
    try:
        instant = pd.to_datetime(row["instant"])
        if pd.isna(instant):
            # NaT.isoformat() would produce the literal string "NaT".
            logging.warning(
                f"Instant '{row.get('instant')}' is missing. Setting to None."
            )
        else:
            instant_value = instant.isoformat()
    except Exception as e:
        logging.warning(
            f"Could not convert instant '{row.get('instant')}' to ISO format: {e}. Setting to None."
        )

    # Agency values containing "unsp" are recorded as USGS.
    agency = text_or_none(row["agency"])
    if agency is not None and "unsp" in agency.lower():
        agency = "USGS"

    # Control-condition values containing "unsp" are normalised to "Unspecified".
    control_condition = text_or_none(row["control-condition"])
    if control_condition is not None and "unsp" in control_condition.lower():
        control_condition = "Unspecified"

    json_data = {
        "height-unit": "ft",
        "flow-unit": "cfs",
        # A missing flag is treated as "not used".
        "used": bool(row["used"]) if pd.notna(row["used"]) else False,
        "agency": agency,
        "party": text_or_none(row["party"]),
        "wm-comments": f"imported from get_USGS_measurements.py {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%SZ')}",
        "instant": instant_value,
        "id": {"office-id": str(row["office"]), "name": str(row["name"])},
        "number": str(row["number"]),
        "streamflow-measurement": {
            "gage-height": float_or(row["gage-height"], CWMS_MISSING_VALUE),
            "flow": float_or(row["flow"], CWMS_MISSING_VALUE),
            "quality": text_or_none(row["quality"]),
        },
        "usgs-measurement": {
            "control-condition": control_condition,
            "flow-adjustment": text_or_none(row["flow-adjustment"]),
            "delta-height": float_or(row["delta-height"]),
            "delta-time": float_or(row["delta-time"]),
        },
    }

    # Strip every None / NaN / empty-string entry in one pass at the end.
    return remove_nan_values(json_data)
596
+
597
+
598
def realtime_mode(DAYS_BACK_COLLECTED, DAYS_BACK_MODIFIED, measurement_site_df):
    """
    Fetch recent USGS discharge measurements and store them in CWMS one by one.

    Measurements whose USGS site has no row in ``measurement_site_df`` are
    skipped. For the others, the site's ``attribute_x`` value decides the
    behaviour: a value of 1 stores the measurement with
    ``fail_if_exists=False``; any other value stores it only when
    check_single_row_for_duplicates does not reject it. A summary with
    per-office counters is logged at the end.

    Args:
        DAYS_BACK_COLLECTED: Length, in days, of the ``period`` requested
            from NWIS.
        DAYS_BACK_MODIFIED: Value sent as the ``sv_md`` NWIS query parameter.
        measurement_site_df: DataFrame with the columns 'alias-id',
            'location-id', 'office-id_x' and 'attribute_x'.

    Raises:
        SystemExit: With code 1 when the NWIS query fails, or code 0 when it
            returns no measurements.
    """
    execution_date = datetime.now()
    startDT = execution_date - timedelta(DAYS_BACK_COLLECTED)

    logging.info(
        f"Fetching USGS discharge measurements from {startDT.isoformat()} (modified in last {DAYS_BACK_MODIFIED} days)..."
    )
    try:
        df_meas_usgs, meta = nwis.get_discharge_measurements(
            period=f"P{DAYS_BACK_COLLECTED}D",
            channel_rdb_info="1",
            sv_md_interval="DAY",
            sv_md=f"{DAYS_BACK_MODIFIED}",
            sv_md_minutes="2",
        )
        logging.info(f"Queried {meta}")
    except Exception as e:
        logging.critical(f"Failed to fetch USGS measurements: {e}. Exiting.")
        # SystemExit is raised directly: the ``exit`` builtin is injected by
        # the ``site`` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    if df_meas_usgs.empty:
        logging.info("No new USGS measurements found to process.")
        raise SystemExit(0)

    logging.info(f"Processing {len(df_meas_usgs)} USGS measurements...")
    df_meas_usgs, _ = process_usgs_data(df_meas_usgs)
    total_usgs_measurements_processed = 0
    total_usgs_measurements_skipped_no_cwms_mapping = 0

    # This will store stats like: {'office_id_MVP': {'attempted': X, 'successful': Y, 'rejected': Z}}
    office_store_stats = defaultdict(lambda: defaultdict(int))
    for _, usgs_row in df_meas_usgs.iterrows():
        total_usgs_measurements_processed += 1
        site_no = usgs_row.usgs_site_no

        site_filter_df = measurement_site_df[measurement_site_df["alias-id"] == site_no]
        # Skip if the site isn't in the measurement group.
        if len(site_filter_df) == 0:
            total_usgs_measurements_skipped_no_cwms_mapping += 1
            continue

        cwms_loc = site_filter_df["location-id"].values[0]
        office_id = site_filter_df["office-id_x"].values[0]
        # Assuming 1 means overwrite, 0 means don't overwrite.
        overwrite_flag = site_filter_df["attribute_x"].values[0]

        # Work on a copy of the row so the source frame is not modified.
        usgs_row_for_json = usgs_row.copy()
        usgs_row_for_json["name"] = cwms_loc
        usgs_row_for_json["office"] = office_id

        data = create_json_from_row(usgs_row_for_json)
        stats = office_store_stats[office_id]
        stats["attempted"] += 1

        # Get existing measurements at the site.
        df_existing = pd.DataFrame()
        try:
            existing_measurements = cwms.get_measurements(
                location_id_mask=cwms_loc, office_id=office_id
            )
            if existing_measurements and existing_measurements.df is not None:
                df_existing = existing_measurements.df
        except Exception as e:
            # Best effort: carry on as if nothing exists, but keep the cause.
            logging.error(
                f"An unexpected error occurred while getting existing measurements for {cwms_loc} ({office_id}): {e}. Assuming no existing measurements."
            )

        _, is_rejected, df_differences = check_single_row_for_duplicates(
            usgs_row_for_json, df_existing
        )

        log_prefix = f"USGS site {site_no} -> CWMS loc {cwms_loc} ({office_id}) measurement collected at {usgs_row.instant}"

        if overwrite_flag == 1:
            try:
                logging.info(f"{log_prefix} (overwrite enabled). Storing.")
                cwms.store_measurements(data=[data], fail_if_exists=False)
                stats["successful"] += 1
                if not df_differences.empty:
                    logging.info(
                        f"Differences found between stored data and new data for {log_prefix}:\n{df_differences.to_string()}"
                    )
            except requests.exceptions.RequestException as e:
                # With overwrite enabled a failure is an error, not a
                # 'rejection' due to existing data.
                logging.error(f"CWMS API network error storing {log_prefix}: {e}")
            except Exception as e:
                logging.error(f"Unexpected error storing {log_prefix}: {e}")
            continue

        # overwrite_flag is 0 or some other value, meaning don't overwrite.
        if is_rejected:
            logging.warning(
                f"{log_prefix} has same number field ({usgs_row.number}) or similar collection time as existing measurement. Not storing."
            )
            stats["rejected"] += 1
            continue

        try:
            logging.info(f"{log_prefix}. Storing.")
            # fail_if_exists is left at the library default here.
            cwms.store_measurements(data=[data])
            stats["successful"] += 1
            # df_differences is only populated for rejected rows, so there is
            # nothing to report on this path.
        except requests.exceptions.RequestException as e:
            logging.warning(
                f"CWMS API network error (likely duplicate or conflict) storing {log_prefix}: {e}"
            )
            stats["rejected"] += 1
        except Exception as e:
            logging.error(f"Unexpected error storing {log_prefix}: {e}")

    logging.info("-" * 50)
    logging.info("Processing Summary:")
    logging.info(f"Total USGS measurements fetched: {len(df_meas_usgs)}")
    logging.info(
        f"Total unique USGS measurements processed for CWMS: {total_usgs_measurements_processed}"
    )
    logging.info(
        f"Total USGS measurements skipped (no CWMS mapping): {total_usgs_measurements_skipped_no_cwms_mapping}"
    )

    logging.info("\nCWMS Store Statistics Per Office:")
    # Global totals are derived from the per-office counters for consistency.
    global_attempted = sum(s["attempted"] for s in office_store_stats.values())
    global_successful = sum(s["successful"] for s in office_store_stats.values())
    global_rejected = sum(s["rejected"] for s in office_store_stats.values())

    for office, office_stats in sorted(office_store_stats.items()):
        logging.info(f"  Office: {office}")
        logging.info(f"    Attempted: {office_stats['attempted']}")
        logging.info(f"    Successful: {office_stats['successful']}")
        logging.info(f"    Rejected (Duplicate/Conflict): {office_stats['rejected']}")

    logging.info("\nOverall CWMS Store Statistics:")
    logging.info(
        f"Total CWMS store attempts (across all configurations): {global_attempted}"
    )
    logging.info(f"Total CWMS stores successful: {global_successful}")
    logging.info(f"Total CWMS stores rejected (duplicate/conflict): {global_rejected}")
    logging.info("-" * 50)
757
+
758
+
759
+ def backfill_mode(BACKFILL_LIST, measurement_site_df):
760
+ # Initialize summary tracking dictionaries
761
+ site_summary = {} # Will store stats for each site
762
+ overall_failed_stores = [] # Will store all failed measurement details
763
+
764
+ for usgs_site in BACKFILL_LIST:
765
+ # Initialize site-specific counters
766
+ site_stats = {
767
+ "measurements_fetched": 0,
768
+ "measurements_saved": 0,
769
+ "measurements_failed": 0,
770
+ "failed_details": [],
771
+ }
772
+
773
+ cwms_loc = measurement_site_df[measurement_site_df["alias-id"] == usgs_site][
774
+ "location-id"
775
+ ].values[0]
776
+ OFFICE = measurement_site_df[measurement_site_df["alias-id"] == usgs_site][
777
+ "office-id_x"
778
+ ].values[0]
779
+ overwrite_code = int(
780
+ measurement_site_df[measurement_site_df["alias-id"] == usgs_site][
781
+ "attribute_x"
782
+ ].values[0]
783
+ )
784
+ logging.info(
785
+ f"Fetching USGS POR discharge measurements for {usgs_site} {cwms_loc})..."
786
+ )
787
+ try:
788
+ df_meas_usgs, meta = nwis.get_discharge_measurements(
789
+ sites=[usgs_site],
790
+ channel_rdb_info="1",
791
+ )
792
+ logging.info(f"Queried {meta}")
793
+ site_stats["measurements_fetched"] = len(df_meas_usgs)
794
+ except Exception as e:
795
+ logging.critical(f"Failed to fetch USGS measurements: {e}. Exiting.")
796
+ df_meas_usgs = pd.DataFrame()
797
+
798
+ if df_meas_usgs.empty:
799
+ logging.info("No new USGS measurements found to process.")
800
+ site_summary[f"{usgs_site} ({cwms_loc})"] = site_stats
801
+ continue # Continue to next site instead of exiting
802
+
803
+ logging.info(f"Processing {len(df_meas_usgs)} USGS measurements...")
804
+ df_meas_usgs, dropped = process_usgs_data(df_meas_usgs)
805
+
806
+ # This will store stats like: {'office_id_MVP': {'attempted': X, 'successful': Y, 'rejected': Z}}
807
+ office_store_stats = defaultdict(lambda: defaultdict(int))
808
+
809
+ df_meas_usgs["location-id"] = df_meas_usgs["name"] = cwms_loc
810
+ df_meas_usgs["office"] = OFFICE
811
+
812
+ log_prefix = (
813
+ f"USGS site {usgs_site} -> CWMS loc {cwms_loc} ({OFFICE}) POR measurements"
814
+ )
815
+
816
+ # get existing measurements at site
817
+ df_existing = pd.DataFrame() # Initialize as empty
818
+ try:
819
+ existing_measurements = cwms.get_measurements(
820
+ location_id_mask=cwms_loc, office_id=OFFICE
821
+ )
822
+ if existing_measurements and existing_measurements.df is not None:
823
+ df_existing = existing_measurements.df
824
+ except Exception as e:
825
+ logging.error(
826
+ f"An unexpected error occurred while getting existing measurements for {cwms_loc} ({OFFICE}). Assuming no existing measurements."
827
+ )
828
+ if overwrite_code != 1:
829
+ logging.info(
830
+ "Overwrite flag is off. Filtering out any conflicting measurements"
831
+ )
832
+ df_store, df_rejected_number, df_rejected_instant = (
833
+ check_and_drop_duplicates(df_meas_usgs, df_existing)
834
+ )
835
+
836
+ if not df_rejected_number.empty:
837
+ logging.info(
838
+ f"The following measurements were rejected because of duplicate measurement numbers {df_rejected_number}"
839
+ )
840
+ if not df_rejected_instant.empty:
841
+ logging.info(
842
+ f"The following measurements were rejected because of duplicate measurement numbers {df_rejected_instant}"
843
+ )
844
+ else:
845
+ df_store = df_meas_usgs.copy()
846
+
847
+ json_list = []
848
+ for _, usgs_row in df_store.iterrows():
849
+ json_list.append(create_json_from_row(usgs_row))
850
+
851
+ # store the measurement
852
+ try:
853
+ logging.info(f"{log_prefix} Storing.")
854
+ cwms.store_measurements(data=json_list, fail_if_exists=False)
855
+ logging.info("-" * 50)
856
+ office_store_stats[OFFICE]["successful"] += 1
857
+ site_stats["measurements_saved"] = len(json_list)
858
+ except requests.exceptions.RequestException as e:
859
+ logging.error(f"CWMS API network error storing {log_prefix}: {e}")
860
+ # Track the bulk failure
861
+ site_stats["measurements_failed"] = len(json_list)
862
+ for data in json_list:
863
+ failure_detail = {
864
+ "site": f"{usgs_site} ({cwms_loc})",
865
+ "measurement_number": data.get("number", "Unknown"),
866
+ "instant": data.get("instant", "Unknown"),
867
+ "error": f"Network error: {e}",
868
+ }
869
+ site_stats["failed_details"].append(failure_detail)
870
+ overall_failed_stores.append(failure_detail)
871
+ except Exception as e:
872
+ logging.error(f"Unexpected error storing {log_prefix}: {e}")
873
+ logging.info("Storing one measurement at a time")
874
+
875
+ measurements_saved_individually = 0
876
+ measurements_failed_individually = 0
877
+
878
+ for data in json_list:
879
+ try:
880
+ cwms.store_measurements(data=[data], fail_if_exists=False)
881
+ measurements_saved_individually += 1
882
+ except Exception as individual_error:
883
+ measurements_failed_individually += 1
884
+ inst = data.get("instant", "Unknown")
885
+ number = data.get("number", "Unknown")
886
+ logging.error(
887
+ f"Could not store measurement {number} collected at {inst} at {cwms_loc}"
888
+ )
889
+
890
+ failure_detail = {
891
+ "site": f"{usgs_site} ({cwms_loc})",
892
+ "measurement_number": number,
893
+ "instant": inst,
894
+ "error": str(individual_error),
895
+ }
896
+ site_stats["failed_details"].append(failure_detail)
897
+ overall_failed_stores.append(failure_detail)
898
+
899
+ site_stats["measurements_saved"] = measurements_saved_individually
900
+ site_stats["measurements_failed"] = measurements_failed_individually
901
+
902
+ # Store site summary
903
+ site_summary[f"{usgs_site} ({cwms_loc})"] = site_stats
904
+
905
+ logging.info("Processing Summary for this site:")
906
+ logging.info(
907
+ f"Total USGS measurements fetched: {site_stats['measurements_fetched']}"
908
+ )
909
+ logging.info(f"Total measurements saved: {site_stats['measurements_saved']}")
910
+ logging.info(f"Total measurements failed: {site_stats['measurements_failed']}")
911
+
912
+ # Print overall processing summary
913
+ logging.info("=" * 60)
914
+ logging.info("OVERALL PROCESSING SUMMARY")
915
+ logging.info("=" * 60)
916
+
917
+ # Summary by site
918
+ logging.info("MEASUREMENTS SAVED BY SITE:")
919
+ logging.info("-" * 40)
920
+ total_saved_all_sites = 0
921
+ total_failed_all_sites = 0
922
+
923
+ for site_name, stats in site_summary.items():
924
+ logging.info(f"{site_name}:")
925
+ logging.info(f" - Fetched: {stats['measurements_fetched']}")
926
+ logging.info(f" - Saved: {stats['measurements_saved']}")
927
+ logging.info(f" - Failed: {stats['measurements_failed']}")
928
+ total_saved_all_sites += stats["measurements_saved"]
929
+ total_failed_all_sites += stats["measurements_failed"]
930
+ logging.info("")
931
+
932
+ logging.info(f"TOTAL MEASUREMENTS SAVED ACROSS ALL SITES: {total_saved_all_sites}")
933
+ logging.info(
934
+ f"TOTAL MEASUREMENTS FAILED ACROSS ALL SITES: {total_failed_all_sites}"
935
+ )
936
+
937
+ # Summary of failed measurements
938
+ if overall_failed_stores:
939
+ logging.info("")
940
+ logging.info("FAILED MEASUREMENT STORES SUMMARY:")
941
+ logging.info("-" * 40)
942
+ logging.info(f"Total failed measurements: {len(overall_failed_stores)}")
943
+
944
+ # Group failures by site
945
+ failures_by_site = defaultdict(list)
946
+ for failure in overall_failed_stores:
947
+ failures_by_site[failure["site"]].append(failure)
948
+
949
+ for site, failures in failures_by_site.items():
950
+ logging.info(f"\n{site} - {len(failures)} failed measurements:")
951
+ for failure in failures[:5]: # Show first 5 failures per site
952
+ logging.info(
953
+ f" - Measurement {failure['measurement_number']} at {failure['instant']}"
954
+ )
955
+ if len(failures) > 5:
956
+ logging.info(f" - ... and {len(failures) - 5} more failures")
957
+ else:
958
+ logging.info("")
959
+ logging.info("No failed measurement stores!")
960
+
961
+ logging.info("=" * 60)