mpcaHydro 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/equis.py ADDED
@@ -0,0 +1,488 @@
+ from datetime import datetime, timezone, timedelta
+ import pandas as pd
+ from typing import Union
+ import oracledb
+ import duckdb
+
+ CONNECTION = None
+
+ CAS_RN_MAP = {'479-61-8': 'CHLA',
+               'CHLA-CORR': 'CHLA',
+               'BOD': 'BOD',
+               'NO2NO3': 'N',  # TODO: change to 'NO2NO3'
+               '14797-55-8': 'NO3',
+               '14797-65-0': 'NO2',
+               '14265-44-2': 'OP',
+               'N-KJEL': 'TKN',
+               'PHOSPHATE-P': 'TP',
+               '7723-14-0': 'TP',
+               'SOLIDS-TSS': 'TSS',
+               'TEMP-W': 'WT',
+               '7664-41-7': 'NH3'}
+
+ def connect(user: str, password: str, host: str = "DELTAT", port: int = 1521, sid: str = "DELTAT"):
+     '''Create and return an Oracle database connection.'''
+     global CONNECTION
+     CONNECTION = oracledb.connect(user=user,
+                                   password=password,
+                                   host=host,
+                                   port=port,
+                                   sid=sid)
+     return CONNECTION
+
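+ # Illustrative usage (credentials hypothetical); host, port, and sid default
+ # to the DELTAT service:
+ #   conn = connect(user='me', password='secret')
+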
+ def close_connection():
+     '''Close the global Oracle database connection if it exists.'''
+     global CONNECTION
+     if CONNECTION:
+         CONNECTION.close()
+         CONNECTION = None
+
+
+ def test_connection():
+     '''Placeholder for testing the Oracle DB connection.'''
+     raise NotImplementedError("This function is a placeholder for testing Oracle DB connection.")
+     # Unreachable sketch of the intended check, kept for reference:
+     # try:
+     #     connection = oracledb.connect(user="your_username",
+     #                                   password="your_password",
+     #                                   host="your_host",
+     #                                   port=1521,
+     #                                   sid="your_sid")
+     #     print("Successfully connected to Oracle Database")
+     #     # Perform database operations here
+     #     # ...
+     #     if connection:
+     #         connection.close()
+     #         print("Connection closed")
+     # except oracledb.Error as e:
+     #     print(f"Error connecting to Oracle Database: {e}")
+
+
+ def make_placeholders(items):
+     '''Create SQL placeholders and bind values for a list of items.'''
+     # Create placeholders like :id0, :id1, :id2
+     placeholders = ', '.join(f':id{i}' for i in range(len(items)))
+     # Create dictionary of bind values
+     binds = {f'id{i}': val for i, val in enumerate(items)}
+     return placeholders, binds
+
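+ # For example (illustrative values):
+ #   make_placeholders(['S000-001', 'S000-002'])
+ #   -> (':id0, :id1', {'id0': 'S000-001', 'id1': 'S000-002'})
+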
+ def to_dataframe(odb_cursor):
+     '''Convert Oracle cursor results to a pandas DataFrame.'''
+     column_names = [description[0] for description in odb_cursor.description]
+     rows = odb_cursor.fetchall()
+     df = pd.DataFrame(rows, columns=column_names)
+     return df
+
+ #%% Query for station locations with HSPF related constituents
+
+ def download(station_ids):
+     '''Download data for the given station IDs from the Oracle database.
+     Pulls from the data access layer (DAL) EQuIS result view, restricted to
+     river/stream locations and HSPF-related constituents.'''
+     placeholders, binds = make_placeholders(station_ids)
+     query = f"""
+     SELECT
+         mpca_dal.eq_fac_station.latitude,
+         mpca_dal.eq_fac_station.longitude,
+         mpca_dal.eq_fac_station.wid_list,
+         mpca_dal.eq_sample.sample_method,
+         mpca_dal.eq_sample.sample_remark,
+         mpca_dal.mv_eq_result.*
+     FROM
+         mpca_dal.mv_eq_result
+     LEFT JOIN mpca_dal.eq_fac_station
+         ON mpca_dal.mv_eq_result.sys_loc_code = mpca_dal.eq_fac_station.sys_loc_code
+         AND mpca_dal.mv_eq_result.facility_id = mpca_dal.eq_fac_station.facility_id
+     LEFT JOIN mpca_dal.eq_sample
+         ON mpca_dal.mv_eq_result.sample_id = mpca_dal.eq_sample.sample_id
+     WHERE
+         mpca_dal.mv_eq_result.cas_rn IN ('479-61-8',
+                                          'CHLA-CORR',
+                                          'BOD',
+                                          'NO2NO3',
+                                          '14797-55-8',
+                                          '14797-65-0',
+                                          '14265-44-2',
+                                          'N-KJEL',
+                                          'PHOSPHATE-P',
+                                          '7723-14-0',
+                                          'SOLIDS-TSS',
+                                          'TEMP-W',
+                                          '7664-41-7',
+                                          'FLOW')
+     AND mpca_dal.eq_fac_station.loc_type = 'River/Stream'
+     AND mpca_dal.mv_eq_result.approval_code = 'Final'
+     AND mpca_dal.mv_eq_result.reportable_result = 'Y'
+     AND mpca_dal.mv_eq_result.facility_id IN (1, 33836701)
+     AND mpca_dal.eq_sample.sample_method IN ('G-EVT', 'G', 'FIELDMSROBS', 'LKSURF1M', 'LKSURF2M', 'LKSURFOTH')
+     AND mpca_dal.mv_eq_result.sys_loc_code IN ({placeholders})
+     """
+     with CONNECTION.cursor() as cursor:
+         cursor.execute(query, binds)
+         return to_dataframe(cursor)
+
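+ # Illustrative usage (station IDs hypothetical): after connect(...),
+ #   df = download(['S000-001', 'S000-002'])
+ # returns one row per final, reportable result at those river/stream stations.
+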
+ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset: timezone) -> datetime:
+     """
+     Interpret naive_dt (datetime or ISO string) using tz_label ('CST', 'CDT', or 'UTC').
+     - If tz_label == 'CST' -> interpret as UTC-6
+     - If tz_label == 'CDT' -> interpret as UTC-5
+     - If tz_label == 'UTC' -> interpret as UTC
+     - Any other label raises ValueError
+
+     WARNING: This function uses a replace method that assumes the input datetime
+     is naive (no tzinfo). If the input datetime already has tzinfo, this will
+     lead to incorrect results.
+
+     Returns a naive datetime that expresses the same instant at target_offset
+     (tzinfo stripped).
+     """
+     if isinstance(naive_dt, str):
+         naive = pd.to_datetime(naive_dt).to_pydatetime()
+     elif isinstance(naive_dt, datetime):
+         naive = naive_dt
+     else:
+         raise TypeError("naive_dt must be datetime or str")
+
+     label = (tz_label or "").strip().upper()
+
+     if label == "CST":
+         src_tz = timezone(timedelta(hours=-6))
+     elif label == "CDT":
+         src_tz = timezone(timedelta(hours=-5))
+     elif label == 'UTC':
+         src_tz = timezone.utc
+     else:
+         raise ValueError(f"Unexpected timezone label: {tz_label}")
+
+     # attach the source tz (interpret naive as local time in src_tz)
+     aware_src = naive.replace(tzinfo=src_tz)
+
+     # convert the instant to the target offset, then strip tzinfo
+     # (stdlib datetimes have no tz_localize; use replace(tzinfo=None))
+     return aware_src.astimezone(target_offset).replace(tzinfo=None)
+
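+ # Worked example (illustrative): '2020-07-01 10:00' labeled 'CDT' is 15:00 UTC,
+ # i.e. 09:00 at a fixed UTC-6 target:
+ #   as_utc_offset('2020-07-01 10:00', 'CDT', timezone(timedelta(hours=-6)))
+ #   -> datetime(2020, 7, 1, 9, 0)
+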
+ def normalize_columns(df):
+     '''Select and rename the relevant columns from Equis data.'''
+     return df[['SYS_LOC_CODE',
+                'constituent',
+                'CAS_RN',
+                'datetime',
+                'RESULT_NUMERIC',
+                'RESULT_UNIT',
+                ]].rename(columns={'SYS_LOC_CODE': 'station_id',
+                                   'RESULT_NUMERIC': 'value',
+                                   'RESULT_UNIT': 'unit',
+                                   'CAS_RN': 'cas_rn'})
+
+ def normalize_timezone(df):
+     '''Normalize sample datetimes in Equis data to a fixed UTC-6 offset.'''
+     target_offset = timezone(timedelta(hours=-6))
+
+     def _conv(row):
+         try:
+             return as_utc_offset(row['SAMPLE_DATE_TIME'], row['SAMPLE_DATE_TIMEZONE'], target_offset)
+         except Exception:
+             return pd.NaT
+
+     df.loc[:, 'datetime'] = df.apply(_conv, axis=1)
+     return df
+
+ def convert_units(df):
+     '''Convert units in Equis data to standard units.'''
+     df['unit'] = df['unit'].str.lower()
+
+     # Convert ug/L to mg/L
+     mask_ugL = df['unit'] == 'ug/l'
+     df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
+     df.loc[mask_ugL, 'unit'] = 'mg/l'
+
+     # Convert mg/g to mg/L (assuming a density of 1 g/mL)
+     mask_mgg = df['unit'] == 'mg/g'
+     df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
+     df.loc[mask_mgg, 'unit'] = 'mg/l'
+
+     # Convert deg C to deg F
+     mask_degC = df['unit'].isin(['deg c', 'degc'])
+     df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9 / 5) + 32
+     df.loc[mask_degC, 'unit'] = 'degf'
+
+     return df
+
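+ # Worked example (illustrative): 250 ug/l becomes 0.25 mg/l, and a water
+ # temperature of 20 deg c becomes 68 degf.
+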
+ def map_constituents(df):
+     '''Map CAS_RN to standard constituent names in Equis data.'''
+     df['constituent'] = df['CAS_RN'].map(CAS_RN_MAP)
+     return df
+
+ def average_results(df):
+     '''Average samples by hour, station, and constituent.'''
+     df['datetime'] = df['datetime'].dt.round('h')
+     df['station_origin'] = 'equis'
+     return df.groupby(['station_id', 'datetime', 'constituent', 'unit', 'station_origin']).agg(
+         value=('value', 'mean')
+     ).reset_index()
+
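+ # Note: dt.round('h') rounds to the nearest hour, so (illustrative) samples at
+ # 10:20 and 10:25 both land in the 10:00 bin and are averaged together.
+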
+ def replace_nondetects(df):
+     '''Replace non-detect results (missing values) with 0 in Equis data.'''
+     df.loc[df['value'].isna(), 'value'] = 0
+     return df
+
+ def normalize(df):
+     '''Normalize Equis data: map constituents, normalize timezones,
+     select relevant columns, and convert units.'''
+     df = map_constituents(df)
+     df = normalize_timezone(df)
+     df = normalize_columns(df)
+     df = convert_units(df)
+     return df
+
+ def transform(df):
+     '''Transform Equis data: normalize, replace non-detects, and average to hourly values.'''
+     df = normalize(df)
+     df = replace_nondetects(df)
+     if not df.empty:
+         df = average_results(df)
+     return df
+
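+ # End-to-end sketch (hypothetical credentials and station IDs):
+ #   connect(user='me', password='secret')
+ #   hourly = transform(download(['S000-001', 'S000-002']))
+ #   close_connection()
+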
+ #%% Transformations using duckdb instead of pandas
+ # def transform_staging_to_hourly_cte(con: duckdb.DuckDBPyConnection,
+ #                                     source_table: str,
+ #                                     analytics_table: str):
+ #     """
+ #     Single-statement transformation using chained CTEs.
+ #     - Good when you want the whole logical pipeline in one place and avoid intermediate objects.
+ #     - Produces <analytics_table> as the final materialized table.
+ #     """
+ #     mapping_cases = " ".join([f"WHEN '{k}' THEN '{v}'" for k, v in CAS_RN_MAP.items()])
+ #     target_offset_hours = -6
+ #     # Example assumes source_table has: station_id, datetime, value (numeric), constituent, unit, station_origin
+ #     sql = f"""
+ #     CREATE OR REPLACE TABLE {analytics_table} AS
+ #     WITH
+ #     -- Step 1: normalize column names
+ #     normalized AS (
+ #         SELECT *,
+ #             SYS_LOC_CODE AS station_id,
+ #             SAMPLE_DATE_TIME AS datetime,
+ #             SAMPLE_DATE_TIMEZONE AS datetime_timezone,
+ #             RESULT_NUMERIC AS value,
+ #             RESULT_UNIT AS unit
+ #         FROM {source_table}),
+
+ #     -- Step 2: map constituents
+ #     constituents AS (
+ #         SELECT *,
+ #             CASE CAS_RN
+ #                 {mapping_cases}
+ #                 ELSE NULL
+ #             END AS constituent
+ #         FROM normalized),
+
+ #     -- Step 3: convert units
+ #     conversions AS (
+ #         SELECT *,
+ #             CASE
+ #                 WHEN LOWER(unit) = 'ug/l' THEN value / 1000
+ #                 WHEN LOWER(unit) = 'mg/g' THEN value * 1000
+ #                 WHEN LOWER(unit) IN ('deg c', 'degc') THEN (value * 9/5) + 32
+ #                 ELSE value
+ #             END AS value,
+ #             CASE
+ #                 WHEN LOWER(unit) = 'ug/l' THEN 'mg/L'
+ #                 WHEN LOWER(unit) = 'mg/g' THEN 'mg/L'
+ #                 WHEN LOWER(unit) IN ('deg c', 'degc') THEN 'degF'
+ #                 ELSE unit
+ #             END AS unit
+ #         FROM constituents),
+
+ #     -- Step 4: normalize timezone
+ #     timezones AS (
+ #         SELECT *,
+ #             CASE
+ #                 WHEN datetime_timezone = 'CST' THEN
+ #                     (datetime AT TIME ZONE INTERVAL '-6 hours') AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #                 WHEN datetime_timezone = 'CDT' THEN
+ #                     (datetime AT TIME ZONE INTERVAL '-5 hours') AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #                 ELSE
+ #                     datetime AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #             END AS datetime
+ #         FROM conversions),
+
+ #     -- Step 5: average to hourly
+ #     hourly AS (
+ #         SELECT
+ #             station_id,
+ #             DATE_TRUNC('hour', datetime + INTERVAL '30 minute') AS datetime,
+ #             constituent,
+ #             unit,
+ #             'equis' AS station_origin,
+ #             AVG(value) AS value
+ #         FROM timezones
+ #         GROUP BY station_id, datetime, constituent, unit
+ #     )
+
+ #     SELECT * FROM hourly
+ #     """
+ #     con.execute(sql)
+ #     return 0
+
+
+ # #%%
+
+ # def normalize_columns(con: duckdb.DuckDBPyConnection, table_name: str):
+ #     '''
+ #     Select relevant columns from Equis data using DuckDB.
+ #     '''
+ #     con.execute(f"""
+ #         CREATE TEMP VIEW v_normalized AS
+ #         SELECT *,
+ #             SYS_LOC_CODE AS station_id,
+ #             SAMPLE_DATE_TIME AS datetime,
+ #             SAMPLE_DATE_TIMEZONE AS datetime_timezone,
+ #             RESULT_NUMERIC AS value,
+ #             RESULT_UNIT AS unit
+ #         FROM {table_name} e
+ #     """)
+
+ # def map_constituents_duckdb(con: duckdb.DuckDBPyConnection, table_name: str):
+ #     '''
+ #     Map CAS_RN to standard constituent names in Equis data using DuckDB.
+ #     '''
+ #     mapping_cases = " ".join([f"WHEN '{k}' THEN '{v}'" for k, v in CAS_RN_MAP.items()])
+ #     con.execute(f"""
+ #         CREATE TEMP VIEW v_constituents AS
+ #         SELECT *,
+ #             CASE CAS_RN
+ #                 {mapping_cases}
+ #                 ELSE NULL
+ #             END AS constituent
+ #         FROM v_normalized
+ #     """)
+
+ # def convert_units_duckdb(con: duckdb.DuckDBPyConnection, table_name: str):
+ #     '''
+ #     Convert units in Equis data to standard units using DuckDB.
+ #     '''
+ #     con.execute(f"""
+ #         CREATE TEMP VIEW v_conversions AS
+ #         SELECT *,
+ #             CASE
+ #                 WHEN LOWER(unit) = 'ug/l' THEN value / 1000
+ #                 WHEN LOWER(unit) = 'mg/g' THEN value * 1000
+ #                 WHEN LOWER(unit) IN ('deg c', 'degc') THEN (value * 9/5) + 32
+ #                 ELSE value
+ #             END AS value,
+ #             CASE
+ #                 WHEN LOWER(unit) = 'ug/l' THEN 'mg/L'
+ #                 WHEN LOWER(unit) = 'mg/g' THEN 'mg/L'
+ #                 WHEN LOWER(unit) IN ('deg c', 'degc') THEN 'degF'
+ #                 ELSE unit
+ #             END AS unit
+ #         FROM v_constituents""")
+
+ # def normalize_timezone(con: duckdb.DuckDBPyConnection, source_table: str, target_offset_hours: int = -6):
+ #     con.execute(f"""
+ #         CREATE TEMP VIEW v_timezone AS
+ #         SELECT *,
+ #             CASE
+ #                 WHEN SAMPLE_DATE_TIMEZONE = 'CST' THEN
+ #                     (SAMPLE_DATE_TIME AT TIME ZONE INTERVAL '-6 hours') AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #                 WHEN SAMPLE_DATE_TIMEZONE = 'CDT' THEN
+ #                     (SAMPLE_DATE_TIME AT TIME ZONE INTERVAL '-5 hours') AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #                 ELSE
+ #                     SAMPLE_DATE_TIME AT TIME ZONE INTERVAL '{target_offset_hours} hours'
+ #             END AS datetime
+ #         FROM {source_table}""")
+
+ # def average_results(con: duckdb.DuckDBPyConnection, table_name: str):
+ #     '''
+ #     Average samples by hour, station, and constituent using DuckDB.
+ #     '''
+ #     con.execute(f"""
+ #         CREATE TABLE analytics.equis AS
+ #         SELECT
+ #             station_id,
+ #             DATE_TRUNC('hour', datetime) AS datetime,
+ #             constituent,
+ #             unit,
+ #             'equis' AS station_origin,
+ #             AVG(value) AS value
+ #         FROM v_timezone
+ #         GROUP BY station_id, DATE_TRUNC('hour', datetime), constituent, unit
+ #     """)
+
+ def fetch_station_locations():
+     '''Fetch station location data for stations with HSPF related constituents.'''
+     query = """SELECT DISTINCT
+         m.SYS_LOC_CODE,
+         stn.LONGITUDE,
+         stn.LATITUDE,
+         stn.LOC_MAJOR_BASIN,
+         stn.NON_PUBLIC_LOCATION_FLAG
+     FROM MPCA_DAL.MV_EQ_RESULT m
+     LEFT JOIN MPCA_DAL.EQ_FAC_STATION_NP stn
+         ON m.SYS_LOC_CODE = stn.SYS_LOC_CODE
+     WHERE m.LOC_TYPE = 'River/Stream'
+     AND m.CAS_RN IN ('479-61-8',
+                      'CHLA-CORR',
+                      'BOD',
+                      'NO2NO3',
+                      '14797-55-8',
+                      '14797-65-0',
+                      '14265-44-2',
+                      'N-KJEL',
+                      'PHOSPHATE-P',
+                      '7723-14-0',
+                      'SOLIDS-TSS',
+                      'TEMP-W',
+                      '7664-41-7')
+     """
+     with CONNECTION.cursor() as cursor:
+         cursor.execute(query)
+         df = to_dataframe(cursor)
+     return df
+
+     # dups = set(df.loc[df['SYS_LOC_CODE'].isin(df.loc[df['SYS_LOC_CODE'].duplicated()]['SYS_LOC_CODE']), 'SYS_LOC_CODE'].to_list())
+     # for dup in dups:
+     #     # percent difference between lat/long values
+     #     sub = df.loc[df['SYS_LOC_CODE'] == dup]
+     #     lat_diff = abs(sub['LATITUDE'].max() - sub['LATITUDE'].min()) / ((sub['LATITUDE'].max() + sub['LATITUDE'].min()) / 2) * 100
+     #     long_diff = abs(sub['LONGITUDE'].max() - sub['LONGITUDE'].min()) / ((sub['LONGITUDE'].max() + sub['LONGITUDE'].min()) / 2) * 100
+     #     print(f'Duplicate station {dup} has {lat_diff:.6f}% latitude difference')
+     #     print(f'Duplicate station {dup} has {long_diff:.6f}% longitude difference')
+
+     # geometry = gpd.points_from_xy(df['LONGITUDE'], df['LATITUDE'])
+     # gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
+     # filename = 'EQ_STATION_' + str(date.today()) + '.gpkg'
+     # gdf.to_file(save_path.joinpath(filename), driver='GPKG')
+     # gdf.rename(columns={'SYS_LOC_CODE':'station_id'}, inplace=True)
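+ # Illustrative: with an open connection, fetch_station_locations() returns one
+ # row per distinct river/stream station with coordinates and major-basin info.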
mpcaHydro/etlSWD.py CHANGED
@@ -98,11 +98,10 @@ def transform(df):
      'resultUnit':'unit'},inplace=True)
 
      df['constituent'] = df['variable'].map(CONSTITUENT_MAP)
-     df['source'] = 'swd'
+     df['station_origin'] = 'swd'
      df['quality_id'] = pd.NA
      station_name = df.iloc[0]['station_name']
-     df = df.loc[:,['datetime','value','variable','unit','station_id','station_name','constituent','source']]
-
+     df = df.loc[:,['datetime','value','variable','unit','station_id','station_name','constituent','station_origin']]
 
      df = df.astype({'value':float,
                      'unit':str,
@@ -130,7 +129,7 @@ def transform(df):
 
      df.index = df.index.round('h').round('h')
      df = df.reset_index()
-     df = df.groupby(['datetime','variable','unit','station_id','station_name','constituent','data_format','data_type','source']).mean()
+     df = df.groupby(['datetime','variable','unit','station_id','station_name','constituent','data_format','data_type','station_origin']).mean()
      df = df.reset_index()
      df = df.set_index('datetime')
      df['quality_id'] = pd.NA
@@ -139,7 +138,7 @@ def transform(df):
 
  def load(df,file_path):
      '''
-     date, time, value, variable, unit, station_id, station_name, constituent, source, data_format, data_type, quality_code,
+     date, time, value, variable, unit, station_id, station_name, constituent, station_origin, data_format, data_type, quality_code,
 
 