mpcaHydro 2.0.6__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -157,7 +157,7 @@ def download_chunk(ts_id,start_year = 1996,end_year = 2030, interval = 4, as_jso
  end = end_year
  df = pywisk.get_ts(ts_id,start_date = f'{start}-01-01',end_date = f'{end}-12-31',as_json = as_json)
  if not df.empty: frames.append(df)
- df['Timestamp'] = pd.to_datetime(df['Timestamp']).dt.tz_localize(None)
+ df.index = pd.to_datetime(df['Timestamp'])
  time.sleep(.1)
  return pd.concat(frames)

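Note: the file header for this first diff was dropped in extraction; the `pywisk` calls and the `'wiski'` origin tag below identify it as the WISKI module. The changed line in `download_chunk` no longer strips timezone information from the `Timestamp` column; instead it promotes the parsed timestamps to the frame's index and leaves them timezone-aware. A minimal sketch of the difference, using invented sample values:

```python
import pandas as pd

# Synthetic stand-in for one chunk returned by pywisk.get_ts
df = pd.DataFrame({
    'Timestamp': ['2020-01-01T00:00:00+00:00', '2020-01-01T01:00:00+00:00'],
    'Value': [1.2, 1.4],
})

# 2.0.6 behavior: parse, then drop the UTC offset; Timestamp stays a column
old = df.copy()
old['Timestamp'] = pd.to_datetime(old['Timestamp']).dt.tz_localize(None)

# 2.1.0 behavior: parse into a DatetimeIndex; the offset is preserved
new = df.copy()
new.index = pd.to_datetime(new['Timestamp'])

print(old['Timestamp'].dt.tz)  # None
print(new.index.tz)            # UTC
```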
@@ -197,8 +197,11 @@ def tkn(station_nos,start_year = 1996,end_year = 2030):
  return _download('TKN',station_nos,start_year,end_year)


-
-
+ def filter_quality_codes(df):
+ '''
+ Filter dataframe by valid quality codes
+ '''
+ return df.loc[df['Quality Code'].isin(DATA_CODES)]

  def convert_units(df):
  '''
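The quality filter moves up in the file and now runs against the raw `'Quality Code'` column, before `normalize_columns` renames anything, and it reads the module-level `DATA_CODES` instead of taking a `data_codes` argument (compare the removed variant in the next hunk). A minimal sketch of the same `isin` filter; the `DATA_CODES` values here are invented for illustration:

```python
import pandas as pd

DATA_CODES = [0, 2]  # hypothetical "usable data" quality codes

def filter_quality_codes(df):
    '''Keep only rows whose quality code marks usable data.'''
    return df.loc[df['Quality Code'].isin(DATA_CODES)]

df = pd.DataFrame({'Value': [1.0, 2.0, 3.0],
                   'Quality Code': [0, 255, 2]})
print(filter_quality_codes(df))  # rows with codes 0 and 2 survive
```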
@@ -230,22 +233,12 @@ def normalize_columns(df):
  'station_no':'station_id',
  'Timestamp':'datetime',
  'Value':'value',
- 'ts_unitsymbol':'unit',
- 'Quality Code':'quality_code',
- 'Quality Code Name':'quality_code_name'}, inplace=True)
+ 'ts_unitsymbol':'unit'}, inplace=True)
  return df

-
-
- def filter_quality_codes(df, data_codes):
- '''
- Filter dataframe by valid quality codes
- '''
- return df.loc[df['quality_code'].isin(data_codes)]
-
  def average_results(df):
- #df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
- df.loc[:,'datetime'] = df.loc[:,'datetime'].dt.round('h')
+ df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
+ df['datetime'] = df['datetime'].dt.round('h')
  return df.groupby(['station_id', 'datetime', 'constituent', 'unit']).agg(value=('value', 'mean')).reset_index()
  # Convert units

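The rewritten `average_results` first re-parses `'datetime'` (useful when the column arrives as strings), rounds each stamp to the nearest hour, and averages duplicate readings per station, constituent, and unit. A small sketch with invented values:

```python
import pandas as pd

df = pd.DataFrame({
    'station_id':  ['S1', 'S1'],
    'datetime':    ['2020-06-01 10:10', '2020-06-01 10:20'],
    'constituent': ['TP', 'TP'],
    'unit':        ['mg/l', 'mg/l'],
    'value':       [0.10, 0.20],
})

df['datetime'] = pd.to_datetime(df['datetime'])
df['datetime'] = df['datetime'].dt.round('h')  # both stamps land on 10:00

out = (df.groupby(['station_id', 'datetime', 'constituent', 'unit'])
         .agg(value=('value', 'mean'))
         .reset_index())
print(out)  # one row at 10:00 with value 0.15
```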
@@ -274,35 +267,14 @@ def calculate_baseflow(df, method = 'Boughton'):
  return pd.concat(dfs)


- def normalize(df):
+ def transform(df, baseflow_method = 'Boughton'):
  '''
- Standardize raw WISKI data into standardized format without transformations.
- The standardized format includes normalized column names and units.
- ---
- Parameters:
- df (pandas.DataFrame): Raw WISKI data
- Returns:
- pandas.DataFrame: Normalized WISKI data
+ Transform raw WISKI data into standardized format
  '''
-
+ df = filter_quality_codes(df)
  df = convert_units(df)
  df = normalize_columns(df)
- return df
-
- def transform(df, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
- '''
- Transform normalized WISKI data into standardized format
- '''
- df = normalize(df)
- if filter_qc_codes:
- if data_codes is None:
- data_codes = DATA_CODES
- df = filter_quality_codes(df, data_codes)
  df = average_results(df)
  df = calculate_baseflow(df, method = baseflow_method)
  df['station_origin'] = 'wiski'
- #df.set_index('datetime',inplace=True)
  return df
-
-
-
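Net effect of this hunk: the separate `normalize()` step is gone, quality filtering is now unconditional (callers can no longer pass `filter_qc_codes=False` or a custom `data_codes` list), and the whole pipeline lives in `transform()`. The new order, restated with annotations:

```python
def transform(df, baseflow_method='Boughton'):
    '''Transform raw WISKI data into standardized format (2.1.0 order).'''
    df = filter_quality_codes(df)       # always on; reads module-level DATA_CODES
    df = convert_units(df)
    df = normalize_columns(df)          # Timestamp -> datetime, Value -> value, ...
    df = average_results(df)            # hourly rounding + mean of duplicates
    df = calculate_baseflow(df, method=baseflow_method)
    df['station_origin'] = 'wiski'
    return df
```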
mpcaHydro/data_manager.py CHANGED
@@ -10,14 +10,15 @@ import pandas as pd
  from pathlib import Path
  from mpcaHydro import etlSWD
  from mpcaHydro import equis, wiski, warehouse
- from mpcaHydro import xref
- from mpcaHydro import outlets
- from mpcaHydro.reports import reportManager
  import duckdb

+
+ WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent/'data/WISKI_EQUIS_XREF.csv')
+ #WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
+
  AGG_DEFAULTS = {'cfs':'mean',
  'mg/l':'mean',
- 'degf': 'mean',
+ 'degF': 'mean',
  'lb':'sum'}

  UNIT_DEFAULTS = {'Q': 'cfs',
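The `'degf'` to `'degF'` respelling matters because these dicts form a lookup chain: a constituent picks its default unit, and that unit picks its aggregation function, so the keys must agree character for character. In 2.0.6 both dicts used `'degf'` while `validate_unit` accepted only `'degF'`; 2.1.0 standardizes the dicts on `'degF'`. The chain in miniature:

```python
UNIT_DEFAULTS = {'Q': 'cfs', 'WT': 'degF'}  # abridged from this diff
AGG_DEFAULTS = {'cfs': 'mean', 'mg/l': 'mean', 'degF': 'mean', 'lb': 'sum'}

unit = UNIT_DEFAULTS['WT']     # 'degF'
agg_func = AGG_DEFAULTS[unit]  # 'mean'; both lookups now agree on the spelling
print(unit, agg_func)
```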
@@ -27,15 +28,29 @@ UNIT_DEFAULTS = {'Q': 'cfs',
  'OP' : 'mg/l',
  'TKN': 'mg/l',
  'N' : 'mg/l',
- 'WT' : 'degf',
+ 'WT' : 'degF',
  'WL' : 'ft'}

+ def are_lists_identical(nested_list):
+ # Sort each sublist
+ sorted_sublists = [sorted(sublist) for sublist in nested_list]
+ # Compare all sublists to the first one
+ return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)

- def validate_constituent(constituent):
- assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
-
- def validate_unit(unit):
- assert(unit in ['mg/l','lb','cfs','degF'])
+ def construct_database(folderpath):
+ folderpath = Path(folderpath)
+ db_path = folderpath.joinpath('observations.duckdb').as_posix()
+ with duckdb.connect(db_path) as con:
+ con.execute("DROP TABLE IF EXISTS observations")
+ datafiles = folderpath.joinpath('*.csv').as_posix()
+ query = '''
+ CREATE TABLE observations AS SELECT *
+ FROM
+ read_csv_auto(?,
+ union_by_name = true);
+
+ '''
+ con.execute(query,[datafiles])


  def build_warehouse(folderpath):
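`construct_database` rebuilds a single `observations` table from every CSV in the folder; `union_by_name = true` lets files with different column sets align by column name, filling gaps with NULLs. A hedged usage sketch (the folder path is invented, and this assumes a DuckDB version that accepts a bound glob parameter inside `read_csv_auto`, as the code above relies on):

```python
import duckdb
from pathlib import Path

folder = Path('obs_data')   # hypothetical folder of per-station CSVs
construct_database(folder)  # drops and recreates the observations table

db_path = folder.joinpath('observations.duckdb').as_posix()
with duckdb.connect(db_path) as con:
    n = con.execute('SELECT COUNT(*) FROM observations').fetchone()[0]
    print(f'{n} rows loaded')
```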
@@ -71,14 +86,8 @@ class dataManager():
  self.data = {}
  self.folderpath = Path(folderpath)
  self.db_path = self.folderpath.joinpath('observations.duckdb')
-
  self.oracle_user = oracle_user
  self.oracle_password = oracle_password
- warehouse.init_db(self.db_path,reset = False)
- self.xref = xref
- self.outlets = outlets
- self.reports = reportManager(self.db_path)
-

  def connect_to_oracle(self):
  assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
@@ -90,139 +99,295 @@ class dataManager():
  else:
  return False

+ def _reconstruct_database(self):
+ construct_database(self.folderpath)
+
  def _build_warehouse(self):
  build_warehouse(self.folderpath)
+
+ def constituent_summary(self,constituents = None):
+ with duckdb.connect(self.db_path) as con:
+ if constituents is None:
+ constituents = con.query('''
+ SELECT DISTINCT
+ constituent
+ FROM observations''').to_df()['constituent'].to_list()
 
- def download_station_data(self,station_id,station_origin,overwrite=True,to_csv = False,filter_qc_codes = True, start_year = 1996, end_year = 2030,baseflow_method = 'Boughton'):
- '''
- Method to download data for a specific station and load it into the warehouse.
-
- :param self: Description
- :param station_id: Station identifier
- :param station_origin: source of station data: wiski, equis, or swd
- :param overwrite: Whether to overwrite existing data
- :param to_csv: Whether to export data to CSV
- :param filter_qc_codes: Whether to filter quality control codes
- :param start_year: Start year for data download
- :param end_year: End year for data download
- :param baseflow_method: Method for baseflow calculation
- '''
- with duckdb.connect(self.db_path,read_only=False) as con:
- if overwrite:
- warehouse.drop_station_id(con,station_id,station_origin)
- warehouse.update_views(con)
-
- if station_origin == 'wiski':
- df = wiski.download([station_id],start_year = start_year, end_year = end_year)
- warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = overwrite)
- warehouse.load_df_to_analytics(con,wiski.transform(df,filter_qc_codes = filter_qc_codes,baseflow_method = baseflow_method),'wiski') # method includes normalization
-
- elif station_origin == 'equis':
- assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
- df = equis.download([station_id])
- warehouse.load_df_to_staging(con,df, 'equis_raw',replace = overwrite)
- warehouse.load_df_to_analytics(con,equis.transform(df),'equis')
-
- elif station_origin == 'swd':
- df = etlSWD.download(station_id)
- warehouse.load_df_to_staging(con,df, 'swd_raw', replace = overwrite)
- warehouse.load_df_to_analytics(con,etlSWD.transform(df),'swd')
- else:
- raise ValueError('station_origin must be wiski, equis, or swd')
+ query = '''
+ SELECT
+ station_id,
+ station_origin,
+ constituent,
+ COUNT(*) AS sample_count,
+ year(MIN(datetime)) AS start_date,
+ year(MAX(datetime)) AS end_date
+ FROM
+ observations
+ WHERE
+ constituent in (SELECT UNNEST(?))
+ GROUP BY
+ constituent,station_id,station_origin
+ ORDER BY
+ constituent,sample_count;'''
+
+ df = con.execute(query,[constituents]).fetch_df()
+ return df
+
+ def get_wiski_stations(self):
+ return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
+
+ def get_equis_stations(self):
+ return list(WISKI_EQUIS_XREF['EQUIS_STATION_ID'].unique())
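`constituent_summary` defaults to every distinct constituent in the table, then counts samples and first/last years per station and constituent. The `constituent in (SELECT UNNEST(?))` clause binds a Python list as a DuckDB LIST and unnests it into a one-column subquery. A standalone sketch of that binding pattern against an in-memory database with invented rows:

```python
import duckdb

with duckdb.connect() as con:
    con.execute("CREATE TABLE observations AS "
                "SELECT * FROM (VALUES ('S1', 'TP'), ('S1', 'TSS'), ('S2', 'TP')) "
                "t(station_id, constituent)")
    rows = con.execute(
        "SELECT station_id, constituent, COUNT(*) AS sample_count "
        "FROM observations "
        "WHERE constituent IN (SELECT UNNEST(?)) "
        "GROUP BY station_id, constituent",
        [['TP']],
    ).fetchall()
    print(rows)  # e.g. [('S1', 'TP', 1), ('S2', 'TP', 1)]
```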
 
- with duckdb.connect(self.db_path,read_only=False) as con:
- warehouse.update_views(con)
+ def wiski_equis_alias(self,wiski_station_id):
+ equis_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'WISKI_EQUIS_ID'].to_list()))
+ equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+ if len(equis_ids) == 0:
+ return []
+ elif len(equis_ids) > 1:
+ print(f'Too Many Equis Stations for {wiski_station_id}')
+ raise
+ else:
+ return equis_ids[0]

- if to_csv:
- self.to_csv(station_id)
+ def wiski_equis_associations(self,wiski_station_id):
+ equis_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'EQUIS_STATION_ID'].unique())
+ equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+ if len(equis_ids) == 0:
+ return []
+ else:
+ return equis_ids
+
+ def equis_wiski_associations(self,equis_station_id):
+ wiski_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id,'WISKI_STATION_NO'].unique())
+ wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+ if len(wiski_ids) == 0:
+ return []
+ else:
+ return wiski_ids
+
+ def equis_wiski_alias(self,equis_station_id):
+ wiski_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,'WISKI_STATION_NO'].to_list()))
+ wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+ if len(wiski_ids) == 0:
+ return []
+ elif len(wiski_ids) > 1:
+ print(f'Too Many WISKI Stations for {equis_station_id}')
+ raise
+ else:
+ return wiski_ids[0]
+
+ def _equis_wiski_associations(self,equis_station_ids):
+ wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
+ if are_lists_identical(wiski_stations):
+ return wiski_stations[0]
+ else:
+ return []
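These helpers are thin lookups over the bundled WISKI_EQUIS_XREF table: the `*_associations` methods return every cross-referenced id after dropping NaNs, while the `*_alias` methods insist on exactly one match and raise otherwise (note the bare `raise` with no active exception, which itself fails with a RuntimeError if ever reached). The association lookup in miniature, against a toy cross-reference frame with invented ids:

```python
import pandas as pd

WISKI_EQUIS_XREF = pd.DataFrame({
    'WISKI_STATION_NO': ['W1', 'W1', 'W2'],
    'EQUIS_STATION_ID': ['E1', 'E2', None],
})

def wiski_equis_associations(wiski_station_id):
    ids = list(WISKI_EQUIS_XREF.loc[
        WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,
        'EQUIS_STATION_ID'].unique())
    return [i for i in ids if not pd.isna(i)]

print(wiski_equis_associations('W1'))  # ['E1', 'E2']
print(wiski_equis_associations('W2'))  # [] -- only a NaN cross-reference
```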
 
- return df
-
- def get_outlets(self):
- with duckdb.connect(self.db_path,read_only=True) as con:
- query = '''
- SELECT *
- FROM outlets.station_reach_pairs
- ORDER BY outlet_id'''
- df = con.execute(query).fetch_df()
- return df
- def get_station_ids(self,station_origin = None):
- with duckdb.connect(self.db_path,read_only=True) as con:
- if station_origin is None:
- query = '''
- SELECT DISTINCT station_id, station_origin
- FROM analytics.observations'''
- df = con.execute(query).fetch_df()
- else:
- query = '''
- SELECT DISTINCT station_id
- FROM analytics.observations
- WHERE station_origin = ?'''
- df = con.execute(query,[station_origin]).fetch_df()
-
- return df['station_id'].to_list()
+ def _stations_by_wid(self,wid_no,station_origin):
+ if station_origin in ['wiski','wplmn']:
+ station_col = 'WISKI_STATION_NO'
+ elif station_origin in ['equis','swd']:
+ station_col = 'EQUIS_STATION_ID'
+ else:
+ raise
+
+ return list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no,station_col].unique())
+
 
+ def download_stations_by_wid(self, wid_no,station_origin, folderpath = None, overwrite = False):

- def get_station_data(self,station_ids,constituent,agg_period = None):
+ station_ids = self._station_by_wid(wid_no,station_origin)
+
+ if not station_ids.empty:
+ for _, row in station_ids.iterrows():
+ self.download_station_data(row['station_id'],station_origin, folderpath, overwrite)
+
+ def _download_station_data(self,station_id,station_origin,overwrite=False):
+ assert(station_origin in ['wiski','equis','swd','wplmn'])
+ if station_origin == 'wiski':
+ self.download_station_data(station_id,'wiski',overwrite = overwrite)
+ elif station_origin == 'wplmn':
+ self.download_station_data(station_id,'wplmn',overwrite = overwrite)
+ elif station_origin == 'swd':
+ self.download_station_data(station_id,'swd',overwrite = overwrite)
+ else:
+ self.download_station_data(station_id,'equis',overwrite = overwrite)
+
+
+
+
+ def download_station_data(self,station_id,station_origin,start_year = 1996, end_year = 2030,folderpath=None,overwrite = False,baseflow_method = 'Boughton'):
+ assert(station_origin in ['wiski','equis','swd','wplmn'])
+ station_id = str(station_id)
+ save_name = station_id
+ if station_origin == 'wplmn':
+ save_name = station_id + '_wplmn'
+
+ if folderpath is None:
+ folderpath = self.folderpath
+ else:
+ folderpath = Path(folderpath)
+
+
+ if (folderpath.joinpath(save_name + '.csv').exists()) & (not overwrite):
+ print (f'{station_id} data already downloaded')
+ return
+
+ if station_origin == 'wiski':
+ data = wiski.transform(wiski.download([station_id],wplmn=False, baseflow_method = baseflow_method))
+ elif station_origin == 'swd':
+ data = etlSWD.download(station_id)
+ elif station_origin == 'equis':
+ assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
+ data = equis.transform(equis.download([station_id]))
+ else:
+ data = wiski.transform(wiski.download([station_id],wplmn=True, baseflow_method = baseflow_method))


- with duckdb.connect(self.db_path,read_only=True) as con:
+
+
+ if len(data) > 0:
+ data.to_csv(folderpath.joinpath(save_name + '.csv'))
+ self.data[station_id] = data
+ else:
+ print(f'No {station_origin} calibration cata available at Station {station_id}')
+
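`download_station_data` is now CSV-centric: one file per station (WPLMN data gets a `_wplmn` suffix), and an existing file short-circuits the download unless `overwrite=True`. A few small flags for a future release: the guard uses bitwise `&` on two plain bools where `and` would be idiomatic, the `assert (cond, msg)` tuple form in the equis branch is always truthy and so never fires, and the else-branch message reads "calibration cata". A sketch of just the skip guard (paths invented):

```python
from pathlib import Path

def already_downloaded(folderpath: Path, save_name: str, overwrite: bool) -> bool:
    '''Mirror of the 2.1.0 skip logic: an existing CSV wins unless overwrite is set.'''
    return folderpath.joinpath(save_name + '.csv').exists() and not overwrite

print(already_downloaded(Path('obs_data'), 'S1', overwrite=False))  # hypothetical folder
```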
+ def _load(self,station_id):
+ with duckdb.connect(self.db_path) as con:
  query = '''
  SELECT *
  FROM analytics.observations
- WHERE station_id IN ? AND constituent = ?'''
- df = con.execute(query,[station_ids,constituent]).fetch_df()
+ WHERE station_id = ?'''
+ df = con.execute(query,[station_id]).fetch_df()
+ df.set_index('datetime',inplace=True)
+ self.data[station_id] = df
+ return df
+
+ def _load2(self,station_id):
+ df = pd.read_csv(self.folderpath.joinpath(station_id + '.csv'),
+ index_col='datetime',
+ parse_dates=['datetime'],
+ #usecols=['Ts Date','Station number','variable', 'value','reach_id'],
+ dtype={'station_id': str, 'value': float, 'variable': str,'constituent':str,'unit':str})
+ self.data[station_id] = df
+ return df
+
+ def load(self,station_id):
+ try:
+ df = self.data[station_id]
+ except:
+ df = self._load(station_id)
+ return df
+
+ def info(self,constituent):
+ return pd.concat([self._load(file.stem) for file in self.folderpath.iterdir() if file.suffix == '.csv'])[['station_id','constituent','value']].groupby(by = ['station_id','constituent']).count()
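`load()` is a cache-through accessor: it tries the in-memory `self.data` dict first and falls back to `_load()`, which queries DuckDB and indexes the frame by datetime (`_load2` is a CSV-based alternative). The shipped version uses a bare `except:`; catching `KeyError` is the safer spelling, as in this sketch:

```python
class StationCache:
    '''Minimal sketch of the 2.1.0 load/_load caching pattern.'''

    def __init__(self):
        self.data = {}

    def _load(self, station_id):
        df = f'frame-for-{station_id}'  # stand-in for the DuckDB query above
        self.data[station_id] = df
        return df

    def load(self, station_id):
        try:
            return self.data[station_id]   # cache hit
        except KeyError:                   # narrower than the bare except above
            return self._load(station_id)

cache = StationCache()
assert cache.load('S1') is cache.load('S1')  # second call is served from the cache
```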
 
- unit = UNIT_DEFAULTS[constituent]
- agg_func = AGG_DEFAULTS[unit]
+ def get_wplmn_data(self,station_id,constituent,unit = 'mg/l', agg_period = 'YE', samples_only = True):
+
+ assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+ station_id = station_id + '_wplmn'
+ dfsub = self._load(station_id)
+
+ if samples_only:
+ dfsub = dfsub.loc[dfsub['quality_id'] == 3]
+ agg_func = 'mean'
+
+ dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+ (dfsub['unit'] == unit),
+ ['value','station_origin']]

- df.set_index('datetime',inplace=True)
+
+ df = dfsub[['value']].resample(agg_period).agg(agg_func)
+
+ if df.empty:
+ dfsub = df
+ else:
+
+ df['station_origin'] = dfsub['station_origin'].iloc[0]
+
+ #if (constituent == 'TSS') & (unit == 'lb'): #convert TSS from lbs to us tons
+ # dfsub['value'] = dfsub['value']/2000
+
+ #dfsub = dfsub.resample('H').mean().dropna()
+
  df.attrs['unit'] = unit
  df.attrs['constituent'] = constituent
- if agg_period is not None:
- df = df[['value']].resample(agg_period).agg(agg_func)
- df.attrs['agg_period'] = agg_period
-
- df.rename(columns={'value': 'observed'}, inplace=True)
- return df
+ return df['value'].to_frame().dropna()
 
- def get_outlet_data(self,outlet_id,constituent,agg_period = 'D'):
- with duckdb.connect(self.db_path,read_only=True) as con:
- query = '''
- SELECT *
- FROM analytics.outlet_observations_with_flow
- WHERE outlet_id = ? AND constituent = ?'''
- df = con.execute(query,[outlet_id,constituent]).fetch_df()
+ def get_data(self,station_id,constituent,agg_period = 'D'):
+ return self._get_data([station_id],constituent,agg_period)
+
+ def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
+ '''
+
+ Returns the processed observational data associated with the calibration specific id.
+
+
+ Parameters
+ ----------
+ station_id : STR
+ Station ID as a string
+ constituent : TYPE
+ Constituent abbreviation used for calibration. Valid options:
+ 'Q',
+ 'TSS',
+ 'TP',
+ 'OP',
+ 'TKN',
+ 'N',
+ 'WT',
+ 'DO',
+ 'WL']
+ unit : TYPE, optional
+ Units of data. The default is 'mg/l'.
+ sample_flag : TYPE, optional
+ For WPLMN data this flag determines modeled loads are returned. The default is False.
 
+ Returns
+ -------
+ dfsub : Pands.Series
+ Pandas series of data. Note that no metadata is returned.
+
+ '''
+
+ assert constituent in ['Q','QB','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
  unit = UNIT_DEFAULTS[constituent]
  agg_func = AGG_DEFAULTS[unit]
-
- df.set_index('datetime',inplace=True)
+
+ dfsub = pd.concat([self.load(station_id) for station_id in station_ids]) # Check cache
+ dfsub.index = dfsub.index.tz_localize(None) # Drop timezone info
+ #dfsub.set_index('datetime',drop=True,inplace=True)
+ dfsub.rename(columns={'source':'station_origin'},inplace=True)
+ dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+ (dfsub['unit'] == unit),
+ ['value','station_origin']]
+
+ df = dfsub[['value']].resample(agg_period).agg(agg_func)
  df.attrs['unit'] = unit
  df.attrs['constituent'] = constituent
- if agg_period is not None:
- df = df[['value','flow_value','baseflow_value']].resample(agg_period).agg(agg_func)
- df.attrs['agg_period'] = agg_period
+
+ if df.empty:
+
+ return df
+ else:
+
+ df['station_origin'] = dfsub['station_origin'].iloc[0]
 
- df.rename(columns={'value': 'observed',
- 'flow_value': 'observed_flow',
- 'baseflow_value': 'observed_baseflow'}, inplace=True)
- return df

+ # convert to desired timzone before stripping timezone information.
+ #df.index.tz_convert('UTC-06:00').tz_localize(None)
+
+ return df['value'].to_frame().dropna()


- def to_csv(self,station_id,folderpath = None):
- if folderpath is None:
- folderpath = self.folderpath
- else:
- folderpath = Path(folderpath)
- df = self._load(station_id)
- if len(df) > 0:
- df.to_csv(folderpath.joinpath(station_id + '.csv'))
- else:
- print(f'No {station_id} calibration data available at Station {station_id}')
-
- df.to_csv(folderpath.joinpath(station_id + '.csv'))
+ def validate_constituent(constituent):
+ assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
+ def validate_unit(unit):
+ assert(unit in ['mg/l','lb','cfs','degF'])
+

  # class database():
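The core of `_get_data`: concatenate the cached per-station frames, strip timezone information from the index, filter to one constituent and its default unit, then resample with that unit's default aggregation. (Note the docstring still describes `unit` and `sample_flag` parameters the new signature does not have, and `tz_offset` is accepted but unused.) The filter-and-resample step in isolation, with synthetic data:

```python
import pandas as pd

idx = pd.date_range('2020-01-01', periods=48, freq='h', tz='UTC')
dfsub = pd.DataFrame({'constituent': 'Q', 'unit': 'cfs',
                      'value': range(48)}, index=idx)

dfsub.index = dfsub.index.tz_localize(None)  # drop timezone info
dfsub = dfsub.loc[(dfsub['constituent'] == 'Q') &
                  (dfsub['unit'] == 'cfs'), ['value']]
daily = dfsub[['value']].resample('D').agg('mean')  # AGG_DEFAULTS['cfs'] == 'mean'
print(daily)  # two daily means: 11.5 and 35.5
```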
mpcaHydro/equis.py CHANGED
@@ -164,25 +164,26 @@ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset:
  aware_src = naive.replace(tzinfo=src_tz)

  # convert the instant to fixed UTC-6
- return aware_src.astimezone(target_offset).tz_localize(None)
+ return aware_src.astimezone(target_offset)


  def normalize_columns(df):
  '''Select relevant columns from Equis data.'''
  return df[['SYS_LOC_CODE',
- 'constituent',
- 'CAS_RN',
  'datetime',
  'RESULT_NUMERIC',
  'RESULT_UNIT',
+ 'constituent'
  ]].rename(columns={
  'SYS_LOC_CODE':'station_id',
  'RESULT_NUMERIC':'value',
- 'RESULT_UNIT':'unit',
- 'CAS_RN':'cas_rn'
+ 'RESULT_UNIT':'unit'
  })

-
+ def replace_nondetects(df):
+ '''Replace non-detect results with 0 in Equis data.'''
+ df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
+ return df

  def normalize_timezone(df):
  '''Normalize datetime to UTC in Equis data.'''
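Dropping `.tz_localize(None)` means `as_utc_offset` now returns an offset-aware value instead of a naive one. `tz_localize` is a pandas `Timestamp` method, not a `datetime` one, so the old chain only worked when `aware_src` happened to be a pandas object; 2.1.0 keeps the plain `astimezone` conversion. The surviving step with stdlib types:

```python
from datetime import datetime, timedelta, timezone
from zoneinfo import ZoneInfo

target_offset = timezone(timedelta(hours=-6))  # fixed UTC-6
naive = datetime(2020, 6, 1, 12, 0)
aware_src = naive.replace(tzinfo=ZoneInfo('America/Chicago'))

print(aware_src.astimezone(target_offset))  # 2020-06-01 11:00:00-06:00 (CDT is UTC-5)
```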
@@ -193,27 +194,27 @@ def normalize_timezone(df):
  except Exception:
  return pd.NaT

- df.loc[:,'datetime'] = df.apply(_conv, axis=1)
+ df['datetime'] = df.apply(_conv, axis=1)
  return df

  def convert_units(df):
  '''Convert units in Equis data to standard units.'''
  # Convert ug/L to mg/L
- df['unit'] = df['unit'].str.lower()
+ df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()

- mask_ugL = df['unit'] == 'ug/l'
- df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
- df.loc[mask_ugL, 'unit'] = 'mg/l'
+ mask_ugL = df['RESULT_UNIT'] == 'ug/l'
+ df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000
+ df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'

  # Convert mg/g to mg/L (assuming density of 1 g/mL)
- mask_mgg = df['unit'] == 'mg/g'
- df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
- df.loc[mask_mgg, 'unit'] = 'mg/l'
+ mask_mgg = df['RESULT_UNIT'] == 'mg/g'
+ df.loc[mask_mgg, 'RESULT_NUMERIC'] = df.loc[mask_mgg, 'RESULT_NUMERIC'] * 1000
+ df.loc[mask_mgg, 'RESULT_UNIT'] = 'mg/l'

  # Convert deg C to degF
- mask_degC = df['unit'].isin(['deg c', 'degc'])
- df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9/5) + 32
- df.loc[mask_degC, 'unit'] = 'degf'
+ mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
+ df.loc[mask_degC, 'RESULT_NUMERIC'] = (df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5) + 32
+ df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'

  return df

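`convert_units` switches from the renamed `unit`/`value` columns back to the raw `RESULT_UNIT`/`RESULT_NUMERIC` names because it now runs before `normalize_columns` in the reordered `transform()` (next hunk). The mask pattern itself is unchanged: lower-case the unit, then rewrite value and unit together per mask. A condensed demo with invented rows:

```python
import pandas as pd

df = pd.DataFrame({'RESULT_NUMERIC': [250.0, 20.0],
                   'RESULT_UNIT': ['ug/L', 'deg C']})

df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()

mask_ugL = df['RESULT_UNIT'] == 'ug/l'
df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000
df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'        # 250 ug/l -> 0.25 mg/l

mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
df.loc[mask_degC, 'RESULT_NUMERIC'] = df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5 + 32
df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'       # 20 degC -> 68 degF

print(df)
```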
@@ -231,25 +232,15 @@ def average_results(df):
  value=('value', 'mean')
  ).reset_index()

- def replace_nondetects(df):
- '''Replace non-detect results with 0 in Equis data.'''
- df.loc[df['value'].isna(), 'value'] = 0
- return df
-
- def normalize(df):
- '''Normalize Equis data: select relevant columns.'''
- df = map_constituents(df)
- df = normalize_timezone(df)
- df = normalize_columns(df)
- df = convert_units(df)
- return df
-
  def transform(df):
  '''Transform Equis data: handle non-detects, convert units, map constituents.'''

- df = normalize(df)
  df = replace_nondetects(df)
  if not df.empty:
+ df = normalize_timezone(df)
+ df = convert_units(df)
+ df = map_constituents(df)
+ df = normalize_columns(df)
  df = average_results(df)
  return df
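As on the WISKI side, the `normalize()` wrapper is gone. `replace_nondetects` now runs first, zero-filling NaN `RESULT_NUMERIC` values before the empty-check, and column renaming moves to the end, which is why `convert_units` above had to switch to the `RESULT_*` names. The resulting order, annotated:

```python
def transform(df):
    '''Transform Equis data: handle non-detects, convert units, map constituents.'''
    df = replace_nondetects(df)      # zero-fill non-detects on RESULT_NUMERIC first
    if not df.empty:
        df = normalize_timezone(df)  # per-row conversion via as_utc_offset
        df = convert_units(df)       # still on the RESULT_* columns
        df = map_constituents(df)
        df = normalize_columns(df)   # rename to station_id / value / unit at the end
        df = average_results(df)     # operates on the renamed columns
    return df
```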