mpcaHydro 2.0.5.tar.gz → 2.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/PKG-INFO +1 -2
  2. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/pyproject.toml +2 -3
  3. mpcahydro-2.0.5/src/mpcaHydro/wiski.py → mpcahydro-2.1.0/src/mpcaHydro/WISKI.py +12 -40
  4. mpcahydro-2.1.0/src/mpcaHydro/data_manager.py +412 -0
  5. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/equis.py +22 -31
  6. mpcahydro-2.1.0/src/mpcaHydro/warehouse.py +203 -0
  7. mpcahydro-2.1.0/tests/pixi.toml +25 -0
  8. mpcahydro-2.0.5/ERROR.FIL +0 -6
  9. mpcahydro-2.0.5/demo.py +0 -167
  10. mpcahydro-2.0.5/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv +0 -71
  11. mpcahydro-2.0.5/src/mpcaHydro/data/outlets.duckdb +0 -0
  12. mpcahydro-2.0.5/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
  13. mpcahydro-2.0.5/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
  14. mpcahydro-2.0.5/src/mpcaHydro/data_manager.py +0 -247
  15. mpcahydro-2.0.5/src/mpcaHydro/outlets.py +0 -371
  16. mpcahydro-2.0.5/src/mpcaHydro/reports.py +0 -80
  17. mpcahydro-2.0.5/src/mpcaHydro/warehouse.py +0 -581
  18. mpcahydro-2.0.5/src/mpcaHydro/warehouseManager.py +0 -47
  19. mpcahydro-2.0.5/src/mpcaHydro/xref.py +0 -74
  20. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/.gitattributes +0 -0
  21. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/.gitignore +0 -0
  22. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/README.md +0 -0
  23. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/__init__.py +0 -0
  24. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/data/EQUIS_PARAMETER_XREF.csv +0 -0
  25. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/data/WISKI_EQUIS_XREF.csv +0 -0
  26. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/etlCSG.py +0 -0
  27. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/etlSWD.py +0 -0
  28. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/etlWISKI.py +0 -0
  29. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/etlWPLMN.py +0 -0
  30. {mpcahydro-2.0.5 → mpcahydro-2.1.0}/src/mpcaHydro/pywisk.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mpcaHydro
-Version: 2.0.5
+Version: 2.1.0
 Summary: Python package for downloading MPCA hydrology data
 Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
 Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -11,7 +11,6 @@ Classifier: Development Status :: 3 - Alpha
 Classifier: Programming Language :: Python
 Requires-Python: >=3.8
 Requires-Dist: duckdb
-Requires-Dist: oracledb
 Requires-Dist: pandas
 Requires-Dist: pathlib
 Requires-Dist: requests
@@ -5,13 +5,12 @@ build-backend = "hatchling.build"
 [project]
 name = "mpcaHydro"
 urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" } # ? Add this!
-version = "2.0.5"
+version = "2.1.0"
 dependencies = [
     "pandas",
     "requests",
     "pathlib",
-    "duckdb",
-    "oracledb"
+    "duckdb"
 ]
 requires-python = ">=3.8"
 authors = [
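Dropping `oracledb` from both the dependency list and the Requires-Dist metadata makes Oracle-backed EQuIS access opt-in. A minimal sketch of how calling code could guard the now-optional import; `connect_oracle` and its `dsn` argument are illustrative, not part of mpcaHydro:

```python
def connect_oracle(user: str, password: str, dsn: str):
    # Hypothetical helper: mpcaHydro 2.1.0 no longer declares oracledb,
    # so import it lazily and fail with an actionable message if missing.
    try:
        import oracledb
    except ImportError as exc:
        raise RuntimeError(
            "oracledb is not installed; run 'pip install oracledb' to query "
            "EQuIS directly, or use swd as the station_origin instead"
        ) from exc
    return oracledb.connect(user=user, password=password, dsn=dsn)
```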
@@ -157,7 +157,7 @@ def download_chunk(ts_id,start_year = 1996,end_year = 2030, interval = 4, as_jso
         end = end_year
         df = pywisk.get_ts(ts_id,start_date = f'{start}-01-01',end_date = f'{end}-12-31',as_json = as_json)
         if not df.empty: frames.append(df)
-        df['Timestamp'] = pd.to_datetime(df['Timestamp']).dt.tz_localize(None)
+        df.index = pd.to_datetime(df['Timestamp'])
         time.sleep(.1)
     return pd.concat(frames)
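For context on the changed line: 2.1.0 keeps chunk timestamps timezone-aware and promotes them to the index, instead of stripping the offset in place; the new data_manager.py later drops the offset with `tz_localize(None)`. A toy illustration with an assumed chunk shape:

```python
import pandas as pd

# Toy frame mimicking one pywisk.get_ts() chunk (assumed shape).
df = pd.DataFrame({'Timestamp': ['2020-01-01T00:00:00+00:00',
                                 '2020-01-01T01:00:00+00:00'],
                   'Value': [1.2, 3.4]})

df.index = pd.to_datetime(df['Timestamp'])  # tz-aware DatetimeIndex
print(df.index.tz)  # UTC
```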
@@ -197,8 +197,11 @@ def tkn(station_nos,start_year = 1996,end_year = 2030):
     return _download('TKN',station_nos,start_year,end_year)
 
 
-
-
+def filter_quality_codes(df):
+    '''
+    Filter dataframe by valid quality codes
+    '''
+    return df.loc[df['Quality Code'].isin(DATA_CODES)]
 
 def convert_units(df):
     '''
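The relocated `filter_quality_codes` screens the raw WISKI frame by its original 'Quality Code' column against the module-level `DATA_CODES` whitelist, before any renaming happens. A self-contained illustration; the two-code list below is a stand-in for the package's actual whitelist:

```python
import pandas as pd

DATA_CODES = [0, 80]  # stand-in; WISKI.py defines the real list

def filter_quality_codes(df):
    '''Filter dataframe by valid quality codes'''
    return df.loc[df['Quality Code'].isin(DATA_CODES)]

raw = pd.DataFrame({'Value': [1.0, 2.0, 3.0],
                    'Quality Code': [0, 160, 80]})
print(filter_quality_codes(raw))  # keeps the rows coded 0 and 80
```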
@@ -230,22 +233,12 @@ def normalize_columns(df):
                        'station_no':'station_id',
                        'Timestamp':'datetime',
                        'Value':'value',
-                       'ts_unitsymbol':'unit',
-                       'Quality Code':'quality_code',
-                       'Quality Code Name':'quality_code_name'}, inplace=True)
+                       'ts_unitsymbol':'unit'}, inplace=True)
     return df
 
-
-
-def filter_quality_codes(df, data_codes):
-    '''
-    Filter dataframe by valid quality codes
-    '''
-    return df.loc[df['quality_code'].isin(data_codes)]
-
 def average_results(df):
-    #df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
-    df.loc[:,'datetime'] = df.loc[:,'datetime'].dt.round('h')
+    df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
+    df['datetime'] = df['datetime'].dt.round('h')
     return df.groupby(['station_id', 'datetime', 'constituent', 'unit']).agg(value=('value', 'mean')).reset_index()
     # Convert units
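`average_results` now coerces the column with `pd.to_datetime` before rounding to the hour, so frames whose timestamps arrive as strings no longer break the `.dt` accessor. A toy run of the same steps:

```python
import pandas as pd

df = pd.DataFrame({'station_id': ['S1', 'S1'],
                   'datetime': ['2020-01-01 00:21', '2020-01-01 00:29'],
                   'constituent': ['TP', 'TP'],
                   'unit': ['mg/l', 'mg/l'],
                   'value': [0.10, 0.30]})
df['datetime'] = pd.to_datetime(df['datetime'])
df['datetime'] = df['datetime'].dt.round('h')
out = (df.groupby(['station_id', 'datetime', 'constituent', 'unit'])
         .agg(value=('value', 'mean'))
         .reset_index())
print(out['value'].iloc[0])  # both samples round to 00:00 and average to 0.2
```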
@@ -274,35 +267,14 @@ def calculate_baseflow(df, method = 'Boughton'):
     return pd.concat(dfs)
 
 
-def normalize(df):
+def transform(df, baseflow_method = 'Boughton'):
     '''
-    Standardize raw WISKI data into standardized format without transformations.
-    The standardized format includes normalized column names and units.
-    ---
-    Parameters:
-        df (pandas.DataFrame): Raw WISKI data
-    Returns:
-        pandas.DataFrame: Normalized WISKI data
+    Transform raw WISKI data into standardized format
     '''
-
+    df = filter_quality_codes(df)
     df = convert_units(df)
     df = normalize_columns(df)
-    return df
-
-def transform(df, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
-    '''
-    Transform normalized WISKI data into standardized format
-    '''
-    df = normalize(df)
-    if filter_qc_codes:
-        if data_codes is None:
-            data_codes = DATA_CODES
-        df = filter_quality_codes(df, data_codes)
     df = average_results(df)
     df = calculate_baseflow(df, method = baseflow_method)
     df['station_origin'] = 'wiski'
-    #df.set_index('datetime',inplace=True)
     return df
-
-
-
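The net effect is a flatter WISKI pipeline: the separate `normalize` step and the `filter_qc_codes`/`data_codes` knobs are gone, and quality filtering always runs first. Assuming `wiski.download` behaves as data_manager.py uses it (its signature is not shown in this diff), typical usage would look like:

```python
from mpcaHydro import wiski

# Sketch under assumptions: the station number is invented, and
# wiski.download() is called the way data_manager.py calls it.
raw = wiski.download(['39099001'], wplmn=False)
df = wiski.transform(raw, baseflow_method='Boughton')
# df: station_id/datetime/constituent/unit/value, hourly-averaged,
# with baseflow added and station_origin set to 'wiski'.
```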
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 3 10:01:14 2022
+
+@author: mfratki
+"""
+
+import pandas as pd
+#from abc import abstractmethod
+from pathlib import Path
+from mpcaHydro import etlSWD
+from mpcaHydro import equis, wiski, warehouse
+import duckdb
+
+
+WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent/'data/WISKI_EQUIS_XREF.csv')
+#WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
+
+AGG_DEFAULTS = {'cfs': 'mean',
+                'mg/l': 'mean',
+                'degF': 'mean',
+                'lb': 'sum'}
+
+UNIT_DEFAULTS = {'Q': 'cfs',
+                 'QB': 'cfs',
+                 'TSS': 'mg/l',
+                 'TP': 'mg/l',
+                 'OP': 'mg/l',
+                 'TKN': 'mg/l',
+                 'N': 'mg/l',
+                 'WT': 'degF',
+                 'WL': 'ft'}
+
+
+def are_lists_identical(nested_list):
+    # Sort each sublist, then compare every sublist to the first one.
+    sorted_sublists = [sorted(sublist) for sublist in nested_list]
+    return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
+
+
+def construct_database(folderpath):
+    folderpath = Path(folderpath)
+    db_path = folderpath.joinpath('observations.duckdb').as_posix()
+    with duckdb.connect(db_path) as con:
+        con.execute("DROP TABLE IF EXISTS observations")
+        datafiles = folderpath.joinpath('*.csv').as_posix()
+        query = '''
+            CREATE TABLE observations AS
+            SELECT * FROM read_csv_auto(?, union_by_name = true);
+        '''
+        con.execute(query, [datafiles])
+
+
+def build_warehouse(folderpath):
+    folderpath = Path(folderpath)
+    db_path = folderpath.joinpath('observations.duckdb').as_posix()
+    warehouse.init_db(db_path)
+
+
+def constituent_summary(db_path):
+    with duckdb.connect(db_path) as con:
+        query = '''
+            SELECT
+                station_id,
+                station_origin,
+                constituent,
+                COUNT(*) AS sample_count,
+                year(MIN(datetime)) AS start_date,
+                year(MAX(datetime)) AS end_date
+            FROM observations
+            GROUP BY constituent, station_id, station_origin
+            ORDER BY sample_count;'''
+        res = con.execute(query)
+        return res.fetch_df()
+
+
+class dataManager():
+
+    def __init__(self, folderpath, oracle_user=None, oracle_password=None):
+        self.data = {}
+        self.folderpath = Path(folderpath)
+        self.db_path = self.folderpath.joinpath('observations.duckdb')
+        self.oracle_user = oracle_user
+        self.oracle_password = oracle_password
+
+    def connect_to_oracle(self):
+        assert self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin'
+        equis.connect(user=self.oracle_user, password=self.oracle_password)
+
+    def credentials_exist(self):
+        return (self.oracle_user is not None) and (self.oracle_password is not None)
+
+    def _reconstruct_database(self):
+        construct_database(self.folderpath)
+
+    def _build_warehouse(self):
+        build_warehouse(self.folderpath)
+
+    def constituent_summary(self, constituents=None):
+        with duckdb.connect(self.db_path) as con:
+            if constituents is None:
+                constituents = con.query('''
+                    SELECT DISTINCT constituent
+                    FROM observations''').to_df()['constituent'].to_list()
+
+            query = '''
+                SELECT
+                    station_id,
+                    station_origin,
+                    constituent,
+                    COUNT(*) AS sample_count,
+                    year(MIN(datetime)) AS start_date,
+                    year(MAX(datetime)) AS end_date
+                FROM observations
+                WHERE constituent IN (SELECT UNNEST(?))
+                GROUP BY constituent, station_id, station_origin
+                ORDER BY constituent, sample_count;'''
+
+            df = con.execute(query, [constituents]).fetch_df()
+        return df
+
+    def get_wiski_stations(self):
+        return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
+
+    def get_equis_stations(self):
+        return list(WISKI_EQUIS_XREF['EQUIS_STATION_ID'].unique())
+
+    def wiski_equis_alias(self, wiski_station_id):
+        equis_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id, 'WISKI_EQUIS_ID'].to_list()))
+        equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+        if len(equis_ids) == 0:
+            return []
+        elif len(equis_ids) > 1:
+            raise ValueError(f'Too Many Equis Stations for {wiski_station_id}')
+        else:
+            return equis_ids[0]
+
+    def wiski_equis_associations(self, wiski_station_id):
+        equis_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id, 'EQUIS_STATION_ID'].unique())
+        return [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+
+    def equis_wiski_associations(self, equis_station_id):
+        wiski_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id, 'WISKI_STATION_NO'].unique())
+        return [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+
+    def equis_wiski_alias(self, equis_station_id):
+        wiski_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id, 'WISKI_STATION_NO'].to_list()))
+        wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+        if len(wiski_ids) == 0:
+            return []
+        elif len(wiski_ids) > 1:
+            raise ValueError(f'Too Many WISKI Stations for {equis_station_id}')
+        else:
+            return wiski_ids[0]
+
+    def _equis_wiski_associations(self, equis_station_ids):
+        wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
+        if wiski_stations and are_lists_identical(wiski_stations):
+            return wiski_stations[0]
+        else:
+            return []
+
+    def _stations_by_wid(self, wid_no, station_origin):
+        if station_origin in ['wiski', 'wplmn']:
+            station_col = 'WISKI_STATION_NO'
+        elif station_origin in ['equis', 'swd']:
+            station_col = 'EQUIS_STATION_ID'
+        else:
+            raise ValueError(f'Unknown station_origin: {station_origin}')
+        return list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no, station_col].unique())
+
+    def download_stations_by_wid(self, wid_no, station_origin, folderpath=None, overwrite=False):
+        # _stations_by_wid returns a plain list of station ids.
+        station_ids = self._stations_by_wid(wid_no, station_origin)
+        for station_id in station_ids:
+            self.download_station_data(station_id, station_origin,
+                                       folderpath=folderpath, overwrite=overwrite)
+
+    def _download_station_data(self, station_id, station_origin, overwrite=False):
+        assert station_origin in ['wiski', 'equis', 'swd', 'wplmn']
+        self.download_station_data(station_id, station_origin, overwrite=overwrite)
+
+    def download_station_data(self, station_id, station_origin, start_year=1996, end_year=2030,
+                              folderpath=None, overwrite=False, baseflow_method='Boughton'):
+        assert station_origin in ['wiski', 'equis', 'swd', 'wplmn']
+        station_id = str(station_id)
+        save_name = station_id
+        if station_origin == 'wplmn':
+            save_name = station_id + '_wplmn'
+
+        if folderpath is None:
+            folderpath = self.folderpath
+        else:
+            folderpath = Path(folderpath)
+
+        if folderpath.joinpath(save_name + '.csv').exists() and not overwrite:
+            print(f'{station_id} data already downloaded')
+            return
+
+        if station_origin == 'wiski':
+            data = wiski.transform(wiski.download([station_id], wplmn=False), baseflow_method=baseflow_method)
+        elif station_origin == 'swd':
+            data = etlSWD.download(station_id)
+        elif station_origin == 'equis':
+            assert self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin'
+            data = equis.transform(equis.download([station_id]))
+        else:
+            data = wiski.transform(wiski.download([station_id], wplmn=True), baseflow_method=baseflow_method)
+
+        if len(data) > 0:
+            data.to_csv(folderpath.joinpath(save_name + '.csv'))
+            self.data[station_id] = data
+        else:
+            print(f'No {station_origin} calibration data available at Station {station_id}')
+
+    def _load(self, station_id):
+        with duckdb.connect(self.db_path) as con:
+            query = '''
+                SELECT *
+                FROM analytics.observations
+                WHERE station_id = ?'''
+            df = con.execute(query, [station_id]).fetch_df()
+        df.set_index('datetime', inplace=True)
+        self.data[station_id] = df
+        return df
+
+    def _load2(self, station_id):
+        df = pd.read_csv(self.folderpath.joinpath(station_id + '.csv'),
+                         index_col='datetime',
+                         parse_dates=['datetime'],
+                         #usecols=['Ts Date','Station number','variable', 'value','reach_id'],
+                         dtype={'station_id': str, 'value': float, 'variable': str, 'constituent': str, 'unit': str})
+        self.data[station_id] = df
+        return df
+
+    def load(self, station_id):
+        try:
+            df = self.data[station_id]
+        except KeyError:
+            df = self._load(station_id)
+        return df
+
+    def info(self, constituent):
+        return pd.concat([self._load(file.stem) for file in self.folderpath.iterdir()
+                          if file.suffix == '.csv'])[['station_id', 'constituent', 'value']].groupby(by=['station_id', 'constituent']).count()
+
+    def get_wplmn_data(self, station_id, constituent, unit='mg/l', agg_period='YE', samples_only=True):
+        assert constituent in ['Q', 'TSS', 'TP', 'OP', 'TKN', 'N', 'WT', 'DO', 'WL', 'CHLA']
+        station_id = station_id + '_wplmn'
+        dfsub = self._load(station_id)
+
+        agg_func = AGG_DEFAULTS.get(unit, 'mean')
+        if samples_only:
+            dfsub = dfsub.loc[dfsub['quality_id'] == 3]
+            agg_func = 'mean'
+
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                          (dfsub['unit'] == unit),
+                          ['value', 'station_origin']]
+
+        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+        if not df.empty:
+            df['station_origin'] = dfsub['station_origin'].iloc[0]
+
+        #if (constituent == 'TSS') & (unit == 'lb'): #convert TSS from lbs to us tons
+        #    dfsub['value'] = dfsub['value']/2000
+
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+        return df['value'].to_frame().dropna()
+
+    def get_data(self, station_id, constituent, agg_period='D'):
+        return self._get_data([station_id], constituent, agg_period)
+
+    def _get_data(self, station_ids, constituent, agg_period='D', tz_offset='-6'):
+        '''
+        Returns the processed observational data associated with the calibration-specific ids.
+
+        Parameters
+        ----------
+        station_ids : list of str
+            Station IDs as strings.
+        constituent : str
+            Constituent abbreviation used for calibration. Valid options:
+            'Q', 'QB', 'TSS', 'TP', 'OP', 'TKN', 'N', 'WT', 'DO', 'WL', 'CHLA'
+        agg_period : str, optional
+            Pandas resampling period. The default is 'D' (daily).
+
+        Returns
+        -------
+        pandas.DataFrame
+            Single-column frame of values; the unit and constituent are
+            carried in df.attrs rather than as columns.
+        '''
+        assert constituent in ['Q', 'QB', 'TSS', 'TP', 'OP', 'TKN', 'N', 'WT', 'DO', 'WL', 'CHLA']
+
+        unit = UNIT_DEFAULTS[constituent]
+        agg_func = AGG_DEFAULTS[unit]
+
+        dfsub = pd.concat([self.load(station_id) for station_id in station_ids])  # Check cache
+        dfsub.index = dfsub.index.tz_localize(None)  # Drop timezone info
+        dfsub.rename(columns={'source': 'station_origin'}, inplace=True)
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                          (dfsub['unit'] == unit),
+                          ['value', 'station_origin']]
+
+        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+
+        if df.empty:
+            return df
+
+        df['station_origin'] = dfsub['station_origin'].iloc[0]
+
+        # convert to desired timezone before stripping timezone information.
+        #df.index.tz_convert('UTC-06:00').tz_localize(None)
+        return df['value'].to_frame().dropna()
+
+
+def validate_constituent(constituent):
+    assert constituent in ['Q', 'TSS', 'TP', 'OP', 'TKN', 'N', 'WT', 'DO', 'WL', 'CHLA']
+
+
+def validate_unit(unit):
+    assert unit in ['mg/l', 'lb', 'cfs', 'degF']
+
+
+# class database():
+#     def __init__(self,db_path):
+#         self.dbm = MonitoringDatabase(db_path)
+
+#     def get_timeseries(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[constituent]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_timeseries(station_id,constituent)
+
+#     def get_samples(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[constituent]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_sample(station_id,constituent)
+
+#     def get_samples_and_timeseries(self,station_ds, constituent,agg_period)
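A short usage sketch of the new module. The folder path and station ID are invented; the flow mirrors the methods above: download per-station CSVs, rebuild the DuckDB file, layer the warehouse on top, then query:

```python
from mpcaHydro.data_manager import dataManager

dm = dataManager('/data/obs')                  # folder that will hold station CSVs
dm.download_station_data('39099001', 'wiski')  # writes 39099001.csv (invented ID)
dm._reconstruct_database()                     # observations.duckdb from *.csv
dm._build_warehouse()                          # warehouse.init_db() on top of it
flow = dm.get_data('39099001', 'Q', agg_period='D')  # daily-mean flow in cfs
```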
@@ -164,25 +164,26 @@ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset:
     aware_src = naive.replace(tzinfo=src_tz)
 
     # convert the instant to fixed UTC-6
-    return aware_src.astimezone(target_offset).tz_localize(None)
+    return aware_src.astimezone(target_offset)
 
 
 def normalize_columns(df):
     '''Select relevant columns from Equis data.'''
     return df[['SYS_LOC_CODE',
-               'constituent',
-               'CAS_RN',
                'datetime',
                'RESULT_NUMERIC',
                'RESULT_UNIT',
+               'constituent'
                ]].rename(columns={
                    'SYS_LOC_CODE':'station_id',
                    'RESULT_NUMERIC':'value',
-                   'RESULT_UNIT':'unit',
-                   'CAS_RN':'cas_rn'
+                   'RESULT_UNIT':'unit'
                })
 
-
+def replace_nondetects(df):
+    '''Replace non-detect results with 0 in Equis data.'''
+    df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
+    return df
 
 def normalize_timezone(df):
     '''Normalize datetime to UTC in Equis data.'''
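`replace_nondetects` now zero-fills the raw `RESULT_NUMERIC` column before any renaming, matching its new position at the head of `transform` below. A toy example:

```python
import pandas as pd

df = pd.DataFrame({'SYS_LOC_CODE': ['S1', 'S1'],
                   'RESULT_NUMERIC': [0.05, None]})  # None = non-detect
df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
print(df['RESULT_NUMERIC'].tolist())  # [0.05, 0.0]
```

Note that the rule treats every missing result as a non-detect reported at zero.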
@@ -193,27 +194,27 @@ def normalize_timezone(df):
         except Exception:
             return pd.NaT
 
-    df.loc[:,'datetime'] = df.apply(_conv, axis=1)
+    df['datetime'] = df.apply(_conv, axis=1)
     return df
 
 def convert_units(df):
     '''Convert units in Equis data to standard units.'''
     # Convert ug/L to mg/L
-    df['unit'] = df['unit'].str.lower()
+    df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()
 
-    mask_ugL = df['unit'] == 'ug/l'
-    df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
-    df.loc[mask_ugL, 'unit'] = 'mg/l'
+    mask_ugL = df['RESULT_UNIT'] == 'ug/l'
+    df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000
+    df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'
 
     # Convert mg/g to mg/L (assuming density of 1 g/mL)
-    mask_mgg = df['unit'] == 'mg/g'
-    df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
-    df.loc[mask_mgg, 'unit'] = 'mg/l'
+    mask_mgg = df['RESULT_UNIT'] == 'mg/g'
+    df.loc[mask_mgg, 'RESULT_NUMERIC'] = df.loc[mask_mgg, 'RESULT_NUMERIC'] * 1000
+    df.loc[mask_mgg, 'RESULT_UNIT'] = 'mg/l'
 
     # Convert deg C to degF
-    mask_degC = df['unit'].isin(['deg c', 'degc'])
-    df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9/5) + 32
-    df.loc[mask_degC, 'unit'] = 'degf'
+    mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
+    df.loc[mask_degC, 'RESULT_NUMERIC'] = (df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5) + 32
+    df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'
 
     return df
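Because `convert_units` now runs before `normalize_columns`, it operates on the raw `RESULT_UNIT`/`RESULT_NUMERIC` names. A quick check of the ug/L and deg C branches:

```python
import pandas as pd

df = pd.DataFrame({'RESULT_NUMERIC': [250.0, 20.0],
                   'RESULT_UNIT': ['ug/L', 'deg C']})
df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()

mask_ugL = df['RESULT_UNIT'] == 'ug/l'
df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000  # 250 -> 0.25
df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'

mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
df.loc[mask_degC, 'RESULT_NUMERIC'] = (df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5) + 32  # 20 C -> 68 F
df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'
print(df)
```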
@@ -231,25 +232,15 @@ def average_results(df):
         value=('value', 'mean')
     ).reset_index()
 
-def replace_nondetects(df):
-    '''Replace non-detect results with 0 in Equis data.'''
-    df.loc[df['value'].isna(), 'value'] = 0
-    return df
-
-def normalize(df):
-    '''Normalize Equis data: select relevant columns.'''
-    df = map_constituents(df)
-    df = normalize_timezone(df)
-    df = normalize_columns(df)
-    df = convert_units(df)
-    return df
-
 def transform(df):
     '''Transform Equis data: handle non-detects, convert units, map constituents.'''
-    df = normalize(df)
     df = replace_nondetects(df)
     if not df.empty:
+        df = normalize_timezone(df)
+        df = convert_units(df)
+        df = map_constituents(df)
+        df = normalize_columns(df)
         df = average_results(df)
     return df
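With the reordering, every normalization step now sits inside the non-empty guard: zero-fill non-detects, localize timestamps, convert units, map constituents, select and rename columns, then hourly-average. A sketch of driving it end to end, assuming `equis.connect`/`equis.download` work as data_manager.py uses them (the station ID is invented, and oracledb must now be installed separately):

```python
from mpcaHydro import equis

equis.connect(user='me', password='secret')  # requires a separate oracledb install
df = equis.transform(equis.download(['S000-123']))
```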