mpcaHydro 2.0.6.tar.gz → 2.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/PKG-INFO +1 -3
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/pyproject.toml +2 -4
- mpcahydro-2.0.6/src/mpcaHydro/wiski.py → mpcahydro-2.1.0/src/mpcaHydro/WISKI.py +12 -40
- mpcahydro-2.1.0/src/mpcaHydro/data_manager.py +412 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/equis.py +22 -31
- mpcahydro-2.1.0/src/mpcaHydro/warehouse.py +203 -0
- mpcahydro-2.1.0/tests/pixi.toml +25 -0
- mpcahydro-2.0.6/ERROR.FIL +0 -6
- mpcahydro-2.0.6/demo.py +0 -167
- mpcahydro-2.0.6/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv +0 -71
- mpcahydro-2.0.6/src/mpcaHydro/data/outlets.duckdb +0 -0
- mpcahydro-2.0.6/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcahydro-2.0.6/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcahydro-2.0.6/src/mpcaHydro/data_manager.py +0 -247
- mpcahydro-2.0.6/src/mpcaHydro/outlets.py +0 -371
- mpcahydro-2.0.6/src/mpcaHydro/reports.py +0 -80
- mpcahydro-2.0.6/src/mpcaHydro/warehouse.py +0 -581
- mpcahydro-2.0.6/src/mpcaHydro/warehouseManager.py +0 -47
- mpcahydro-2.0.6/src/mpcaHydro/xref.py +0 -74
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/.gitattributes +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/.gitignore +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/README.md +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/__init__.py +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/data/EQUIS_PARAMETER_XREF.csv +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/data/WISKI_EQUIS_XREF.csv +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/etlCSG.py +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/etlSWD.py +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/etlWISKI.py +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/etlWPLMN.py +0 -0
- {mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/pywisk.py +0 -0
{mpcahydro-2.0.6 → mpcahydro-2.1.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mpcaHydro
-Version: 2.0.6
+Version: 2.1.0
 Summary: Python package for downloading MPCA hydrology data
 Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
 Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
```
```diff
@@ -10,9 +10,7 @@ Keywords: Hydrology,MPCA
 Classifier: Development Status :: 3 - Alpha
 Classifier: Programming Language :: Python
 Requires-Python: >=3.8
-Requires-Dist: baseflow
 Requires-Dist: duckdb
-Requires-Dist: oracledb
 Requires-Dist: pandas
 Requires-Dist: pathlib
 Requires-Dist: requests
```
{mpcahydro-2.0.6 → mpcahydro-2.1.0}/pyproject.toml

```diff
@@ -5,14 +5,12 @@ build-backend = "hatchling.build"
 [project]
 name = "mpcaHydro"
 urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" } # ? Add this!
-version = "2.0.6"
+version = "2.1.0"
 dependencies = [
     "pandas",
     "requests",
     "pathlib",
-    "duckdb",
-    "oracledb",
-    "baseflow"
+    "duckdb"
 ]
 requires-python = ">=3.8"
 authors = [
```
mpcahydro-2.0.6/src/mpcaHydro/wiski.py → mpcahydro-2.1.0/src/mpcaHydro/WISKI.py

```diff
@@ -157,7 +157,7 @@ def download_chunk(ts_id,start_year = 1996,end_year = 2030, interval = 4, as_jso
            end = end_year
        df = pywisk.get_ts(ts_id,start_date = f'{start}-01-01',end_date = f'{end}-12-31',as_json = as_json)
        if not df.empty: frames.append(df)
-       df
+       df.index = pd.to_datetime(df['Timestamp'])
        time.sleep(.1)
    return pd.concat(frames)

```
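The corrected line gives each downloaded chunk a proper DatetimeIndex before the chunks are concatenated. A toy pandas sketch (made-up data, not mpcaHydro code) of the effect; note the assignment mutates the same object already appended to `frames`:

```python
# Toy illustration: converting the 'Timestamp' column into a DatetimeIndex
# per chunk yields one continuous time-indexed frame after pd.concat.
import pandas as pd

chunk1 = pd.DataFrame({'Timestamp': ['1996-01-01', '1996-01-02'], 'Value': [1.0, 2.0]})
chunk2 = pd.DataFrame({'Timestamp': ['2000-01-01'], 'Value': [3.0]})

frames = []
for df in (chunk1, chunk2):
    frames.append(df)
    df.index = pd.to_datetime(df['Timestamp'])  # mutates the object held in frames

combined = pd.concat(frames)
print(combined.index)  # DatetimeIndex spanning both chunks
```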
```diff
@@ -197,8 +197,11 @@ def tkn(station_nos,start_year = 1996,end_year = 2030):
    return _download('TKN',station_nos,start_year,end_year)


-
-
+def filter_quality_codes(df):
+    '''
+    Filter dataframe by valid quality codes
+    '''
+    return df.loc[df['Quality Code'].isin(DATA_CODES)]

def convert_units(df):
    '''
```
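The new `filter_quality_codes` closes over a module-level `DATA_CODES` whitelist instead of taking one as an argument, and it now filters on the raw `Quality Code` column. A hedged sketch with made-up codes (the real values live in WISKI.py):

```python
# Toy sketch of the 2.1.0 filter; DATA_CODES below is hypothetical.
import pandas as pd

DATA_CODES = [0, 40]  # made-up "good data" quality codes

def filter_quality_codes(df):
    return df.loc[df['Quality Code'].isin(DATA_CODES)]

raw = pd.DataFrame({'Value': [1.2, 3.4, 5.6], 'Quality Code': [0, 255, 40]})
print(filter_quality_codes(raw))  # only the rows with codes 0 and 40 survive
```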
```diff
@@ -230,22 +233,12 @@ def normalize_columns(df):
                       'station_no':'station_id',
                       'Timestamp':'datetime',
                       'Value':'value',
-                      'ts_unitsymbol':'unit',
-                      'Quality Code':'quality_code',
-                      'Quality Code Name':'quality_code_name'}, inplace=True)
    return df

-
-
-def filter_quality_codes(df, data_codes):
-    '''
-    Filter dataframe by valid quality codes
-    '''
-    return df.loc[df['quality_code'].isin(data_codes)]
-
+                      'ts_unitsymbol':'unit'}, inplace=True)
 def average_results(df):
-
-    df
+    df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
+    df['datetime'] = df['datetime'].dt.round('h')
    return df.groupby(['station_id', 'datetime', 'constituent', 'unit']).agg(value=('value', 'mean')).reset_index()
    # Convert units

```
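`average_results` now rounds timestamps to the hour before grouping, so near-coincident readings collapse into one averaged row. A runnable toy illustration of those two lines plus the groupby:

```python
# Toy data: two TP readings 12 minutes apart both round to 10:00 and average.
import pandas as pd

df = pd.DataFrame({
    'station_id': ['S1', 'S1'],
    'datetime': ['2020-06-01 10:14', '2020-06-01 10:26'],
    'constituent': ['TP', 'TP'],
    'unit': ['mg/l', 'mg/l'],
    'value': [0.10, 0.30],
})
df['datetime'] = pd.to_datetime(df.loc[:, 'datetime'])
df['datetime'] = df['datetime'].dt.round('h')
out = df.groupby(['station_id', 'datetime', 'constituent', 'unit']).agg(value=('value', 'mean')).reset_index()
print(out)  # one row at 10:00 with value 0.2
```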
```diff
@@ -274,35 +267,14 @@ def calculate_baseflow(df, method = 'Boughton'):
    return pd.concat(dfs)


-def
+def transform(df, baseflow_method = 'Boughton'):
    '''
-
-    The standardized format includes normalized column names and units.
-    ---
-    Parameters:
-        df (pandas.DataFrame): Raw WISKI data
-    Returns:
-        pandas.DataFrame: Normalized WISKI data
+    Transform raw WISKI data into standardized format
    '''
-
+    df = filter_quality_codes(df)
    df = convert_units(df)
    df = normalize_columns(df)
-    return df
-
-def transform(df, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
-    '''
-    Transform normalized WISKI data into standardized format
-    '''
-    df = normalize(df)
-    if filter_qc_codes:
-        if data_codes is None:
-            data_codes = DATA_CODES
-        df = filter_quality_codes(df, data_codes)
    df = average_results(df)
    df = calculate_baseflow(df, method = baseflow_method)
    df['station_origin'] = 'wiski'
-    #df.set_index('datetime',inplace=True)
    return df
-
-
-
```
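The old `normalize`/`transform` pair collapses into a single `transform(df, baseflow_method='Boughton')` that always filters by the module-level `DATA_CODES`. A hedged sketch of the resulting call pattern, mirroring how data_manager.py (below) drives it; the station number is made up:

```python
# Sketch of the consolidated 2.1.0 pipeline; 'H12345001' is a hypothetical ID.
from mpcaHydro import wiski

raw = wiski.download(['H12345001'], wplmn=False)
tidy = wiski.transform(raw, baseflow_method='Boughton')
# transform now runs: filter_quality_codes -> convert_units ->
# normalize_columns -> average_results -> calculate_baseflow
```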
mpcahydro-2.1.0/src/mpcaHydro/data_manager.py

```diff
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  3 10:01:14 2022
+
+@author: mfratki
+"""
+
+import pandas as pd
+#from abc import abstractmethod
+from pathlib import Path
+from mpcaHydro import etlSWD
+from mpcaHydro import equis, wiski, warehouse
+import duckdb
+
+
+WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent/'data/WISKI_EQUIS_XREF.csv')
+#WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
+
+AGG_DEFAULTS = {'cfs':'mean',
+                'mg/l':'mean',
+                'degF': 'mean',
+                'lb':'sum'}
+
+UNIT_DEFAULTS = {'Q': 'cfs',
+                 'QB': 'cfs',
+                 'TSS': 'mg/l',
+                 'TP' : 'mg/l',
+                 'OP' : 'mg/l',
+                 'TKN': 'mg/l',
+                 'N'  : 'mg/l',
+                 'WT' : 'degF',
+                 'WL' : 'ft'}
+
+def are_lists_identical(nested_list):
+    # Sort each sublist
+    sorted_sublists = [sorted(sublist) for sublist in nested_list]
+    # Compare all sublists to the first one
+    return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
+
+def construct_database(folderpath):
+    folderpath = Path(folderpath)
+    db_path = folderpath.joinpath('observations.duckdb').as_posix()
+    with duckdb.connect(db_path) as con:
+        con.execute("DROP TABLE IF EXISTS observations")
+        datafiles = folderpath.joinpath('*.csv').as_posix()
+        query = '''
+            CREATE TABLE observations AS SELECT *
+            FROM
+                read_csv_auto(?,
+                    union_by_name = true);
+
+        '''
+        con.execute(query,[datafiles])
+
+
+def build_warehouse(folderpath):
+    folderpath = Path(folderpath)
+    db_path = folderpath.joinpath('observations.duckdb').as_posix()
+    warehouse.init_db(db_path)
+
+def constituent_summary(db_path):
+    with duckdb.connect(db_path) as con:
+        query = '''
+            SELECT
+                station_id,
+                station_origin,
+                constituent,
+                COUNT(*) AS sample_count,
+                year(MIN(datetime)) AS start_date,
+                year(MAX(datetime)) AS end_date
+            FROM
+                observations
+            GROUP BY
+                constituent, station_id,station_origin
+            ORDER BY
+                sample_count;'''
+
+        res = con.execute(query)
+        return res.fetch_df()
+
+
+class dataManager():
+
+    def __init__(self,folderpath, oracle_user = None, oracle_password =None):
+
+        self.data = {}
+        self.folderpath = Path(folderpath)
+        self.db_path = self.folderpath.joinpath('observations.duckdb')
+        self.oracle_user = oracle_user
+        self.oracle_password = oracle_password
+
+    def connect_to_oracle(self):
+        assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
+        equis.connect(user = self.oracle_user, password = self.oracle_password)
+
+    def credentials_exist(self):
+        if (self.oracle_user is not None) & (self.oracle_password is not None):
+            return True
+        else:
+            return False
+
+    def _reconstruct_database(self):
+        construct_database(self.folderpath)
+
+    def _build_warehouse(self):
+        build_warehouse(self.folderpath)
+
+    def constituent_summary(self,constituents = None):
+        with duckdb.connect(self.db_path) as con:
+            if constituents is None:
+                constituents = con.query('''
+                    SELECT DISTINCT
+                        constituent
+                    FROM observations''').to_df()['constituent'].to_list()
+
+            query = '''
+                SELECT
+                    station_id,
+                    station_origin,
+                    constituent,
+                    COUNT(*) AS sample_count,
+                    year(MIN(datetime)) AS start_date,
+                    year(MAX(datetime)) AS end_date
+                FROM
+                    observations
+                WHERE
+                    constituent in (SELECT UNNEST(?))
+                GROUP BY
+                    constituent,station_id,station_origin
+                ORDER BY
+                    constituent,sample_count;'''
+
+            df = con.execute(query,[constituents]).fetch_df()
+        return df
+
+    def get_wiski_stations(self):
+        return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
+
+    def get_equis_stations(self):
+        return list(WISKI_EQUIS_XREF['EQUIS_STATION_ID'].unique())
+
+    def wiski_equis_alias(self,wiski_station_id):
+        equis_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'WISKI_EQUIS_ID'].to_list()))
+        equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+        if len(equis_ids) == 0:
+            return []
+        elif len(equis_ids) > 1:
+            print(f'Too Many Equis Stations for {wiski_station_id}')
+            raise
+        else:
+            return equis_ids[0]
+
+    def wiski_equis_associations(self,wiski_station_id):
+        equis_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'EQUIS_STATION_ID'].unique())
+        equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+        if len(equis_ids) == 0:
+            return []
+        else:
+            return equis_ids
+
+    def equis_wiski_associations(self,equis_station_id):
+        wiski_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id,'WISKI_STATION_NO'].unique())
+        wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+        if len(wiski_ids) == 0:
+            return []
+        else:
+            return wiski_ids
+
+    def equis_wiski_alias(self,equis_station_id):
+        wiski_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,'WISKI_STATION_NO'].to_list()))
+        wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+        if len(wiski_ids) == 0:
+            return []
+        elif len(wiski_ids) > 1:
+            print(f'Too Many WISKI Stations for {equis_station_id}')
+            raise
+        else:
+            return wiski_ids[0]
+
+    def _equis_wiski_associations(self,equis_station_ids):
+        wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
+        if are_lists_identical(wiski_stations):
+            return wiski_stations[0]
+        else:
+            return []
+
+    def _stations_by_wid(self,wid_no,station_origin):
+        if station_origin in ['wiski','wplmn']:
+            station_col = 'WISKI_STATION_NO'
+        elif station_origin in ['equis','swd']:
+            station_col = 'EQUIS_STATION_ID'
+        else:
+            raise
+
+        return list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no,station_col].unique())
+
+
+    def download_stations_by_wid(self, wid_no,station_origin, folderpath = None, overwrite = False):
+
+        station_ids = self._station_by_wid(wid_no,station_origin)
+
+        if not station_ids.empty:
+            for _, row in station_ids.iterrows():
+                self.download_station_data(row['station_id'],station_origin, folderpath, overwrite)
+
+    def _download_station_data(self,station_id,station_origin,overwrite=False):
+        assert(station_origin in ['wiski','equis','swd','wplmn'])
+        if station_origin == 'wiski':
+            self.download_station_data(station_id,'wiski',overwrite = overwrite)
+        elif station_origin == 'wplmn':
+            self.download_station_data(station_id,'wplmn',overwrite = overwrite)
+        elif station_origin == 'swd':
+            self.download_station_data(station_id,'swd',overwrite = overwrite)
+        else:
+            self.download_station_data(station_id,'equis',overwrite = overwrite)
+
+
+
+
+    def download_station_data(self,station_id,station_origin,start_year = 1996, end_year = 2030,folderpath=None,overwrite = False,baseflow_method = 'Boughton'):
+        assert(station_origin in ['wiski','equis','swd','wplmn'])
+        station_id = str(station_id)
+        save_name = station_id
+        if station_origin == 'wplmn':
+            save_name = station_id + '_wplmn'
+
+        if folderpath is None:
+            folderpath = self.folderpath
+        else:
+            folderpath = Path(folderpath)
+
+
+        if (folderpath.joinpath(save_name + '.csv').exists()) & (not overwrite):
+            print (f'{station_id} data already downloaded')
+            return
+
+        if station_origin == 'wiski':
+            data = wiski.transform(wiski.download([station_id],wplmn=False, baseflow_method = baseflow_method))
+        elif station_origin == 'swd':
+            data = etlSWD.download(station_id)
+        elif station_origin == 'equis':
+            assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
+            data = equis.transform(equis.download([station_id]))
+        else:
+            data = wiski.transform(wiski.download([station_id],wplmn=True, baseflow_method = baseflow_method))
+
+
+
+
+        if len(data) > 0:
+            data.to_csv(folderpath.joinpath(save_name + '.csv'))
+            self.data[station_id] = data
+        else:
+            print(f'No {station_origin} calibration cata available at Station {station_id}')
+
+    def _load(self,station_id):
+        with duckdb.connect(self.db_path) as con:
+            query = '''
+                SELECT *
+                FROM analytics.observations
+                WHERE station_id = ?'''
+            df = con.execute(query,[station_id]).fetch_df()
+        df.set_index('datetime',inplace=True)
+        self.data[station_id] = df
+        return df
+
+    def _load2(self,station_id):
+        df = pd.read_csv(self.folderpath.joinpath(station_id + '.csv'),
+                         index_col='datetime',
+                         parse_dates=['datetime'],
+                         #usecols=['Ts Date','Station number','variable', 'value','reach_id'],
+                         dtype={'station_id': str, 'value': float, 'variable': str,'constituent':str,'unit':str})
+        self.data[station_id] = df
+        return df
+
+    def load(self,station_id):
+        try:
+            df = self.data[station_id]
+        except:
+            df = self._load(station_id)
+        return df
+
+    def info(self,constituent):
+        return pd.concat([self._load(file.stem) for file in self.folderpath.iterdir() if file.suffix == '.csv'])[['station_id','constituent','value']].groupby(by = ['station_id','constituent']).count()
+
+    def get_wplmn_data(self,station_id,constituent,unit = 'mg/l', agg_period = 'YE', samples_only = True):
+
+        assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+        station_id = station_id + '_wplmn'
+        dfsub = self._load(station_id)
+
+        if samples_only:
+            dfsub = dfsub.loc[dfsub['quality_id'] == 3]
+        agg_func = 'mean'
+
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                          (dfsub['unit'] == unit),
+                          ['value','station_origin']]
+
+
+        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+
+        if df.empty:
+            dfsub = df
+        else:
+
+            df['station_origin'] = dfsub['station_origin'].iloc[0]
+
+        #if (constituent == 'TSS') & (unit == 'lb'): #convert TSS from lbs to us tons
+        #    dfsub['value'] = dfsub['value']/2000
+
+        #dfsub = dfsub.resample('H').mean().dropna()
+
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+        return df['value'].to_frame().dropna()
+
+    def get_data(self,station_id,constituent,agg_period = 'D'):
+        return self._get_data([station_id],constituent,agg_period)
+
+    def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
+        '''
+
+        Returns the processed observational data associated with the calibration specific id.
+
+
+        Parameters
+        ----------
+        station_id : STR
+            Station ID as a string
+        constituent : TYPE
+            Constituent abbreviation used for calibration. Valid options:
+                'Q',
+                'TSS',
+                'TP',
+                'OP',
+                'TKN',
+                'N',
+                'WT',
+                'DO',
+                'WL']
+        unit : TYPE, optional
+            Units of data. The default is 'mg/l'.
+        sample_flag : TYPE, optional
+            For WPLMN data this flag determines modeled loads are returned. The default is False.
+
+        Returns
+        -------
+        dfsub : Pands.Series
+            Pandas series of data. Note that no metadata is returned.
+
+        '''
+
+        assert constituent in ['Q','QB','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
+        unit = UNIT_DEFAULTS[constituent]
+        agg_func = AGG_DEFAULTS[unit]
+
+        dfsub = pd.concat([self.load(station_id) for station_id in station_ids]) # Check cache
+        dfsub.index = dfsub.index.tz_localize(None) # Drop timezone info
+        #dfsub.set_index('datetime',drop=True,inplace=True)
+        dfsub.rename(columns={'source':'station_origin'},inplace=True)
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                          (dfsub['unit'] == unit),
+                          ['value','station_origin']]
+
+        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+
+        if df.empty:
+
+            return df
+        else:
+
+            df['station_origin'] = dfsub['station_origin'].iloc[0]
+
+
+        # convert to desired timzone before stripping timezone information.
+        #df.index.tz_convert('UTC-06:00').tz_localize(None)
+
+        return df['value'].to_frame().dropna()
+
+
+def validate_constituent(constituent):
+    assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
+def validate_unit(unit):
+    assert(unit in ['mg/l','lb','cfs','degF'])
+
+
+
+# class database():
+#     def __init__(self,db_path):
+#         self.dbm = MonitoringDatabase(db_path)
+
+
+#     def get_timeseries(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[constituent]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_timeseries(station_id,constituent)
+
+
+#     def get_samples(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[unit]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_sample(station_id,constituent)
+
+#     def get_samples_and_timeseries(self,station_ds, constituent,agg_period)
+
```
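A hedged sketch of how the rewritten module might be driven end to end; the folder and station number are hypothetical, and Oracle credentials are only needed when `station_origin` is 'equis'. All names below come from the file above:

```python
# Hypothetical end-to-end use of the new data_manager module.
from mpcaHydro import data_manager

dm = data_manager.dataManager('C:/data/obs')     # made-up working folder
dm.download_station_data('H12345001', 'wiski')   # writes H12345001.csv to the folder
data_manager.construct_database('C:/data/obs')   # rebuilds observations.duckdb from the CSVs
print(dm.constituent_summary())                  # sample counts per station/constituent/origin
```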
{mpcahydro-2.0.6 → mpcahydro-2.1.0}/src/mpcaHydro/equis.py

```diff
@@ -164,25 +164,26 @@ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset:
    aware_src = naive.replace(tzinfo=src_tz)

    # convert the instant to fixed UTC-6
-    return aware_src.astimezone(target_offset)
+    return aware_src.astimezone(target_offset)


def normalize_columns(df):
    '''Select relevant columns from Equis data.'''
    return df[['SYS_LOC_CODE',
-              'constituent',
-              'CAS_RN',
               'datetime',
               'RESULT_NUMERIC',
               'RESULT_UNIT',
+              'constituent'
              ]].rename(columns={
                  'SYS_LOC_CODE':'station_id',
                  'RESULT_NUMERIC':'value',
-                 'RESULT_UNIT':'unit',
-                 'CAS_RN':'cas_rn'
+                 'RESULT_UNIT':'unit'
              })

-
+def replace_nondetects(df):
+    '''Replace non-detect results with 0 in Equis data.'''
+    df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
+    return df

def normalize_timezone(df):
    '''Normalize datetime to UTC in Equis data.'''
```
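The relocated `replace_nondetects` now zero-fills the raw `RESULT_NUMERIC` column, before any renaming. A toy illustration of that one assignment:

```python
# Toy data: non-detect rows arrive with RESULT_NUMERIC missing and are zero-filled.
import pandas as pd

df = pd.DataFrame({'RESULT_NUMERIC': [0.25, None, 1.1]})
df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
print(df['RESULT_NUMERIC'].tolist())  # [0.25, 0.0, 1.1]
```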
```diff
@@ -193,27 +194,27 @@ def normalize_timezone(df):
        except Exception:
            return pd.NaT

-    df
+    df['datetime'] = df.apply(_conv, axis=1)
    return df

def convert_units(df):
    '''Convert units in Equis data to standard units.'''
    # Convert ug/L to mg/L
-    df['
+    df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()

-    mask_ugL = df['
-    df.loc[mask_ugL, '
-    df.loc[mask_ugL, '
+    mask_ugL = df['RESULT_UNIT'] == 'ug/l'
+    df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000
+    df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'

    # Convert mg/g to mg/L (assuming density of 1 g/mL)
-    mask_mgg = df['
-    df.loc[mask_mgg, '
-    df.loc[mask_mgg, '
+    mask_mgg = df['RESULT_UNIT'] == 'mg/g'
+    df.loc[mask_mgg, 'RESULT_NUMERIC'] = df.loc[mask_mgg, 'RESULT_NUMERIC'] * 1000
+    df.loc[mask_mgg, 'RESULT_UNIT'] = 'mg/l'

    # Convert deg C to degF
-    mask_degC = df['
-    df.loc[mask_degC, '
-    df.loc[mask_degC, '
+    mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
+    df.loc[mask_degC, 'RESULT_NUMERIC'] = (df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5) + 32
+    df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'

    return df

```
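The reworked `convert_units` lower-cases `RESULT_UNIT` first, then converts in place on the raw columns. A runnable spot-check (toy data) of the ug/L and deg C branches:

```python
# 20 deg C -> 68 degF; 500 ug/l -> 0.5 mg/l, after lower-casing the unit column.
import pandas as pd

df = pd.DataFrame({'RESULT_NUMERIC': [20.0, 500.0], 'RESULT_UNIT': ['Deg C', 'ug/L']})
df['RESULT_UNIT'] = df['RESULT_UNIT'].str.lower()

mask_degC = df['RESULT_UNIT'].isin(['deg c', 'degc'])
df.loc[mask_degC, 'RESULT_NUMERIC'] = (df.loc[mask_degC, 'RESULT_NUMERIC'] * 9/5) + 32
df.loc[mask_degC, 'RESULT_UNIT'] = 'degf'

mask_ugL = df['RESULT_UNIT'] == 'ug/l'
df.loc[mask_ugL, 'RESULT_NUMERIC'] = df.loc[mask_ugL, 'RESULT_NUMERIC'] / 1000
df.loc[mask_ugL, 'RESULT_UNIT'] = 'mg/l'

print(df.values.tolist())  # [[68.0, 'degf'], [0.5, 'mg/l']]
```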
```diff
@@ -231,25 +232,15 @@ def average_results(df):
        value=('value', 'mean')
    ).reset_index()

-def replace_nondetects(df):
-    '''Replace non-detect results with 0 in Equis data.'''
-    df.loc[df['value'].isna(), 'value'] = 0
-    return df
-
-def normalize(df):
-    '''Normalize Equis data: select relevant columns.'''
-    df = map_constituents(df)
-    df = normalize_timezone(df)
-    df = normalize_columns(df)
-    df = convert_units(df)
-    return df
-
def transform(df):
    '''Transform Equis data: handle non-detects, convert units, map constituents.'''

-    df = normalize(df)
    df = replace_nondetects(df)
    if not df.empty:
+        df = normalize_timezone(df)
+        df = convert_units(df)
+        df = map_constituents(df)
+        df = normalize_columns(df)
        df = average_results(df)
    return df

```
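In the reordered `transform`, non-detect handling runs first on the raw columns and every later step is guarded by an emptiness check. A runnable miniature of that control flow, with stand-in steps in place of the real helpers above:

```python
# Control-flow sketch only: 'steps' stands in for normalize_timezone,
# convert_units, map_constituents, normalize_columns, average_results.
import pandas as pd

def replace_nondetects(df):
    df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
    return df

def transform(df, steps=()):
    df = replace_nondetects(df)   # safe even on an empty frame
    if not df.empty:              # the remaining steps assume rows exist
        for step in steps:
            df = step(df)
    return df

empty = pd.DataFrame(columns=['RESULT_NUMERIC'])
print(transform(empty).empty)  # True: the pipeline short-circuits on empty pulls
```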