mpcaHydro 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/data_manager.py
CHANGED
|
@@ -9,7 +9,7 @@ import pandas as pd
|
|
|
9
9
|
#from abc import abstractmethod
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from mpcaHydro import etlWISKI, etlSWD#, etlEQUIS
|
|
12
|
-
|
|
12
|
+
import duckdb
|
|
13
13
|
|
|
14
14
|
#
|
|
15
15
|
'''
|
|
@@ -69,12 +69,59 @@ def are_lists_identical(nested_list):
|
|
|
69
69
|
# Compare all sublists to the first one
|
|
70
70
|
return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
|
|
71
71
|
|
|
72
|
+
def construct_database(folderpath):
    '''
    Build (or rebuild) the ``observations`` table from the CSV files in a folder.

    The DuckDB file ``observations.duckdb`` is created inside ``folderpath``
    and every ``*.csv`` directly inside that folder is loaded into a single
    ``observations`` table. Columns are matched by name
    (``union_by_name = true``), so the files do not need identical layouts.

    Parameters
    ----------
    folderpath : str or Path
        Folder holding the CSV files; the database file is written here too.

    Returns
    -------
    None
    '''
    folderpath = Path(folderpath)
    db_path = folderpath.joinpath('observations.duckdb').as_posix()
    # The glob pattern is passed as a bound parameter; DuckDB expands it.
    datafiles = folderpath.joinpath('*.csv').as_posix()

    # CREATE OR REPLACE swaps the table in a single statement. The previous
    # DROP-then-CREATE sequence left the database without an observations
    # table whenever the CSV read failed (e.g. no matching files).
    query = '''
        CREATE OR REPLACE TABLE observations AS
        SELECT *
        FROM read_csv_auto(?, union_by_name = true);
    '''
    with duckdb.connect(db_path) as con:
        con.execute(query, [datafiles])
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
|
|
72
89
|
class dataManager():
|
|
73
90
|
|
|
74
91
|
def __init__(self, folderpath):
    '''
    Record the working folder and start with an empty data cache.

    Parameters
    ----------
    folderpath : str or Path
        Folder used by this manager; the DuckDB file path is derived from it.
    '''
    root = Path(folderpath)

    self.data = {}
    self.folderpath = root
    # Same file name that construct_database() writes into the folder.
    self.db_path = root / 'observations.duckdb'
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def constituent_summary(self, constituents=None):
    '''
    Summarise the observations table per constituent, station and source.

    Parameters
    ----------
    constituents : str, iterable of str, or None
        Constituent code(s) to include. ``None`` (default) summarises every
        non-null constituent present in the table. A single string is
        treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        One row per (constituent, station_id, source) with ``sample_count``
        and the years of the first and last observation (``start_date``,
        ``end_date``), ordered by constituent then sample_count.
    '''
    # UNNEST(?) needs a list parameter: wrap a bare string and materialise
    # any other iterable (tuple, set, Series, ...).
    if isinstance(constituents, str):
        constituents = [constituents]
    elif constituents is not None:
        constituents = list(constituents)

    # Connect with a posix string, as construct_database() does, so the call
    # does not depend on the installed duckdb accepting Path objects.
    with duckdb.connect(Path(self.db_path).as_posix()) as con:
        if constituents is None:
            # NULLs can never match the IN filter below, so leave them out
            # of the parameter list instead of binding NaN/None values.
            constituents = con.query('''
                SELECT DISTINCT
                    constituent
                FROM observations
                WHERE constituent IS NOT NULL''').to_df()['constituent'].to_list()

        query = '''
            SELECT
                station_id,
                source,
                constituent,
                COUNT(*) AS sample_count,
                year(MIN(datetime)) AS start_date,
                year(MAX(datetime)) AS end_date
            FROM
                observations
            WHERE
                constituent in (SELECT UNNEST(?))
            GROUP BY
                constituent,station_id,source
            ORDER BY
                constituent,sample_count;'''

        return con.execute(query, [constituents]).fetch_df()
|
|
78
125
|
|
|
79
126
|
def get_wiski_stations(self):
|
|
80
127
|
return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
|
|
@@ -108,6 +155,17 @@ class dataManager():
|
|
|
108
155
|
return []
|
|
109
156
|
else:
|
|
110
157
|
return wiski_ids
|
|
158
|
+
|
|
159
|
+
def equis_wiski_alias(self, equis_station_id):
    '''
    Look up the single WISKI station number recorded for an EQuIS station.

    Parameters
    ----------
    equis_station_id
        Value matched against the ``WISKI_EQUIS_ID`` column of
        ``WISKI_EQUIS_XREF``.

    Returns
    -------
    The matching ``WISKI_STATION_NO`` value when exactly one distinct,
    non-null value exists; an empty list when there is none.

    Raises
    ------
    RuntimeError
        If more than one distinct WISKI station matches.
    '''
    matches = WISKI_EQUIS_XREF.loc[
        WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,
        'WISKI_STATION_NO']
    # De-duplicate, then discard missing station numbers.
    wiski_ids = [wiski_id for wiski_id in set(matches.to_list())
                 if not pd.isna(wiski_id)]

    if not wiski_ids:
        return []
    if len(wiski_ids) > 1:
        # A bare `raise` here had no active exception to re-raise and only
        # produced "No active exception to reraise"; the exception type is
        # unchanged (RuntimeError) but now carries the real reason.
        raise RuntimeError(f'Too Many WISKI Stations for {equis_station_id}')
    return wiski_ids[0]
|
|
111
169
|
|
|
112
170
|
def _equis_wiski_associations(self,equis_station_ids):
|
|
113
171
|
wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
|
|
@@ -115,6 +173,25 @@ class dataManager():
|
|
|
115
173
|
return wiski_stations[0]
|
|
116
174
|
else:
|
|
117
175
|
return []
|
|
176
|
+
|
|
177
|
+
def _stations_by_wid(self, wid_no, station_origin):
    '''
    List the station ids cross-referenced to a WID for a given data origin.

    Parameters
    ----------
    wid_no
        Value matched against the ``WID`` column of ``WISKI_EQUIS_XREF``.
    station_origin : str
        'wiski' or 'wplmn' select the ``WISKI_STATION_NO`` column;
        'equis' or 'swd' select the ``EQUIS_STATION_ID`` column.

    Returns
    -------
    list
        Distinct, non-null station ids (empty when nothing matches).

    Raises
    ------
    ValueError
        If ``station_origin`` is not one of the four recognised origins.
    '''
    if station_origin in ['wiski','wplmn']:
        station_col = 'WISKI_STATION_NO'
    elif station_origin in ['equis','swd']:
        # NOTE(review): equis_wiski_alias filters on 'WISKI_EQUIS_ID' while
        # this reads 'EQUIS_STATION_ID' - confirm both columns exist.
        station_col = 'EQUIS_STATION_ID'
    else:
        # Previously a bare `raise`, which fails with an unrelated
        # "No active exception to reraise" message.
        raise ValueError(
            f"Unknown station_origin {station_origin!r}; "
            "expected one of 'wiski', 'wplmn', 'equis', 'swd'")

    stations = WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no, station_col]
    # Rows without a station in this column would otherwise yield NaN ids.
    return list(stations.dropna().unique())
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def download_stations_by_wid(self, wid_no, station_origin, folderpath=None, overwrite=False):
    '''
    Download data for every station cross-referenced to a WID.

    Parameters
    ----------
    wid_no
        WID passed to ``_stations_by_wid``.
    station_origin : str
        Data origin passed to ``_stations_by_wid`` and to each download.
    folderpath : optional
        Forwarded unchanged to ``download_station_data``.
    overwrite : bool, optional
        Forwarded unchanged to ``download_station_data``.

    Returns
    -------
    None
    '''
    # The lookup method is `_stations_by_wid` (plural) and it returns a
    # plain list of ids, not a DataFrame, so iterate it directly.
    station_ids = self._stations_by_wid(wid_no, station_origin)

    for station_id in station_ids:
        if pd.isna(station_id):
            # A missing id in the cross-reference cannot be downloaded.
            continue
        # NOTE(review): download_station_data is defined outside this view;
        # the argument order is kept exactly as the original call had it.
        self.download_station_data(station_id, station_origin, folderpath, overwrite)
|
|
118
195
|
|
|
119
196
|
def _download_station_data(self,station_id,station_origin,overwrite=False):
|
|
120
197
|
assert(station_origin in ['wiski','equis','swd','wplmn'])
|
|
@@ -232,7 +309,7 @@ class dataManager():
|
|
|
232
309
|
def get_data(self, station_id, constituent, agg_period='D'):
    '''
    Single-station convenience wrapper around ``_get_data``.

    The station id is wrapped in a one-element list and passed on together
    with ``constituent`` and ``agg_period``; whatever ``_get_data`` returns
    is returned unchanged.
    '''
    station_ids = [station_id]
    return self._get_data(station_ids, constituent, agg_period)
|
|
234
311
|
|
|
235
|
-
def _get_data(self,station_ids,constituent,agg_period = 'D'):
|
|
312
|
+
def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
|
|
236
313
|
'''
|
|
237
314
|
|
|
238
315
|
Returns the processed observational data associated with the calibration specific id.
|
|
@@ -287,7 +364,10 @@ class dataManager():
|
|
|
287
364
|
df['data_format'] = dfsub['data_format'].iloc[0]
|
|
288
365
|
df['source'] = dfsub['source'].iloc[0]
|
|
289
366
|
|
|
290
|
-
|
|
367
|
+
|
|
368
|
+
# convert to desired timezone before stripping timezone information.
|
|
369
|
+
#df.index.tz_convert('UTC-06:00').tz_localize(None)
|
|
370
|
+
df.index = df.index.tz_localize(None)
|
|
291
371
|
return df['value'].to_frame().dropna()
|
|
292
372
|
|
|
293
373
|
|
mpcaHydro/etlCSG.py
CHANGED
|
@@ -6,6 +6,9 @@ Created on Tue Oct 10 14:13:23 2023
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
|
+
import requests
|
|
10
|
+
import zipfile
|
|
11
|
+
import io
|
|
9
12
|
# import geopandas as gpd
|
|
10
13
|
|
|
11
14
|
|
|
@@ -14,20 +17,30 @@ CONSITUENT_MAP = {'Water Temp. (C)': 'WT',
|
|
|
14
17
|
'DO (mg/L)': 'DO'
|
|
15
18
|
}
|
|
16
19
|
|
|
17
|
-
def download(station_no):
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
station = station_no[1:]
|
|
22
|
-
df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
|
|
23
|
-
df['station_id'] = station_no
|
|
20
|
+
# def download(station_no):
|
|
21
|
+
# # save_path = Path(save_path)
|
|
22
|
+
# # file_path = save_path.joinpath('csg.csv')
|
|
24
23
|
|
|
25
|
-
|
|
24
|
+
# station = station_no[1:]
|
|
25
|
+
# df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
|
|
26
|
+
# df = pd.read_csv(f'https://apps.dnr.state.mn.us/csg/api/v1/download?callback=json&ids=66050001&vars=262')
|
|
27
|
+
# df['station_id'] = station_no
|
|
26
28
|
|
|
29
|
+
# return df
|
|
27
30
|
|
|
31
|
+
def download(station_no, timeout=60):
    '''
    Download one station's data from the MN DNR CSG download API.

    Parameters
    ----------
    station_no : str
        Station identifier. The first character is dropped before the id is
        sent to the API (presumably a letter prefix - confirm with callers).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60), so a
        stalled connection cannot hang the caller forever.

    Returns
    -------
    pandas.DataFrame
        Contents of the first file in the returned zip archive, with a
        ``station_id`` column set to the original ``station_no``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    station = station_no[1:]
    # NOTE(review): vars=262 is hard-coded; its meaning is not visible here.
    url = f'https://apps.dnr.state.mn.us/csg/api/v1/download?ids={station}&vars=262'
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here rather than letting an error page surface as
    # a confusing BadZipFile below.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        # Read the first archive member and close its handle afterwards.
        with zip_ref.open(zip_ref.namelist()[0]) as csv_file:
            df = pd.read_csv(csv_file)

    df['station_id'] = station_no
    return df
|
|
40
|
+
|
|
28
41
|
|
|
29
42
|
# def process(df):
|
|
30
|
-
#
|
|
43
|
+
#
|
|
31
44
|
# df.set_index('Timestamp',inplace=True)
|
|
32
45
|
# value_variables = [column for column in df.columns if (column not in ['Site','Timestamp','station_no']) & ~(column.endswith('Quality'))]
|
|
33
46
|
|
|
@@ -35,45 +48,24 @@ def download(station_no):
|
|
|
35
48
|
# df = df['Value'].resample(rule='1H', kind='interval').mean().to_frame()
|
|
36
49
|
|
|
37
50
|
def transform(data):
    '''
    Normalise a downloaded CSG table in place and return it.

    Steps, in order:
      * rename ``tstamp`` -> ``datetime``, ``var_name`` -> ``variable`` and
        ``station_no`` -> ``station_id``;
      * add ``unit`` and ``constituent`` columns mapped from ``variable``
        (variables outside the three known names map to NaN);
      * parse ``datetime``, make it the index and localise it to the fixed
        offset ``UTC-06:00``;
      * drop rows whose ``value`` is missing;
      * tag every row with ``source = 'csg'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``tstamp``, ``var_name`` and ``value`` columns (or the
        already-renamed equivalents). It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object after transformation.
    '''
    data.rename(columns={'tstamp': 'datetime',
                         'var_name': 'variable',
                         'station_no': 'station_id'}, inplace=True)

    data['unit'] = data['variable'].map({'Water Temp. (C)' : 'C',
                                         'Discharge (cfs)' : 'cfs',
                                         'DO (mg/L)' : 'mg/L'})

    data['constituent'] = data['variable'].map({'Water Temp. (C)' : 'WT',
                                                'Discharge (cfs)' : 'Q',
                                                'DO (mg/L)' : 'DO'})

    data['datetime'] = pd.to_datetime(data['datetime'])
    data.set_index('datetime', drop=True, inplace=True)
    data.index = data.index.tz_localize('UTC-06:00')
    # A list works on every pandas version; a bare label for `subset` is
    # only accepted from pandas 1.5 onward.
    data.dropna(subset=['value'], inplace=True)
    data['source'] = 'csg'
    return data
|
|
77
69
|
|
|
78
70
|
|
|
79
71
|
|
mpcaHydro/etlSWD.py
CHANGED
|
@@ -14,16 +14,25 @@ import pandas as pd
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
CONSTITUENT_MAP = {'Total suspended solids':'TSS',
|
|
17
|
+
'Total solids': 'TSS',
|
|
18
|
+
'Solids, Suspended' : 'TSS',
|
|
19
|
+
'Solids, Total Suspended' : 'TSS',
|
|
17
20
|
'Residue - nonfilterable (TSS)': 'TSS',
|
|
18
21
|
'Kjeldahl nitrogen as N': 'TKN',
|
|
22
|
+
'Inorganic nitrogen (nitrate and nitrate) as N': 'N',
|
|
19
23
|
'Nitrogen, Total Kjeldahl (TKN) as N': 'TKN',
|
|
20
24
|
'Nitrate + Nitrite Nitrogen, Total as N': 'N',
|
|
21
25
|
'Nitrate/Nitrite as N (N+N) as N': 'N',
|
|
22
26
|
'Nutrient-nitrogen as N': 'N',
|
|
27
|
+
'Nitrate/Nitrite as N': 'N',
|
|
23
28
|
'Phosphorus, Total as P as P':'TP',
|
|
29
|
+
'Phosphorus, Total as P' : 'TP',
|
|
24
30
|
'Phosphorus as P': 'TP',
|
|
31
|
+
'Total Phosphorus as P': 'TP',
|
|
32
|
+
'Orthophosphate as P': 'OP',
|
|
25
33
|
'Carbonaceous biochemical oxygen demand, standard conditions': 'BOD',
|
|
26
34
|
'Chemical oxygen demand':'BOD',
|
|
35
|
+
'Biochemical oxygen demand, standard conditions': 'BOD',
|
|
27
36
|
'Chlorophyll a, corrected for pheophytin':'CHLA',
|
|
28
37
|
'Chlorophyll-A':'CHLA',
|
|
29
38
|
'Chlorophyll-a, Pheophytin Corrected':'CHLA',
|
|
@@ -145,7 +154,7 @@ def transform(df):
|
|
|
145
154
|
df.set_index('datetime',drop=True,inplace=True)
|
|
146
155
|
df.index = df.index.tz_localize('UTC-06:00')
|
|
147
156
|
|
|
148
|
-
df.index = df.index.round('
|
|
157
|
+
df.index = df.index.round('h').round('h')
|
|
149
158
|
df = df.reset_index()
|
|
150
159
|
df = df.groupby(['datetime','variable','unit','station_id','station_name','constituent','data_format','data_type','source']).mean()
|
|
151
160
|
df = df.reset_index()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mpcaHydro
|
|
3
|
-
Version: 2.0.
|
|
3
|
+
Version: 2.0.2
|
|
4
4
|
Summary: Python package for downloading MPCA hydrology data
|
|
5
5
|
Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
|
|
6
6
|
Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
|
|
@@ -10,6 +10,7 @@ Keywords: Hydrology,MPCA
|
|
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
|
11
11
|
Classifier: Programming Language :: Python
|
|
12
12
|
Requires-Python: >=3.8
|
|
13
|
+
Requires-Dist: duckdb
|
|
13
14
|
Requires-Dist: pandas
|
|
14
15
|
Requires-Dist: pathlib
|
|
15
16
|
Requires-Dist: requests
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
mpcaHydro/WISKI.py,sha256=yqsljbx8TlFA8HIrXFGs5meO0RcTis5Px3__UUzrtiI,13303
|
|
2
2
|
mpcaHydro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
mpcaHydro/data_manager.py,sha256=
|
|
4
|
-
mpcaHydro/etlCSG.py,sha256=
|
|
5
|
-
mpcaHydro/etlSWD.py,sha256=
|
|
3
|
+
mpcaHydro/data_manager.py,sha256=NOB0fP-K40sBDdHOgwd_0FbBIGyz3kgIVCGKouIDd9A,14766
|
|
4
|
+
mpcaHydro/etlCSG.py,sha256=5QT6V2dHvNKC9r5-dspt-NpOmECP2LFw1Lyq1zdkqps,2630
|
|
5
|
+
mpcaHydro/etlSWD.py,sha256=FnpFv-LjK2zAvI2-wrN_4YaS70bI1AGi-aX5lEevkrc,7509
|
|
6
6
|
mpcaHydro/etlWISKI.py,sha256=6I1uTJfM-yL_hY0q-X0JKFqz9DVDaFR7wt4ssmjbcEU,19645
|
|
7
7
|
mpcaHydro/etlWPLMN.py,sha256=b44xvx4s7lwXhpRtfR6rj7RnBpbVKXaYqZCr26BexUI,4160
|
|
8
8
|
mpcaHydro/data/WISKI_EQUIS_XREF.csv,sha256=bPYq-f4-Qc6jsvUgl81lwXBeFamfDe5TjohqUV1XJlg,1244704
|
|
9
|
-
mpcahydro-2.0.
|
|
10
|
-
mpcahydro-2.0.
|
|
11
|
-
mpcahydro-2.0.
|
|
9
|
+
mpcahydro-2.0.2.dist-info/METADATA,sha256=YqLg7UMGbK1kYMXujTxAmgo7amEn3i7o7Ao_0x9ho30,543
|
|
10
|
+
mpcahydro-2.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
11
|
+
mpcahydro-2.0.2.dist-info/RECORD,,
|
|
File without changes
|