mpcaHydro 2.0.1__py3-none-any.whl → 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/data_manager.py CHANGED
@@ -9,7 +9,7 @@ import pandas as pd
9
9
  #from abc import abstractmethod
10
10
  from pathlib import Path
11
11
  from mpcaHydro import etlWISKI, etlSWD#, etlEQUIS
12
-
12
+ import duckdb
13
13
 
14
14
  #
15
15
  '''
@@ -69,12 +69,82 @@ def are_lists_identical(nested_list):
69
69
  # Compare all sublists to the first one
70
70
  return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
71
71
 
72
def construct_database(folderpath):
    """Build (or rebuild) the DuckDB observations database for a folder.

    Drops any existing ``observations`` table in ``observations.duckdb``
    inside *folderpath*, then recreates it from every ``*.csv`` file found
    there, unioning columns by name across files.

    Parameters
    ----------
    folderpath : str or pathlib.Path
        Folder holding the station CSV exports; the database file is
        created alongside them.
    """
    folder = Path(folderpath)
    db_file = folder.joinpath('observations.duckdb').as_posix()
    csv_glob = folder.joinpath('*.csv').as_posix()
    create_sql = '''
    CREATE TABLE observations AS SELECT *
    FROM
        read_csv_auto(?,
                      union_by_name = true);

    '''
    with duckdb.connect(db_file) as con:
        con.execute("DROP TABLE IF EXISTS observations")
        # The glob is passed as a bound parameter; duckdb expands it itself.
        con.execute(create_sql, [csv_glob])
86
+
87
+
88
def constituent_summary(db_path):
    """Summarize the observations database by station, source and constituent.

    Parameters
    ----------
    db_path : str
        Path to an ``observations.duckdb`` file built by
        ``construct_database``.

    Returns
    -------
    pandas.DataFrame
        One row per (station_id, source, constituent) with ``sample_count``
        and the first/last observation years, ordered by sample count.
    """
    summary_sql = '''
    SELECT
        station_id,
        source,
        constituent,
        COUNT(*) AS sample_count,
        year(MIN(datetime)) AS start_date,
        year(MAX(datetime)) AS end_date
    FROM
        observations
    GROUP BY
        constituent, station_id,source
    ORDER BY
        sample_count;'''

    with duckdb.connect(db_path) as con:
        return con.execute(summary_sql).fetch_df()
107
+
108
+
72
109
  class dataManager():
73
110
 
74
111
def __init__(self, folderpath):
    """Set up a manager rooted at *folderpath*.

    Parameters
    ----------
    folderpath : str or pathlib.Path
        Working folder for downloaded station data; the DuckDB database
        lives here as ``observations.duckdb``.
    """
    # In-memory cache of loaded station data, keyed by the caller.
    self.data = {}
    self.folderpath = Path(folderpath)
    # NOTE(review): stored as a Path, but construct_database() uses
    # .as_posix() before duckdb.connect — confirm the installed duckdb
    # version accepts os.PathLike here.
    self.db_path = self.folderpath.joinpath('observations.duckdb')
116
+
117
def _reconstruct_database(self):
    """Rebuild the observations database from the CSVs in ``self.folderpath``."""
    construct_database(self.folderpath)
119
+
120
+
121
def constituent_summary(self, constituents=None):
    """Summarize sample counts per station/source for selected constituents.

    Parameters
    ----------
    constituents : list of str, optional
        Constituent codes to summarize. When None, every distinct
        constituent present in the observations table is included.

    Returns
    -------
    pandas.DataFrame
        (station_id, source, constituent) rows with ``sample_count`` and
        first/last observation years, ordered by constituent then count.
    """
    summary_sql = '''
    SELECT
        station_id,
        source,
        constituent,
        COUNT(*) AS sample_count,
        year(MIN(datetime)) AS start_date,
        year(MAX(datetime)) AS end_date
    FROM
        observations
    WHERE
        constituent in (SELECT UNNEST(?))
    GROUP BY
        constituent,station_id,source
    ORDER BY
        constituent,sample_count;'''

    with duckdb.connect(self.db_path) as con:
        if constituents is None:
            # Default to everything currently in the table.
            distinct = con.query('''
            SELECT DISTINCT
                constituent
            FROM observations''').to_df()
            constituents = distinct['constituent'].to_list()

        # The list binds to UNNEST(?) so the filter stays parameterized.
        return con.execute(summary_sql, [constituents]).fetch_df()
78
148
 
79
149
  def get_wiski_stations(self):
80
150
  return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
@@ -108,6 +178,17 @@ class dataManager():
108
178
  return []
109
179
  else:
110
180
  return wiski_ids
181
+
182
def equis_wiski_alias(self, equis_station_id):
    """Return the single WISKI station number aliased to an EQUIS station.

    Parameters
    ----------
    equis_station_id : str
        EQUIS station identifier to look up in the cross-reference table.

    Returns
    -------
    str or list
        The WISKI station number, or an empty list when no alias exists
        (kept for backward compatibility with callers that iterate the
        result).

    Raises
    ------
    ValueError
        If more than one WISKI station maps to the EQUIS id — the alias
        is expected to be unique.
    """
    mask = WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id
    candidates = set(WISKI_EQUIS_XREF.loc[mask, 'WISKI_STATION_NO'].to_list())
    # The xref can carry NaN station numbers; drop them before counting.
    wiski_ids = [wiski_id for wiski_id in candidates if not pd.isna(wiski_id)]
    if not wiski_ids:
        return []
    if len(wiski_ids) > 1:
        # Was print(...) followed by a bare `raise`, which (with no active
        # exception) raised an uninformative RuntimeError and lost the
        # message; raise a descriptive error instead.
        raise ValueError(f'Too Many WISKI Stations for {equis_station_id}')
    return wiski_ids[0]
111
192
 
112
193
  def _equis_wiski_associations(self,equis_station_ids):
113
194
  wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
@@ -115,6 +196,25 @@ class dataManager():
115
196
  return wiski_stations[0]
116
197
  else:
117
198
  return []
199
+
200
def _stations_by_wid(self, wid_no, station_origin):
    """Return the unique station ids associated with a WID.

    Parameters
    ----------
    wid_no : value matching the xref ``WID`` column
        Watershed/waterbody identifier to look up.
    station_origin : str
        One of 'wiski'/'wplmn' (WISKI station numbers) or 'equis'/'swd'
        (EQUIS station ids); selects which xref column to return.

    Returns
    -------
    list
        Unique station identifiers from the selected column.

    Raises
    ------
    ValueError
        For an unrecognized *station_origin*.
    """
    if station_origin in ['wiski', 'wplmn']:
        station_col = 'WISKI_STATION_NO'
    elif station_origin in ['equis', 'swd']:
        station_col = 'EQUIS_STATION_ID'
    else:
        # Was a bare `raise` (no active exception → opaque RuntimeError);
        # raise a descriptive error instead.
        raise ValueError(f'Unknown station_origin: {station_origin!r}')

    return list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no, station_col].unique())
209
+
210
+
211
def download_stations_by_wid(self, wid_no, station_origin, folderpath=None, overwrite=False):
    """Download data for every station associated with a WID.

    Looks up the stations for *wid_no* in the cross-reference table and
    downloads each one via ``download_station_data``.

    Bug fixes vs. the original:
    - it called ``self._station_by_wid`` (missing 's'), a method that does
      not exist, so every call raised AttributeError;
    - it treated the returned *list* as a DataFrame (``.empty``,
      ``.iterrows()``, ``row['station_id']``), which would also fail.

    Parameters
    ----------
    wid_no : value matching the xref ``WID`` column
    station_origin : str
        'wiski', 'wplmn', 'equis' or 'swd' (see ``_stations_by_wid``).
    folderpath : str or pathlib.Path, optional
        Passed through to ``download_station_data``.
    overwrite : bool
        Passed through to ``download_station_data``.
    """
    station_ids = self._stations_by_wid(wid_no, station_origin)

    for station_id in station_ids:
        # The xref column can contain NaN entries; skip them (consistent
        # with the NaN filtering in equis_wiski_alias).
        if pd.isna(station_id):
            continue
        self.download_station_data(station_id, station_origin, folderpath, overwrite)
118
218
 
119
219
  def _download_station_data(self,station_id,station_origin,overwrite=False):
120
220
  assert(station_origin in ['wiski','equis','swd','wplmn'])
@@ -232,7 +332,7 @@ class dataManager():
232
332
def get_data(self, station_id, constituent, agg_period='D'):
    """Processed observations for one station; delegates to ``_get_data``
    with a single-element station list."""
    return self._get_data([station_id], constituent, agg_period=agg_period)
234
334
 
235
- def _get_data(self,station_ids,constituent,agg_period = 'D'):
335
+ def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
236
336
  '''
237
337
 
238
338
  Returns the processed observational data associated with the calibration specific id.
@@ -287,7 +387,10 @@ class dataManager():
287
387
  df['data_format'] = dfsub['data_format'].iloc[0]
288
388
  df['source'] = dfsub['source'].iloc[0]
289
389
 
290
-
390
+
391
+ # convert to desired timzone before stripping timezone information.
392
+ #df.index.tz_convert('UTC-06:00').tz_localize(None)
393
+ df.index = df.index.tz_localize(None)
291
394
  return df['value'].to_frame().dropna()
292
395
 
293
396
 
mpcaHydro/etlCSG.py CHANGED
@@ -6,6 +6,9 @@ Created on Tue Oct 10 14:13:23 2023
6
6
  """
7
7
 
8
8
  import pandas as pd
9
+ import requests
10
+ import zipfile
11
+ import io
9
12
  # import geopandas as gpd
10
13
 
11
14
 
@@ -14,20 +17,30 @@ CONSITUENT_MAP = {'Water Temp. (C)': 'WT',
14
17
  'DO (mg/L)': 'DO'
15
18
  }
16
19
 
17
- def download(station_no):
18
- # save_path = Path(save_path)
19
- # file_path = save_path.joinpath('csg.csv')
20
-
21
- station = station_no[1:]
22
- df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
23
- df['station_id'] = station_no
20
+ # def download(station_no):
21
+ # # save_path = Path(save_path)
22
+ # # file_path = save_path.joinpath('csg.csv')
24
23
 
25
- return df
24
+ # station = station_no[1:]
25
+ # df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
26
+ # df = pd.read_csv(f'https://apps.dnr.state.mn.us/csg/api/v1/download?callback=json&ids=66050001&vars=262')
27
+ # df['station_id'] = station_no
26
28
 
29
+ # return df
27
30
 
31
def download(station_no):
    """Download discharge data for one cooperative stream gage (CSG).

    Fetches a zip archive from the MN DNR CSG API and reads the first
    member as CSV.

    Parameters
    ----------
    station_no : str
        Station id; its first character is a prefix that is stripped
        before querying the API.

    Returns
    -------
    pandas.DataFrame
        Raw API records with a ``station_id`` column added (the full,
        prefixed id).

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status.
    """
    station = station_no[1:]  # API wants the id without its leading prefix character
    url = f'https://apps.dnr.state.mn.us/csg/api/v1/download?ids={station}&vars=262'
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of handing zipfile an error page,
    # which would surface as a confusing BadZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        df = pd.read_csv(zip_ref.open(zip_ref.namelist()[0]))
    df['station_id'] = station_no

    return df
40
+
28
41
 
29
42
  # def process(df):
30
- # df['Timestamp'] = pd.to_datetime(df['Timestamp'])
43
+ #
31
44
  # df.set_index('Timestamp',inplace=True)
32
45
  # value_variables = [column for column in df.columns if (column not in ['Site','Timestamp','station_no']) & ~(column.endswith('Quality'))]
33
46
 
@@ -35,45 +48,24 @@ def download(station_no):
35
48
  # df = df['Value'].resample(rule='1H', kind='interval').mean().to_frame()
36
49
 
37
50
def transform(data):
    """Reshape a raw CSG download into the project's long observation format.

    Renames the API columns (``tstamp``/``var_name``/``station_no``), tags
    each row with a unit and a short constituent code, indexes by a
    UTC-06:00-localized datetime, drops rows without a value, and labels
    the source as ``'csg'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records from ``download``; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, transformed.
    """
    data.rename(columns={'tstamp': 'datetime',
                         'var_name': 'variable',
                         'station_no': 'station_id'}, inplace=True)

    unit_by_variable = {'Water Temp. (C)': 'C',
                        'Discharge (cfs)': 'cfs',
                        'DO (mg/L)': 'mg/L'}
    code_by_variable = {'Water Temp. (C)': 'WT',
                        'Discharge (cfs)': 'Q',
                        'DO (mg/L)': 'DO'}
    data['unit'] = data['variable'].map(unit_by_variable)
    data['constituent'] = data['variable'].map(code_by_variable)

    data['datetime'] = pd.to_datetime(data['datetime'])
    data.set_index('datetime', drop=True, inplace=True)
    # CSG timestamps are fixed-offset central standard time.
    data.index = data.index.tz_localize('UTC-06:00')
    data.dropna(subset='value', inplace=True)
    data['source'] = 'csg'
    return data
77
69
 
78
70
 
79
71
 
mpcaHydro/etlSWD.py CHANGED
@@ -14,16 +14,25 @@ import pandas as pd
14
14
 
15
15
 
16
16
  CONSTITUENT_MAP = {'Total suspended solids':'TSS',
17
+ 'Total solids': 'TSS',
18
+ 'Solids, Suspended' : 'TSS',
19
+ 'Solids, Total Suspended' : 'TSS',
17
20
  'Residue - nonfilterable (TSS)': 'TSS',
18
21
  'Kjeldahl nitrogen as N': 'TKN',
22
+ 'Inorganic nitrogen (nitrate and nitrate) as N': 'N',
19
23
  'Nitrogen, Total Kjeldahl (TKN) as N': 'TKN',
20
24
  'Nitrate + Nitrite Nitrogen, Total as N': 'N',
21
25
  'Nitrate/Nitrite as N (N+N) as N': 'N',
22
26
  'Nutrient-nitrogen as N': 'N',
27
+ 'Nitrate/Nitrite as N': 'N',
23
28
  'Phosphorus, Total as P as P':'TP',
29
+ 'Phosphorus, Total as P' : 'TP',
24
30
  'Phosphorus as P': 'TP',
31
+ 'Total Phosphorus as P': 'TP',
32
+ 'Orthophosphate as P': 'OP',
25
33
  'Carbonaceous biochemical oxygen demand, standard conditions': 'BOD',
26
34
  'Chemical oxygen demand':'BOD',
35
+ 'Biochemical oxygen demand, standard conditions': 'BOD',
27
36
  'Chlorophyll a, corrected for pheophytin':'CHLA',
28
37
  'Chlorophyll-A':'CHLA',
29
38
  'Chlorophyll-a, Pheophytin Corrected':'CHLA',
@@ -145,7 +154,7 @@ def transform(df):
145
154
  df.set_index('datetime',drop=True,inplace=True)
146
155
  df.index = df.index.tz_localize('UTC-06:00')
147
156
 
148
- df.index = df.index.round('H').round('H')
157
+ df.index = df.index.round('h').round('h')
149
158
  df = df.reset_index()
150
159
  df = df.groupby(['datetime','variable','unit','station_id','station_name','constituent','data_format','data_type','source']).mean()
151
160
  df = df.reset_index()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mpcaHydro
3
- Version: 2.0.1
3
+ Version: 2.0.3
4
4
  Summary: Python package for downloading MPCA hydrology data
5
5
  Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
6
6
  Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -10,6 +10,7 @@ Keywords: Hydrology,MPCA
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Programming Language :: Python
12
12
  Requires-Python: >=3.8
13
+ Requires-Dist: duckdb
13
14
  Requires-Dist: pandas
14
15
  Requires-Dist: pathlib
15
16
  Requires-Dist: requests
@@ -1,11 +1,11 @@
1
1
  mpcaHydro/WISKI.py,sha256=yqsljbx8TlFA8HIrXFGs5meO0RcTis5Px3__UUzrtiI,13303
2
2
  mpcaHydro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- mpcaHydro/data_manager.py,sha256=QAjWBwSAd2ziQ7KbujzedOnYy6YJiRXJ3-4imCpNwys,11682
4
- mpcaHydro/etlCSG.py,sha256=gPk6D2r0R0Okx-S0C9vLtRlmGzf9tExtVoJZrj8IA8U,2950
5
- mpcaHydro/etlSWD.py,sha256=rn71939arFQ08gSrRMKg1JbTBH_4GV4d0zBPp-opH18,7021
3
+ mpcaHydro/data_manager.py,sha256=UR4mE93eUUXXs74qnJCFstNt_z0yaX1IB8USD4-XkTc,15396
4
+ mpcaHydro/etlCSG.py,sha256=5QT6V2dHvNKC9r5-dspt-NpOmECP2LFw1Lyq1zdkqps,2630
5
+ mpcaHydro/etlSWD.py,sha256=FnpFv-LjK2zAvI2-wrN_4YaS70bI1AGi-aX5lEevkrc,7509
6
6
  mpcaHydro/etlWISKI.py,sha256=6I1uTJfM-yL_hY0q-X0JKFqz9DVDaFR7wt4ssmjbcEU,19645
7
7
  mpcaHydro/etlWPLMN.py,sha256=b44xvx4s7lwXhpRtfR6rj7RnBpbVKXaYqZCr26BexUI,4160
8
8
  mpcaHydro/data/WISKI_EQUIS_XREF.csv,sha256=bPYq-f4-Qc6jsvUgl81lwXBeFamfDe5TjohqUV1XJlg,1244704
9
- mpcahydro-2.0.1.dist-info/METADATA,sha256=hG1tAHrflPN5fBcHhxHVwG6C0lL6WEqeGrrVz-oLsx4,521
10
- mpcahydro-2.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
- mpcahydro-2.0.1.dist-info/RECORD,,
9
+ mpcahydro-2.0.3.dist-info/METADATA,sha256=FKpSp78k6axfes_kk4NL_-VdsyuSKeGRla3ZC5lxY8M,543
10
+ mpcahydro-2.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ mpcahydro-2.0.3.dist-info/RECORD,,