mpcaHydro 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/data_manager.py CHANGED
@@ -9,7 +9,7 @@ import pandas as pd
9
9
  #from abc import abstractmethod
10
10
  from pathlib import Path
11
11
  from mpcaHydro import etlWISKI, etlSWD#, etlEQUIS
12
-
12
+ import duckdb
13
13
 
14
14
  #
15
15
  '''
@@ -69,12 +69,59 @@ def are_lists_identical(nested_list):
69
69
  # Compare all sublists to the first one
70
70
  return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
71
71
 
72
def construct_database(folderpath):
    '''
    Build ``observations.duckdb`` from the CSV files in *folderpath*.

    Every file matching ``*.csv`` directly inside *folderpath* is read with
    DuckDB's ``read_csv_auto`` (``union_by_name = true``) and loaded into a
    single ``observations`` table. A table of that name already present in
    the database is replaced.

    Parameters
    ----------
    folderpath : str or pathlib.Path
        Folder that holds the CSV files. The database file
        ``observations.duckdb`` is created in the same folder.
    '''
    folderpath = Path(folderpath)
    db_path = folderpath.joinpath('observations.duckdb').as_posix()
    datafiles = folderpath.joinpath('*.csv').as_posix()

    # A single CREATE OR REPLACE statement instead of DROP + CREATE: if the
    # CSV load fails, the previously built table is not left dropped.
    query = '''
        CREATE OR REPLACE TABLE observations AS
        SELECT *
        FROM read_csv_auto(?, union_by_name = true);
    '''
    with duckdb.connect(db_path) as con:
        con.execute(query, [datafiles])
86
+
87
+
88
+
72
89
  class dataManager():
73
90
 
74
91
  def __init__(self,folderpath):
75
92
 
76
93
  self.data = {}
77
94
  self.folderpath = Path(folderpath)
95
+ self.db_path = self.folderpath.joinpath('observations.duckdb')
96
+
97
+
98
def constituent_summary(self, constituents=None):
    '''
    Summarise the ``observations`` table per station, source and constituent.

    Parameters
    ----------
    constituents : str or iterable of str, optional
        Constituent name(s) to include. A single name may be given as a bare
        string. When None (default), every distinct constituent found in the
        ``observations`` table is used.

    Returns
    -------
    pandas.DataFrame
        One row per (constituent, station_id, source) with columns
        ``station_id``, ``source``, ``constituent``, ``sample_count``,
        ``start_date`` and ``end_date``. ``start_date`` / ``end_date`` hold
        the *year* of the earliest / latest ``datetime`` value. Rows are
        ordered by constituent, then by sample_count.
    '''
    # UNNEST(?) needs a list parameter: wrap a lone string and materialise
    # any other iterable (tuple, set, Series, ...).
    if isinstance(constituents, str):
        constituents = [constituents]
    elif constituents is not None:
        constituents = list(constituents)

    # Hand DuckDB a plain string path, as construct_database does.
    db_path = Path(self.db_path).as_posix()

    with duckdb.connect(db_path) as con:
        if constituents is None:
            constituents = con.query('''
                SELECT DISTINCT
                    constituent
                FROM observations''').to_df()['constituent'].to_list()

        query = '''
            SELECT
                station_id,
                source,
                constituent,
                COUNT(*) AS sample_count,
                year(MIN(datetime)) AS start_date,
                year(MAX(datetime)) AS end_date
            FROM
                observations
            WHERE
                constituent in (SELECT UNNEST(?))
            GROUP BY
                constituent,station_id,source
            ORDER BY
                constituent,sample_count;'''

        df = con.execute(query, [constituents]).fetch_df()
    return df
78
125
 
79
126
  def get_wiski_stations(self):
80
127
  return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
@@ -108,6 +155,17 @@ class dataManager():
108
155
  return []
109
156
  else:
110
157
  return wiski_ids
158
+
159
def equis_wiski_alias(self, equis_station_id):
    '''
    Return the single WISKI station number cross-referenced to an EQuIS id.

    Looks up rows of ``WISKI_EQUIS_XREF`` whose ``WISKI_EQUIS_ID`` equals
    *equis_station_id* and collects their distinct, non-missing
    ``WISKI_STATION_NO`` values.

    Returns
    -------
    list or scalar
        An empty list when no WISKI station is cross-referenced; otherwise
        the one matching station number itself (not wrapped in a list).

    Raises
    ------
    RuntimeError
        If more than one distinct WISKI station matches.
    '''
    matches = WISKI_EQUIS_XREF.loc[
        WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,
        'WISKI_STATION_NO',
    ]
    # dropna + drop_duplicates replaces the set()/isna filtering and keeps
    # the cross-reference order.
    wiski_ids = matches.dropna().drop_duplicates().to_list()

    if not wiski_ids:
        return []
    if len(wiski_ids) > 1:
        # A bare `raise` with no active exception also ends in RuntimeError,
        # but without saying why; keep the type and carry the message.
        raise RuntimeError(f'Too Many WISKI Stations for {equis_station_id}')
    return wiski_ids[0]
111
169
 
112
170
  def _equis_wiski_associations(self,equis_station_ids):
113
171
  wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
@@ -115,6 +173,25 @@ class dataManager():
115
173
  return wiski_stations[0]
116
174
  else:
117
175
  return []
176
+
177
def _stations_by_wid(self, wid_no, station_origin):
    '''
    List the station ids cross-referenced to a WID in ``WISKI_EQUIS_XREF``.

    Parameters
    ----------
    wid_no
        Value matched against the ``WID`` column.
    station_origin : str
        ``'wiski'`` or ``'wplmn'`` selects the ``WISKI_STATION_NO`` column;
        ``'equis'`` or ``'swd'`` selects ``EQUIS_STATION_ID``.

    Returns
    -------
    list
        Distinct, non-missing station ids for *wid_no* (possibly empty).

    Raises
    ------
    RuntimeError
        If *station_origin* is not one of the four recognised values.
    '''
    if station_origin in ('wiski', 'wplmn'):
        station_col = 'WISKI_STATION_NO'
    elif station_origin in ('equis', 'swd'):
        station_col = 'EQUIS_STATION_ID'
    else:
        # A bare `raise` here could only ever produce "No active exception
        # to reraise"; same exception type, but now with a usable message.
        raise RuntimeError(
            f"Unknown station_origin {station_origin!r}; "
            "expected 'wiski', 'wplmn', 'equis' or 'swd'"
        )

    stations = WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no, station_col]
    # Rows without a station number are not usable ids; drop them as the
    # equis/wiski lookup methods of this class do.
    return list(stations.dropna().unique())
186
+
187
+
188
def download_stations_by_wid(self, wid_no, station_origin, folderpath=None, overwrite=False):
    '''
    Download data for every station cross-referenced to a WID.

    Parameters
    ----------
    wid_no
        WID passed to ``_stations_by_wid``.
    station_origin : str
        Data source of the stations, passed through to ``_stations_by_wid``
        and to ``download_station_data``.
    folderpath : optional
        Passed through unchanged to ``download_station_data``.
    overwrite : bool, optional
        Passed through unchanged to ``download_station_data``.

    Nothing is downloaded when no station matches *wid_no*.
    '''
    # `_stations_by_wid` returns a plain list of station ids, so iterate it
    # directly (the earlier code called a misspelled method name and treated
    # the result as a DataFrame).
    station_ids = self._stations_by_wid(wid_no, station_origin)

    for station_id in station_ids:
        # NOTE(review): download_station_data is defined outside this view;
        # the positional argument order is kept exactly as before - confirm.
        self.download_station_data(station_id, station_origin, folderpath, overwrite)
118
195
 
119
196
  def _download_station_data(self,station_id,station_origin,overwrite=False):
120
197
  assert(station_origin in ['wiski','equis','swd','wplmn'])
@@ -232,7 +309,7 @@ class dataManager():
232
309
  def get_data(self,station_id,constituent,agg_period = 'D'):
233
310
  return self._get_data([station_id],constituent,agg_period)
234
311
 
235
- def _get_data(self,station_ids,constituent,agg_period = 'D'):
312
+ def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
236
313
  '''
237
314
 
238
315
  Returns the processed observational data associated with the calibration specific id.
@@ -287,7 +364,10 @@ class dataManager():
287
364
  df['data_format'] = dfsub['data_format'].iloc[0]
288
365
  df['source'] = dfsub['source'].iloc[0]
289
366
 
290
-
367
+
368
+ # convert to desired timezone before stripping timezone information.
369
+ #df.index.tz_convert('UTC-06:00').tz_localize(None)
370
+ df.index = df.index.tz_localize(None)
291
371
  return df['value'].to_frame().dropna()
292
372
 
293
373
 
mpcaHydro/etlCSG.py CHANGED
@@ -6,6 +6,9 @@ Created on Tue Oct 10 14:13:23 2023
6
6
  """
7
7
 
8
8
  import pandas as pd
9
+ import requests
10
+ import zipfile
11
+ import io
9
12
  # import geopandas as gpd
10
13
 
11
14
 
@@ -14,20 +17,30 @@ CONSITUENT_MAP = {'Water Temp. (C)': 'WT',
14
17
  'DO (mg/L)': 'DO'
15
18
  }
16
19
 
17
- def download(station_no):
18
- # save_path = Path(save_path)
19
- # file_path = save_path.joinpath('csg.csv')
20
-
21
- station = station_no[1:]
22
- df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
23
- df['station_id'] = station_no
20
+ # def download(station_no):
21
+ # # save_path = Path(save_path)
22
+ # # file_path = save_path.joinpath('csg.csv')
24
23
 
25
- return df
24
+ # station = station_no[1:]
25
+ # df = pd.read_csv(f'https://maps2.dnr.state.mn.us/cgi-bin/csg.cgi?mode=dump_hydro_data_as_csv&site={station}&startdate=1996-1-1&enddate=2050-1-1')
26
+ # df = pd.read_csv(f'https://apps.dnr.state.mn.us/csg/api/v1/download?callback=json&ids=66050001&vars=262')
27
+ # df['station_id'] = station_no
26
28
 
29
+ # return df
27
30
 
31
def download(station_no, timeout=60):
    '''
    Download the CSG record for one station as a DataFrame.

    The request goes to the MN DNR CSG ``download`` endpoint with
    ``ids=<station_no without its first character>`` and ``vars=262``.
    The response body is opened as a zip archive and the first member of
    that archive is parsed as CSV.

    Parameters
    ----------
    station_no : str
        Station number; its first character is stripped before querying.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled server cannot hang
        the caller indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV with a ``station_id`` column set to *station_no*.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    station = station_no[1:]
    url = f'https://apps.dnr.state.mn.us/csg/api/v1/download?ids={station}&vars=262'
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here rather than as an opaque BadZipFile below.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        # Close the member handle too, not only the archive.
        with zip_ref.open(zip_ref.namelist()[0]) as csv_file:
            df = pd.read_csv(csv_file)
    df['station_id'] = station_no

    return df
40
+
28
41
 
29
42
  # def process(df):
30
- # df['Timestamp'] = pd.to_datetime(df['Timestamp'])
43
+ #
31
44
  # df.set_index('Timestamp',inplace=True)
32
45
  # value_variables = [column for column in df.columns if (column not in ['Site','Timestamp','station_no']) & ~(column.endswith('Quality'))]
33
46
 
@@ -35,45 +48,24 @@ def download(station_no):
35
48
  # df = df['Value'].resample(rule='1H', kind='interval').mean().to_frame()
36
49
 
37
50
def transform(data):
    '''
    Normalise a downloaded CSG table in place and return it.

    Steps, applied to *data* itself (the same object is returned):

    * rename ``tstamp`` -> ``datetime``, ``var_name`` -> ``variable`` and
      ``station_no`` -> ``station_id``;
    * add ``unit`` and ``constituent`` columns looked up from ``variable``
      (variables not in the lookup tables get NaN);
    * parse ``datetime``, make it the index and localize it to
      ``UTC-06:00``;
    * drop rows whose ``value`` is missing;
    * add ``source = 'csg'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``tstamp`` (or ``datetime``), ``var_name`` (or
        ``variable``) and ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        *data*, modified in place.
    '''
    unit_by_variable = {'Water Temp. (C)': 'C',
                        'Discharge (cfs)': 'cfs',
                        'DO (mg/L)': 'mg/L'}
    constituent_by_variable = {'Water Temp. (C)': 'WT',
                               'Discharge (cfs)': 'Q',
                               'DO (mg/L)': 'DO'}

    # NOTE(review): if the input already has a 'station_id' column as well
    # as 'station_no', this rename yields duplicate column names - confirm
    # against the download schema.
    data.rename(columns={'tstamp': 'datetime',
                         'var_name': 'variable',
                         'station_no': 'station_id'}, inplace=True)

    data['unit'] = data['variable'].map(unit_by_variable)
    data['constituent'] = data['variable'].map(constituent_by_variable)

    data['datetime'] = pd.to_datetime(data['datetime'])
    data.set_index('datetime', drop=True, inplace=True)
    data.index = data.index.tz_localize('UTC-06:00')

    # A list for `subset` works on every pandas release; a bare string is
    # only accepted by newer ones.
    data.dropna(subset=['value'], inplace=True)
    data['source'] = 'csg'
    return data
77
69
 
78
70
 
79
71
 
mpcaHydro/etlSWD.py CHANGED
@@ -14,16 +14,25 @@ import pandas as pd
14
14
 
15
15
 
16
16
  CONSTITUENT_MAP = {'Total suspended solids':'TSS',
17
+ 'Total solids': 'TSS',
18
+ 'Solids, Suspended' : 'TSS',
19
+ 'Solids, Total Suspended' : 'TSS',
17
20
  'Residue - nonfilterable (TSS)': 'TSS',
18
21
  'Kjeldahl nitrogen as N': 'TKN',
22
+ 'Inorganic nitrogen (nitrate and nitrate) as N': 'N',
19
23
  'Nitrogen, Total Kjeldahl (TKN) as N': 'TKN',
20
24
  'Nitrate + Nitrite Nitrogen, Total as N': 'N',
21
25
  'Nitrate/Nitrite as N (N+N) as N': 'N',
22
26
  'Nutrient-nitrogen as N': 'N',
27
+ 'Nitrate/Nitrite as N': 'N',
23
28
  'Phosphorus, Total as P as P':'TP',
29
+ 'Phosphorus, Total as P' : 'TP',
24
30
  'Phosphorus as P': 'TP',
31
+ 'Total Phosphorus as P': 'TP',
32
+ 'Orthophosphate as P': 'OP',
25
33
  'Carbonaceous biochemical oxygen demand, standard conditions': 'BOD',
26
34
  'Chemical oxygen demand':'BOD',
35
+ 'Biochemical oxygen demand, standard conditions': 'BOD',
27
36
  'Chlorophyll a, corrected for pheophytin':'CHLA',
28
37
  'Chlorophyll-A':'CHLA',
29
38
  'Chlorophyll-a, Pheophytin Corrected':'CHLA',
@@ -145,7 +154,7 @@ def transform(df):
145
154
  df.set_index('datetime',drop=True,inplace=True)
146
155
  df.index = df.index.tz_localize('UTC-06:00')
147
156
 
148
- df.index = df.index.round('H').round('H')
157
+ df.index = df.index.round('h').round('h')
149
158
  df = df.reset_index()
150
159
  df = df.groupby(['datetime','variable','unit','station_id','station_name','constituent','data_format','data_type','source']).mean()
151
160
  df = df.reset_index()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mpcaHydro
3
- Version: 2.0.1
3
+ Version: 2.0.2
4
4
  Summary: Python package for downloading MPCA hydrology data
5
5
  Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
6
6
  Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -10,6 +10,7 @@ Keywords: Hydrology,MPCA
10
10
  Classifier: Development Status :: 3 - Alpha
11
11
  Classifier: Programming Language :: Python
12
12
  Requires-Python: >=3.8
13
+ Requires-Dist: duckdb
13
14
  Requires-Dist: pandas
14
15
  Requires-Dist: pathlib
15
16
  Requires-Dist: requests
@@ -1,11 +1,11 @@
1
1
  mpcaHydro/WISKI.py,sha256=yqsljbx8TlFA8HIrXFGs5meO0RcTis5Px3__UUzrtiI,13303
2
2
  mpcaHydro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- mpcaHydro/data_manager.py,sha256=QAjWBwSAd2ziQ7KbujzedOnYy6YJiRXJ3-4imCpNwys,11682
4
- mpcaHydro/etlCSG.py,sha256=gPk6D2r0R0Okx-S0C9vLtRlmGzf9tExtVoJZrj8IA8U,2950
5
- mpcaHydro/etlSWD.py,sha256=rn71939arFQ08gSrRMKg1JbTBH_4GV4d0zBPp-opH18,7021
3
+ mpcaHydro/data_manager.py,sha256=NOB0fP-K40sBDdHOgwd_0FbBIGyz3kgIVCGKouIDd9A,14766
4
+ mpcaHydro/etlCSG.py,sha256=5QT6V2dHvNKC9r5-dspt-NpOmECP2LFw1Lyq1zdkqps,2630
5
+ mpcaHydro/etlSWD.py,sha256=FnpFv-LjK2zAvI2-wrN_4YaS70bI1AGi-aX5lEevkrc,7509
6
6
  mpcaHydro/etlWISKI.py,sha256=6I1uTJfM-yL_hY0q-X0JKFqz9DVDaFR7wt4ssmjbcEU,19645
7
7
  mpcaHydro/etlWPLMN.py,sha256=b44xvx4s7lwXhpRtfR6rj7RnBpbVKXaYqZCr26BexUI,4160
8
8
  mpcaHydro/data/WISKI_EQUIS_XREF.csv,sha256=bPYq-f4-Qc6jsvUgl81lwXBeFamfDe5TjohqUV1XJlg,1244704
9
- mpcahydro-2.0.1.dist-info/METADATA,sha256=hG1tAHrflPN5fBcHhxHVwG6C0lL6WEqeGrrVz-oLsx4,521
10
- mpcahydro-2.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
- mpcahydro-2.0.1.dist-info/RECORD,,
9
+ mpcahydro-2.0.2.dist-info/METADATA,sha256=YqLg7UMGbK1kYMXujTxAmgo7amEn3i7o7Ao_0x9ho30,543
10
+ mpcahydro-2.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ mpcahydro-2.0.2.dist-info/RECORD,,