mpcaHydro 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/PKG-INFO +1 -1
  2. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/demo.py +68 -9
  3. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/pyproject.toml +1 -1
  4. mpcahydro-2.2.0/src/mpcaHydro/data/outlets.duckdb → mpcahydro-2.2.2/src/mpcaHydro/data/outlet.duckdb +0 -0
  5. mpcahydro-2.2.2/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
  6. mpcahydro-2.2.2/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
  7. mpcahydro-2.2.2/src/mpcaHydro/data/stations_wiski.gpkg-shm +0 -0
  8. mpcahydro-2.2.2/src/mpcaHydro/data/stations_wiski.gpkg-wal +0 -0
  9. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/data_manager.py +104 -61
  10. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/etlSWD.py +21 -15
  11. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/outlets.py +70 -74
  12. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/reports.py +1 -1
  13. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/warehouse.py +276 -146
  14. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/warehouseManager.py +8 -0
  15. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/wiski.py +57 -5
  16. mpcahydro-2.2.2/tests/integration/observations.duckdb +0 -0
  17. mpcahydro-2.2.2/tests/integration/test_dataManager.py +61 -0
  18. mpcahydro-2.2.2/tests/integration/test_warehouse.duckdb +0 -0
  19. mpcahydro-2.2.2/tests/integration/test_warehouse.py +113 -0
  20. mpcahydro-2.2.2/tests/unit/test_equis.py +19 -0
  21. mpcahydro-2.2.0/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
  22. mpcahydro-2.2.0/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
  23. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/.gitattributes +0 -0
  24. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/.gitignore +0 -0
  25. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/ERROR.FIL +0 -0
  26. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/README.md +0 -0
  27. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/__init__.py +0 -0
  28. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/data/EQUIS_PARAMETER_XREF.csv +0 -0
  29. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/data/WISKI_EQUIS_XREF.csv +0 -0
  30. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv +0 -0
  31. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/equis.py +0 -0
  32. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/etlCSG.py +0 -0
  33. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/etlWISKI.py +0 -0
  34. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/etlWPLMN.py +0 -0
  35. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/pywisk.py +0 -0
  36. {mpcahydro-2.2.0 → mpcahydro-2.2.2}/src/mpcaHydro/xref.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mpcaHydro
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: Python package for downloading MPCA hydrology data
5
5
  Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
6
6
  Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -6,7 +6,7 @@ import duckdb
6
6
  from mpcaHydro import equis, warehouse, wiski
7
7
  from hspf.hspfModel import hspfModel
8
8
  from hspf.uci import UCI
9
-
9
+ from mpcaHydro import etlSWD
10
10
 
11
11
 
12
12
  #%%
@@ -34,19 +34,78 @@ wiski_stations = outlets.wiski_stations(model_name)
34
34
  equis.connect('MFRATKI',password = 'DeltaT#MPCA3')
35
35
  warehouse.init_db(db_path,reset = True)
36
36
 
37
- #%%
38
37
 
39
- with warehouse.connect(db_path) as con:
40
- df = equis.download(equis_stations)
38
+ #%% Old approach. Store as indvidual processed station files then load to warehouse
39
+ #df_equis = equis.download(equis_stations)
40
+ #df_wiski = wiski.download(wiski_stations,start_year = start_year, end_year = end_year)
41
+
42
+ #%% equis
43
+
44
+
45
+
46
+
47
+ def download_equis_data(db_path,station_ids,replace = False):
48
+ with warehouse.connect(db_path,read_only = False) as con:
49
+ df = equis.download(station_ids)
50
+ if not df.empty:
51
+ warehouse.load_df_to_table(con,df, 'staging.equis',replace = replace)
52
+ warehouse.load_df_to_table(con,equis.transform(df), 'analytics.equis',replace = replace)
53
+ else:
54
+ print('No data neccesary for HSPF calibration available from equis for stations:',station_ids)
55
+
56
+ def download_wiski_data(db_path,station_ids,replace = False):
57
+ with warehouse.connect(db_path,read_only = False) as con:
58
+ df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
59
+ if not df.empty:
60
+ warehouse.load_df_to_table(con,df, 'staging.wiski', replace = replace)
61
+ warehouse.load_df_to_table(con,wiski.transform(df), 'analytics.wiski',replace = replace)
62
+ else:
63
+ print('No data neccesary for HSPF calibration available from wiski for stations:',station_ids)
64
+
65
+
66
+ # Add to warehouse from custom df. Must contain required normalized columns.
67
+ with warehouse.connect(db_path,read_only = False) as con:
68
+ if replace:
69
+ warehouse.drop_station_id(con,station_id,station_origin='equis')
70
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
71
+
72
+
41
73
  warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
42
- warehouse.load_df_to_analytics(con,equis.transform(df),'equis')
74
+ df = equis.normalize(df.copy())
75
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
76
+ df = equis.transform(df)
77
+ warehouse.add_to_table(con,df, 'analytics','equis')
78
+
43
79
 
44
- df = wiski.download(wiski_stations,start_year = start_year, end_year = end_year)
45
- warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = replace)
46
- warehouse.load_df_to_analytics(con,wiski.transform(df,filter_qc_codes = filter_qc_codes),'wiski') # method includes normalization
47
80
 
48
- outlets.build_outlets(con, model_name)
81
+ #%% swd
49
82
 
83
+ df = etlSWD.download(equis_stations)
84
+
85
+ with warehouse.connect(db_path,read_only = False) as con:
86
+ warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
87
+ df = equis.normalize(df.copy())
88
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
89
+ df = equis.transform(df)
90
+ warehouse.add_to_table(con,df, 'analytics','equis')
91
+ #%% wiski
92
+
93
+
94
+
95
+ if station_origin == 'wiski':
96
+ df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
97
+ warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = replace)
98
+ df = wiski.normalize(df.copy())
99
+ warehouse.add_to_table(con,df, 'staging','wiski_normalized')
100
+ df = wiski.transform(df,filter_qc_codes = filter_qc_codes)
101
+ warehouse.add_to_table(con,df, 'analytics','wiski') # method includes normalization
102
+
103
+ if station_origin == 'swd':
104
+ df = pd.concat([etlSWD.download(station_id) for station_id in station_ids])
105
+ warehouse.load_df_to_staging(con,df, 'equis_raw', replace = replace)
106
+ df = etlSWD.transform(df.copy())
107
+ warehouse.add_to_table(con,df, 'analytics','equis')
108
+ warehouse.update_views(con)
50
109
 
51
110
  with warehouse.connect(db_path) as con:
52
111
  warehouse.update_views(con)
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "mpcaHydro"
7
7
  urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" } # ? Add this!
8
- version = "2.2.0"
8
+ version = "2.2.2"
9
9
  dependencies = [
10
10
  "pandas",
11
11
  "requests",
@@ -5,7 +5,6 @@ Created on Fri Jun 3 10:01:14 2022
5
5
  @author: mfratki
6
6
  """
7
7
 
8
- import pandas as pd
9
8
  #from abc import abstractmethod
10
9
  from pathlib import Path
11
10
  from mpcaHydro import etlSWD
@@ -64,88 +63,115 @@ def constituent_summary(db_path):
64
63
  return res.fetch_df()
65
64
 
66
65
 
66
+
67
+
67
68
  class dataManager():
68
69
 
69
- def __init__(self,folderpath, oracle_user = None, oracle_password =None):
70
+ def __init__(self,folderpath, oracle_username = None, oracle_password =None, reset = False):
70
71
 
71
72
  self.data = {}
72
73
  self.folderpath = Path(folderpath)
73
74
  self.db_path = self.folderpath.joinpath('observations.duckdb')
74
-
75
- self.oracle_user = oracle_user
75
+ self.oracle_username = oracle_username
76
76
  self.oracle_password = oracle_password
77
- warehouse.init_db(self.db_path,reset = False)
78
- self.xref = xref
79
- self.outlets = outlets
77
+
78
+ if not self.db_path.exists() or reset:
79
+ self._build_warehouse()
80
+
81
+ self.xref = xref #TODO: implement xref manager class
82
+ self.outlets = outlets #TODO: implement outlets manager class
80
83
  self.reports = reportManager(self.db_path)
81
84
 
82
85
 
83
86
  def connect_to_oracle(self):
84
87
  assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
85
- equis.connect(user = self.oracle_user, password = self.oracle_password)
88
+ equis.connect(user = self.oracle_username, password = self.oracle_password)
86
89
 
87
90
  def credentials_exist(self):
88
- if (self.oracle_user is not None) & (self.oracle_password is not None):
91
+ if (self.oracle_username is not None) & (self.oracle_password is not None):
89
92
  return True
90
93
  else:
91
94
  return False
92
95
 
93
96
  def _build_warehouse(self):
94
- build_warehouse(self.folderpath)
97
+ warehouse.init_db(self.db_path.as_posix(),True)
95
98
 
96
- def download_station_data(self,station_id,station_origin,overwrite=True,to_csv = False,filter_qc_codes = True, start_year = 1996, end_year = 2030,baseflow_method = 'Boughton'):
97
- '''
98
- Method to download data for a specific station and load it into the warehouse.
99
-
100
- :param self: Description
101
- :param station_id: Station identifier
102
- :param station_origin: source of station data: wiski, equis, or swd
103
- :param overwrite: Whether to overwrite existing data
104
- :param to_csv: Whether to export data to CSV
105
- :param filter_qc_codes: Whether to filter quality control codes
106
- :param start_year: Start year for data download
107
- :param end_year: End year for data download
108
- :param baseflow_method: Method for baseflow calculation
109
- '''
110
- with duckdb.connect(self.db_path,read_only=False) as con:
111
- if overwrite:
112
- warehouse.drop_station_id(con,station_id,station_origin)
113
- warehouse.update_views(con)
99
+ def _process_wiski_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
100
+ with warehouse.connect(self.db_path,read_only = False) as con:
101
+ df = con.execute("SELECT * FROM staging.wiski").df()
102
+ df_transformed = wiski.transform(df, filter_qc_codes, data_codes, baseflow_method)
103
+ warehouse.load_df_to_table(con,df_transformed, 'analytics.wiski')
104
+ warehouse.update_views(con)
114
105
 
115
- if station_origin == 'wiski':
116
- df = wiski.download([station_id],start_year = start_year, end_year = end_year)
117
- warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = overwrite)
118
- warehouse.load_df_to_analytics(con,wiski.transform(df,filter_qc_codes = filter_qc_codes,baseflow_method = baseflow_method),'wiski') # method includes normalization
119
-
120
- elif station_origin == 'equis':
121
- assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
122
- df = equis.download([station_id])
123
- warehouse.load_df_to_staging(con,df, 'equis_raw',replace = overwrite)
124
- warehouse.load_df_to_analytics(con,equis.transform(df),'equis')
125
-
126
- elif station_origin == 'swd':
127
- df = etlSWD.download(station_id)
128
- warehouse.load_df_to_staging(con,df, 'swd_raw', replace = overwrite)
129
- warehouse.load_df_to_analytics(con,etlSWD.transform(df),'swd')
130
- else:
131
- raise ValueError('station_origin must be wiski, equis, or swd')
132
-
133
- with duckdb.connect(self.db_path,read_only=False) as con:
106
+ def _process_equis_data(self):
107
+ with warehouse.connect(self.db_path,read_only = False) as con:
108
+ df = con.execute("SELECT * FROM staging.equis").df()
109
+ df_transformed = equis.transform(df)
110
+ warehouse.load_df_to_table(con,df_transformed, 'analytics.equis')
134
111
  warehouse.update_views(con)
135
112
 
136
- if to_csv:
137
- self.to_csv(station_id)
138
-
113
+ def _process_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
114
+ self._process_wiski_data(filter_qc_codes, data_codes, baseflow_method)
115
+ self._process_equis_data()
116
+
117
+ def _update_views(self):
118
+ with warehouse.connect(self.db_path,read_only = False) as con:
119
+ warehouse.update_views(con)
120
+
121
+ def _download_wiski_data(self,station_ids,start_year = 1996, end_year = 2030, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
122
+ with warehouse.connect(self.db_path,read_only = False) as con:
123
+ df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
124
+ if not df.empty:
125
+ warehouse.load_df_to_table(con,df, 'staging.wiski')
126
+ warehouse.load_df_to_table(con,wiski.transform(df, filter_qc_codes,data_codes,baseflow_method), 'analytics.wiski')
127
+ warehouse.update_views(con)
128
+ else:
129
+ print('No data neccesary for HSPF calibration available from wiski for stations:',station_ids)
130
+
131
+ def _download_equis_data(self,station_ids):
132
+ if self.credentials_exist():
133
+ self.connect_to_oracle()
134
+ print('Connected to Oracle database.')
135
+ with warehouse.connect(self.db_path,read_only = False) as con:
136
+ df = equis.download(station_ids)
137
+ if not df.empty:
138
+ warehouse.load_df_to_table(con,df, 'staging.equis')
139
+ warehouse.load_df_to_table(con,equis.transform(df.copy()), 'analytics.equis')
140
+ warehouse.update_views(con)
141
+ else:
142
+ print('No data neccesary for HSPF calibration available from equis for stations:',station_ids)
143
+ else:
144
+ raise ValueError('Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
145
+
146
+
147
+ def _get_equis_template(self):
148
+ with duckdb.connect(self.db_path,read_only=True) as con:
149
+ query = '''
150
+ SELECT *
151
+ FROM staging.equis
152
+ LIMIT 0'''
153
+ df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('equis_template.csv'), index=False)
139
154
  return df
140
155
 
141
- def get_outlets(self):
156
+ def _get_wiski_template(self):
157
+ with duckdb.connect(self.db_path,read_only=True) as con:
158
+ query = '''
159
+ SELECT *
160
+ FROM staging.wiski
161
+ LIMIT 0'''
162
+ df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('wiski_template.csv'), index=False)
163
+ return df
164
+
165
+ def get_outlets(self,model_name):
142
166
  with duckdb.connect(self.db_path,read_only=True) as con:
143
167
  query = '''
144
168
  SELECT *
145
169
  FROM outlets.station_reach_pairs
170
+ WHERE repository_name = ?
146
171
  ORDER BY outlet_id'''
147
- df = con.execute(query).fetch_df()
172
+ df = con.execute(query,[model_name]).fetch_df()
148
173
  return df
174
+
149
175
  def get_station_ids(self,station_origin = None):
150
176
  with duckdb.connect(self.db_path,read_only=True) as con:
151
177
  if station_origin is None:
@@ -163,9 +189,7 @@ class dataManager():
163
189
  return df['station_id'].to_list()
164
190
 
165
191
 
166
- def get_station_data(self,station_ids,constituent,agg_period = None):
167
-
168
-
192
+ def get_observation_data(self,station_ids,constituent,agg_period = None):
169
193
  with duckdb.connect(self.db_path,read_only=True) as con:
170
194
  query = '''
171
195
  SELECT *
@@ -184,9 +208,9 @@ class dataManager():
184
208
  df.attrs['agg_period'] = agg_period
185
209
 
186
210
  df.rename(columns={'value': 'observed'}, inplace=True)
187
- return df
211
+ return df.dropna(subset=['observed'])
188
212
 
189
- def get_outlet_data(self,outlet_id,constituent,agg_period = 'D'):
213
+ def get_outlet_data(self,outlet_id,constituent,agg_period = 'D',to_csv = False):
190
214
  with duckdb.connect(self.db_path,read_only=True) as con:
191
215
  query = '''
192
216
  SELECT *
@@ -207,16 +231,35 @@ class dataManager():
207
231
  df.rename(columns={'value': 'observed',
208
232
  'flow_value': 'observed_flow',
209
233
  'baseflow_value': 'observed_baseflow'}, inplace=True)
210
- return df
211
-
234
+ return df.dropna(subset=['observed'])
212
235
 
236
+ def get_raw_data(self,station_id,station_origin, to_csv = False):
237
+ with duckdb.connect(self.db_path,read_only=True) as con:
238
+ if station_origin.lower() == 'equis':
239
+ query = '''
240
+ SELECT *
241
+ FROM staging.equis_raw
242
+ WHERE station_id = ?'''
243
+ elif station_origin.lower() == 'wiski':
244
+ query = '''
245
+ SELECT *
246
+ FROM staging.wiski_raw
247
+ WHERE station_id = ?'''
248
+ else:
249
+ raise ValueError(f'Station origin {station_origin} not recognized. Valid options are equis or wiski.')
250
+
251
+ df = con.execute(query,[station_id]).fetch_df()
252
+
253
+ if to_csv:
254
+ df.to_csv(self.folderpath.joinpath(f'{station_id}_raw.csv'), index=False)
255
+ return df
213
256
 
214
- def to_csv(self,station_id,folderpath = None):
257
+ def to_csv(self,station_id ,station_origin,folderpath = None):
215
258
  if folderpath is None:
216
259
  folderpath = self.folderpath
217
260
  else:
218
261
  folderpath = Path(folderpath)
219
- df = self._load(station_id)
262
+ df = self.get_station_data([station_id],constituent = 'Q',agg_period = None)
220
263
  if len(df) > 0:
221
264
  df.to_csv(folderpath.joinpath(station_id + '.csv'))
222
265
  else:
@@ -26,19 +26,21 @@ CONSTITUENT_MAP = {i[0]:i[1] for i in EQUIS_PARAMETER_XREF[['PARAMETER','constit
26
26
  # return df
27
27
  import requests
28
28
 
29
- def _download(station_no):
29
+ def _download(station_id):
30
30
  # Replace {station_no} in the URL with the actual station number
31
- url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
32
-
31
+ #url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
32
+ url = 'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results'
33
+
33
34
  try:
34
35
  # Send a GET request to the URL
35
- response = requests.get(url)
36
+ params = {
37
+ 'stationId': station_id,
38
+ 'format': 'json'
39
+ }
40
+ response = requests.get(url,params = params)
36
41
  response.raise_for_status() # Raise exception for HTTP errors
37
42
  # Parse the JSON data
38
- if response.json()['recordCount'] == 0:
39
- return pd.DataFrame(columns = response.json()['column_names'])
40
- else:
41
- return pd.DataFrame(response.json()['data'])
43
+ return pd.DataFrame(response.json()['data'])
42
44
 
43
45
  except requests.exceptions.RequestException as e:
44
46
  print(f"An error occurred: {e}")
@@ -46,14 +48,18 @@ def _download(station_no):
46
48
 
47
49
 
48
50
 
49
- def download(station_no):
51
+ def download(station_ids):
50
52
  #df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
51
- df = _download(station_no)
52
- if df.empty:
53
- return df
54
- else:
55
- df['station_id'] = station_no
56
- return transform(df)
53
+ dfs = []
54
+ for station_id in station_ids:
55
+ df = _download(station_id)
56
+ if not df.empty:
57
+ df['station_id'] = station_id
58
+ dfs.append(df)
59
+
60
+ return pd.concat(dfs, ignore_index=True)
61
+
62
+
57
63
 
58
64
  def info(station_no):
59
65
  #df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')