mpcaHydro 2.2.7__tar.gz → 2.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/PKG-INFO +1 -1
  2. mpcahydro-2.2.9/demo.py +226 -0
  3. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/pyproject.toml +1 -1
  4. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/outlet.duckdb +0 -0
  5. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
  6. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
  7. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/equis.py +8 -0
  8. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/outlets.py +70 -24
  9. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/swd.py +21 -15
  10. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/warehouse.py +57 -3
  11. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/warehouse_functions.py +3 -2
  12. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/wiski.py +9 -0
  13. mpcahydro-2.2.9/tests/integration/test_dataManager.py +61 -0
  14. mpcahydro-2.2.9/tests/integration/test_warehouse.duckdb +0 -0
  15. mpcahydro-2.2.9/tests/unit/test_equis.py +19 -0
  16. mpcahydro-2.2.7/tests/pixi.toml +0 -25
  17. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/.gitattributes +0 -0
  18. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/.gitignore +0 -0
  19. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/README.md +0 -0
  20. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/__init__.py +0 -0
  21. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/csg.py +0 -0
  22. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/EQUIS_PARAMETER_XREF.csv +0 -0
  23. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/WISKI_EQUIS_XREF.csv +0 -0
  24. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv +0 -0
  25. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/data/wiskiweb01.pca.state.mn.us.crt +0 -0
  26. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/pywisk.py +0 -0
  27. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/reports.py +0 -0
  28. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/analytics_tables.sql +0 -0
  29. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/outlets_schema.sql +0 -0
  30. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/schemas.sql +0 -0
  31. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/staging_tables.sql +0 -0
  32. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/views_analytics.sql +0 -0
  33. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/views_outlets.sql +0 -0
  34. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql/views_reports.sql +0 -0
  35. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/sql_loader.py +0 -0
  36. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/src/mpcaHydro/xref.py +0 -0
  37. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/README.md +0 -0
  38. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/conftest.py +0 -0
  39. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_data_manager.py +0 -0
  40. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_data_manager_integration.py +0 -0
  41. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_equis_integration.py +0 -0
  42. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_warehouse.py +0 -0
  43. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_wiski.py +0 -0
  44. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/integration/test_wiski_integration.py +0 -0
  45. {mpcahydro-2.2.7 → mpcahydro-2.2.9}/tests/test_data_manager_functions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mpcaHydro
3
- Version: 2.2.7
3
+ Version: 2.2.9
4
4
  Summary: Python package for downloading MPCA hydrology data
5
5
  Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
6
6
  Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -0,0 +1,226 @@
1
+ #%%
2
+ from mpcaHydro.data_manager import dataManager
3
+ from pyhcal.repository import Repository
4
+ from mpcaHydro import outlets
5
+ import duckdb
6
+ from mpcaHydro import equis, warehouse, wiski
7
+ from hspf.hspfModel import hspfModel
8
+ from hspf.uci import UCI
9
+ from mpcaHydro import etlSWD
10
+
11
+
12
+ #%%
13
+ '''
14
+ New approach. Directly load to warehouse from downloads.
15
+ Store raw and processed data in warehouse. For large timeseries I could store
16
+ as parquet files. The transformations using pandas take a bit of time. I imagine doing them
17
+ within duckdb would be faster.
18
+
19
+ '''
20
+
21
+ # with warehouse.connect(db_path) as con:
22
+ # df = con.execute("SELECT * FROM staging.wiski").df()
23
+ # df = wiski.transform(df,filter_qc_codes = False)
24
+
25
+ #%%
26
import os

# Demo configuration: the model whose stations are pulled, the warehouse file
# to (re)build, and the options the download cells below rely on.
model_name = 'Nemadji'
db_path = f'C:/Users/mfratki/Documents/{model_name}.duckdb'
start_year = 1996
end_year = 2030
replace = True
filter_qc_codes = True
equis_stations = outlets.equis_stations(model_name)
wiski_stations = outlets.wiski_stations(model_name)
# Credentials are read from the environment so that no username/password is
# ever committed or shipped with the package. A missing variable raises
# KeyError before any connection attempt is made.
equis.connect(os.environ['MPCA_ORACLE_USERNAME'],
              password=os.environ['MPCA_ORACLE_PASSWORD'])
warehouse.init_db(db_path, reset=True)
36
+
37
+
38
+ #%% Old approach. Store as individual processed station files then load to warehouse
39
+ #df_equis = equis.download(equis_stations)
40
+ #df_wiski = wiski.download(wiski_stations,start_year = start_year, end_year = end_year)
41
+
42
+ #%% equis
43
+
44
+
45
+
46
+
47
def download_equis_data(db_path,station_ids,replace = False):
    '''Download EQuIS results for station_ids and load them into the warehouse.

    The raw download is written to 'staging.equis' and its transformed form to
    'analytics.equis'. When the download is empty nothing is written and a
    message is printed instead.
    '''
    with warehouse.connect(db_path, read_only=False) as con:
        raw = equis.download(station_ids)
        if raw.empty:
            print('No data neccesary for HSPF calibration available from equis for stations:',station_ids)
            return
        warehouse.load_df_to_table(con, raw, 'staging.equis', replace=replace)
        transformed = equis.transform(raw)
        warehouse.load_df_to_table(con, transformed, 'analytics.equis', replace=replace)
55
+
56
def download_wiski_data(db_path,station_ids,replace = False):
    '''Download WISKI data for station_ids and load it into the warehouse.

    The year range comes from the module-level ``start_year`` / ``end_year``
    variables. The raw download is written to 'staging.wiski' and its
    transformed form to 'analytics.wiski'. When the download is empty nothing
    is written and a message is printed instead.
    '''
    with warehouse.connect(db_path, read_only=False) as con:
        raw = wiski.download(station_ids, start_year=start_year, end_year=end_year)
        if raw.empty:
            print('No data neccesary for HSPF calibration available from wiski for stations:',station_ids)
            return
        warehouse.load_df_to_table(con, raw, 'staging.wiski', replace=replace)
        transformed = wiski.transform(raw)
        warehouse.load_df_to_table(con, transformed, 'analytics.wiski', replace=replace)
64
+
65
+
66
+ # Add to warehouse from custom df. Must contain required normalized columns.
67
+ with warehouse.connect(db_path,read_only = False) as con:
68
+ if replace:
69
+ warehouse.drop_station_id(con,station_id,station_origin='equis')
70
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
71
+
72
+
73
+ warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
74
+ df = equis.normalize(df.copy())
75
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
76
+ df = equis.transform(df)
77
+ warehouse.add_to_table(con,df, 'analytics','equis')
78
+
79
+
80
+
81
+ #%% swd
82
+
83
+ df = etlSWD.download(equis_stations)
84
+
85
+ with warehouse.connect(db_path,read_only = False) as con:
86
+ warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
87
+ df = equis.normalize(df.copy())
88
+ warehouse.add_to_table(con,df, 'staging','equis_normalized')
89
+ df = equis.transform(df)
90
+ warehouse.add_to_table(con,df, 'analytics','equis')
91
+ #%% wiski
92
+
93
+
94
+
95
+ if station_origin == 'wiski':
96
+ df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
97
+ warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = replace)
98
+ df = wiski.normalize(df.copy())
99
+ warehouse.add_to_table(con,df, 'staging','wiski_normalized')
100
+ df = wiski.transform(df,filter_qc_codes = filter_qc_codes)
101
+ warehouse.add_to_table(con,df, 'analytics','wiski') # method includes normalization
102
+
103
+ if station_origin == 'swd':
104
+ df = pd.concat([etlSWD.download(station_id) for station_id in station_ids])
105
+ warehouse.load_df_to_staging(con,df, 'equis_raw', replace = replace)
106
+ df = etlSWD.transform(df.copy())
107
+ warehouse.add_to_table(con,df, 'analytics','equis')
108
+ warehouse.update_views(con)
109
+
110
+ with warehouse.connect(db_path) as con:
111
+ warehouse.update_views(con)
112
+
113
+
114
+ #%%
115
+
116
+ import requests
117
+ url = 'http://ifrshiny.seas.umich.edu/mglp/'
118
+ requests.get(url)
119
+
120
+
121
+
122
+ db_path = 'C:/Users/mfratki/Documents/Rum.duckdb'
123
+ modl_db.build_outlet_db(db_path)
124
+ con = duckdb.connect(db_path)
125
+ con.execute("SELECT * FROM station_reach_pairs").df()
126
+ con.execute('SELECT * FROM station_reach_pairs WHERE outlet_id = 76').df()
127
+
128
+ # Need to remove duplicates from MODL_DB
129
+ modl_db.MODL_DB.loc[modl_db.MODL_DB.duplicated(['station_id','source'])]
130
+
131
+ #%%
132
+ dm = dataManager('C:/Users/mfratki/Documents/')
133
+ dm._build_warehouse()
134
+ equis_stations = modl_db.equis_stations('Nemadji')
135
+ wiski_stations = modl_db.wiski_stations('Nemadji')
136
+
137
+ #%% Old approach. Store as individual processed station files then load to warehouse
138
+ for station_id in equis_stations:
139
+ dm._download_station_data(station_id,'equis', True)
140
+
141
+ for station_id in wiski_stations:
142
+ dm._download_station_data(station_id,'wiski', True)
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+ #%% Adding HSPF outputs to warehouse
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+ con = duckdb.connect(db_path)
166
+
167
+ model_name = 'Nemadji'
168
+ outlets = [group for _, group in modl_db.MODL_DB.query('repository_name == @model_name').groupby(by = ['opnids','repository_name'])]
169
+
170
+ for outlet in outlets:
171
+ 1+1
172
+
173
+
174
+ dfs = []
175
+ for constituent in ['Q','TSS','TP','N','OP','TKN']:
176
+ opnids = modl_db.split_opnids([opnid.split(',') for opnid in set(outlet['opnids'].tolist())])
177
+ for opnid in opnids:
178
+ df = mod.hbns.get_reach_constituent(constituent,opnids,time_step='h')
179
+ df.columns = ['value']
180
+ df['constituent'] = constituent
181
+ df['operation'] = operation
182
+ df['opnid'] = opnid
183
+ dfs.append(df)
184
+
185
+ df = pd.concat(dfs).reset_index()
186
+ df['model_name'] = model_name
187
+
188
+
189
+
190
+ station_ids = ['H05018001','S006-214','S015-102']
191
+ target_constituent = 'TSS'
192
+ flow_constituent = 'Q'
193
+
194
+ # build placeholders for the IN list (one ? per station id)
195
+ placeholders = ','.join(['?'] * len(station_ids))
196
+
197
+ sql = f'''
198
+ SELECT o.*, f.datetime AS flow_datetime, f.value AS flow, f.baseflow, f.station_id AS flow_station_id, f.station_origin AS flow_station_origin
199
+ FROM analytics.observations o
200
+ JOIN analytics.observations f
201
+ ON o.datetime = f.datetime
202
+ WHERE o.constituent = ?
203
+ AND o.station_id IN ({placeholders})
204
+ AND f.constituent = ?;
205
+ '''
206
+
207
+ # parameter order must match the ? positions in the query
208
+ params = [target_constituent] + station_ids + [flow_constituent]
209
+
210
+ df = con.execute(sql, params).df()
211
+
212
+ outlet_id: station_ids
213
+
214
+ outlet_id: opnid
215
+
216
+
217
+ outlets = []
218
+ for index, (_, group) in enumerate(modl_db.MODL_DB.groupby(by = ['opnids','repository_name'])):
219
+ group['outlet_id'] = index
220
+ group.reset_index(drop=True, inplace=True)
221
+ outlets.append(group)
222
+
223
+
224
+ for _, row in group.iterrows():
225
+ opnids = group.split_opnids(row['opnids'].str.split(',').to_list())
226
+ row*len(opnids)
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "mpcaHydro"
7
7
  urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" } # ? Add this!
8
- version = "2.2.7"
8
+ version = "2.2.9"
9
9
  dependencies = [
10
10
  "pandas",
11
11
  "requests",
@@ -265,6 +265,13 @@ def replace_nondetects(df):
265
265
  df.loc[df['value'].isna(), 'value'] = 0
266
266
  return df
267
267
 
268
def filter_years(df, start_year=1996, end_year=None):
    '''Filter Equis data to include only samples within a certain year range.

    Parameters
    ----------
    df : DataFrame with a datetime-typed ``datetime`` column.
    start_year : first year to keep (inclusive); ``None`` disables the lower bound.
    end_year : last year to keep (inclusive); ``None`` disables the upper bound.

    Returns
    -------
    The rows of ``df`` whose ``datetime`` year lies within the requested bounds.
    An empty ``df`` is returned unchanged.
    '''
    # An empty frame may not carry a datetime dtype on 'datetime', in which
    # case the .dt accessor raises; there is nothing to filter anyway.
    if df.empty:
        return df
    if start_year is not None:
        df = df[df['datetime'].dt.year >= start_year]
    if end_year is not None:
        df = df[df['datetime'].dt.year <= end_year]
    return df
274
+
268
275
  def normalize(df):
269
276
  '''Normalize Equis data: select relevant columns.'''
270
277
  df = map_constituents(df)
@@ -278,6 +285,7 @@ def transform(df):
278
285
 
279
286
  df = normalize(df)
280
287
  df = replace_nondetects(df)
288
+ df = filter_years(df)
281
289
  if not df.empty:
282
290
  df = average_results(df)
283
291
  return df
@@ -31,7 +31,15 @@ DB_PATH = str(Path(__file__).resolve().parent/'data\\outlet.duckdb')
31
31
  MODL_DB = pd.concat([stations_wiski,stations_equis])
32
32
  MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
33
33
  MODL_DB = MODL_DB.dropna(subset='opnids')
34
+ MODL_DB = MODL_DB.dropna(subset = 'repo_name')
34
35
  MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True)
36
+ # Add outlet_id column to MODL_DB based on enumerate grouping
37
+ outlet_id_map = {}
38
+ for outlet_id, (_, group) in enumerate(MODL_DB.drop_duplicates(['station_id','source']).groupby(by=['opnids','repo_name'])):
39
+ for idx in group.index:
40
+ outlet_id_map[idx] = int(outlet_id)
41
+ MODL_DB['outlet_id'] = MODL_DB.index.map(outlet_id_map)
42
+
35
43
 
36
44
  def _reload():
37
45
  global _stations_wiski, stations_wiski, _stations_equis, stations_equis, MODL_DB
@@ -47,7 +55,14 @@ def _reload():
47
55
  MODL_DB = pd.concat([stations_wiski,stations_equis])
48
56
  MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
49
57
  MODL_DB = MODL_DB.dropna(subset='opnids')
58
+ MODL_DB = MODL_DB.dropna(subset = 'repo_name')
50
59
  MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True)
60
+ # Add outlet_id column to MODL_DB based on enumerate grouping
61
+ outlet_id_map = {}
62
+ for outlet_id, (_, group) in enumerate(MODL_DB.drop_duplicates(['station_id','source']).groupby(by=['opnids','repo_name'])):
63
+ for idx in group.index:
64
+ outlet_id_map[idx] = int(outlet_id)
65
+ MODL_DB['outlet_id'] = MODL_DB.index.map(outlet_id_map)
51
66
 
52
67
 
53
68
  def split_opnids(opnids: list):
@@ -144,7 +159,7 @@ def get_outlets_by_reach(reach_id: int, model_name: str):
144
159
  """,
145
160
  [reach_id, model_name]).fetchdf()
146
161
  return df
147
-
162
+
148
163
  def get_outlets_by_station(station_id: str, station_origin: str):
149
164
  """
150
165
  Return all outlet rows for outlets that include the given station_id from the given station_origin.
@@ -160,6 +175,47 @@ def get_outlets_by_station(station_id: str, station_origin: str):
160
175
  [station_id, station_origin]).fetchdf()
161
176
  return df
162
177
 
178
def get_station_opnids(station_id: str, station_origin: str):
    """
    Look up the reach IDs paired with a station in the outlets database.

    Queries outlets.station_reach_pairs in the database at DB_PATH for rows
    matching both station_id and station_origin, and returns their reach_id
    values as a list (one entry per matching row).
    """
    query = """
            SELECT r.reach_id
            FROM outlets.station_reach_pairs r
            WHERE r.station_id = ? AND r.station_origin = ?
            """
    bindings = [station_id, station_origin]
    with connect(DB_PATH) as con:
        pairs = con.execute(query, bindings).fetchdf()
    return pairs['reach_id'].tolist()
191
+
192
def get_outlet_opnids(outlet_id: int):
    """
    Look up the distinct reach IDs that belong to an outlet.

    Queries outlets.station_reach_pairs in the database at DB_PATH for rows
    with the given outlet_id and returns the de-duplicated reach_id values
    as a list.
    """
    query = """
            SELECT r.reach_id
            FROM outlets.station_reach_pairs r
            WHERE r.outlet_id = ?
            """
    with connect(DB_PATH) as con:
        pairs = con.execute(query, [outlet_id]).fetchdf()
    reach_ids = pairs['reach_id'].tolist()
    # A reach appears once per paired station, so collapse repeats.
    return list(set(reach_ids))
205
+
206
def get_outlet_stations(outlet_id: int):
    """
    Look up the distinct stations that belong to an outlet.

    Queries outlets.station_reach_pairs in the database at DB_PATH for rows
    with the given outlet_id and returns one dict per unique
    (station_id, station_origin) pair, keyed by those two column names.
    """
    query = """
            SELECT r.station_id, r.station_origin
            FROM outlets.station_reach_pairs r
            WHERE r.outlet_id = ?
            """
    with connect(DB_PATH) as con:
        pairs = con.execute(query, [outlet_id]).fetchdf()
    # A station appears once per paired reach, so collapse repeats.
    unique_pairs = pairs[['station_id', 'station_origin']].drop_duplicates()
    return unique_pairs.to_dict(orient='records')
163
219
 
164
220
 
165
221
  class OutletGateway:
@@ -179,7 +235,7 @@ class OutletGateway:
179
235
  return equis_station_opnids(self.model_name)
180
236
 
181
237
  def station_opnids(self):
182
- return station_opnids(self.model_name)
238
+ return mapped_station_opnids(self.model_name)
183
239
 
184
240
  def equis_stations(self):
185
241
  return equis_stations(self.model_name)
@@ -207,6 +263,12 @@ class OutletGateway:
207
263
  assert(station_id in self.wiski_stations() + self.equis_stations()), f"Station ID {station_id} not found in model {self.model_name}"
208
264
  return get_outlets_by_station(station_id, station_origin)
209
265
 
266
+ def get_outlet_opnids(self, outlet_id: int):
267
+ return get_outlet_opnids(outlet_id)
268
+
269
+ def get_outlet_stations(self, outlet_id: int):
270
+ return get_outlet_stations(outlet_id)
271
+
210
272
  # constructors:
211
273
  def build_outlet_db(db_path: str = None):
212
274
  if db_path is None:
@@ -222,31 +284,15 @@ def build_outlets(con, model_name: str = None):
222
284
  else:
223
285
  modl_db = MODL_DB
224
286
 
225
- for index, (_, group) in enumerate(modl_db.drop_duplicates(['station_id','source']).groupby(by = ['opnids','repo_name'])):
226
- repo_name = group['repo_name'].iloc[0]
227
- add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
228
-
287
+ for outlet_id in modl_db['outlet_id'].unique():
288
+ group = modl_db.query('outlet_id == @outlet_id')
289
+ repo_name = group['repo_name'].iloc[0]
290
+ add_outlet(con, outlet_id = int(outlet_id), outlet_name = None, repository_name = repo_name, notes = None)
229
291
  opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
230
-
231
292
  for opnid in opnids:
232
- add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
233
-
234
- for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
235
- add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
236
-
237
-
238
- def create_outlet_schema(con, model_name : str):
239
- for index, (_, group) in enumerate(outlets(model_name)):
240
- repo_name = group['repo_name'].iloc[0]
241
- add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
242
-
243
- opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
244
-
245
- for opnid in opnids:
246
- add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
247
-
293
+ add_reach(con, outlet_id = int(outlet_id), reach_id = int(opnid), repository_name = repo_name)
248
294
  for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
249
- add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
295
+ add_station(con, outlet_id = int(outlet_id), station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
250
296
 
251
297
 
252
298
  def add_outlet(con,
@@ -26,19 +26,21 @@ CONSTITUENT_MAP = {i[0]:i[1] for i in EQUIS_PARAMETER_XREF[['PARAMETER','constit
26
26
  # return df
27
27
  import requests
28
28
 
29
- def _download(station_no):
29
+ def _download(station_id):
30
30
  # The station ID is passed to the results endpoint as the stationId query parameter
31
- url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
32
-
31
+ #url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
32
+ url = 'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results'
33
+
33
34
  try:
34
35
  # Send a GET request to the URL
35
- response = requests.get(url)
36
+ params = {
37
+ 'stationId': station_id,
38
+ 'format': 'json'
39
+ }
40
+ response = requests.get(url,params = params)
36
41
  response.raise_for_status() # Raise exception for HTTP errors
37
42
  # Parse the JSON data
38
- if response.json()['recordCount'] == 0:
39
- return pd.DataFrame(columns = response.json()['column_names'])
40
- else:
41
- return pd.DataFrame(response.json()['data'])
43
+ return pd.DataFrame(response.json()['data'])
42
44
 
43
45
  except requests.exceptions.RequestException as e:
44
46
  print(f"An error occurred: {e}")
@@ -46,14 +48,18 @@ def _download(station_no):
46
48
 
47
49
 
48
50
 
49
def download(station_ids):
    '''Download surface-water results for one or more stations.

    Parameters
    ----------
    station_ids : an iterable of station ID strings, or a single station ID
        string (accepted so callers written for the earlier single-station
        signature keep working).

    Returns
    -------
    A DataFrame with the rows of every station that returned data, each
    tagged with its ``station_id``. An empty DataFrame is returned when no
    station yields data.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(station_ids, str):
        station_ids = [station_ids]

    dfs = []
    for station_id in station_ids:
        df = _download(station_id)
        # NOTE(review): _download's failure path is not fully visible here; it
        # may return None after printing the request error, so guard for it.
        if df is None or df.empty:
            continue
        df['station_id'] = station_id
        dfs.append(df)

    # pd.concat raises ValueError on an empty list, so handle "no data at all".
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)
61
+
62
+
57
63
 
58
64
  def info(station_no):
59
65
  #df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
@@ -28,6 +28,23 @@ def init_db(db_path: str,reset: bool = False):
28
28
 
29
29
 
30
30
 
31
def validate_schemas(con: duckdb.DuckDBPyConnection):
    """Check that every schema the warehouse relies on exists.

    Reads the schema names from information_schema.schemata and raises
    ValueError naming any of staging, analytics, mappings, outlets or
    reports that is absent.
    """
    required = {'staging', 'analytics', 'mappings', 'outlets', 'reports'}
    rows = con.execute("SELECT schema_name FROM information_schema.schemata").fetchall()
    present = set()
    for row in rows:
        present.add(row[0])
    missing_schemas = required.difference(present)
    if missing_schemas:
        raise ValueError(f"Missing schemas: {missing_schemas}")
39
+
40
def validate_tables(con: duckdb.DuckDBPyConnection, schema: str, expected_tables: set):
    """Check that ``schema`` contains every table in ``expected_tables``.

    Reads the table names for the schema from information_schema.tables and
    raises ValueError naming any expected table that is absent.
    """
    query = "SELECT table_name FROM information_schema.tables WHERE table_schema = ?"
    rows = con.execute(query, [schema]).fetchall()
    present = set()
    for row in rows:
        present.add(row[0])
    missing_tables = expected_tables - present
    if missing_tables:
        raise ValueError(f"Missing tables in {schema} schema: {missing_tables}")
47
+
31
48
  def create_schemas(con: duckdb.DuckDBPyConnection):
32
49
  """Create staging, analytics, hspf, and reports schemas if they do not exist."""
33
50
  con.execute(sql_loader.get_schemas_sql())
@@ -96,12 +113,49 @@ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
96
113
  else:
97
114
  print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
98
115
 
99
- def create_outlets_tables(con: duckdb.DuckDBPyConnection):
116
+
117
def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
    """
    Copy the tables and views of an external outlets DuckDB file into ``con``.

    The file at ``outlets_db_path`` is attached under the alias ``outlets_db``,
    each of its tables is copied with CREATE TABLE ... AS SELECT, each of its
    views is recreated with the ``outlets_db.`` prefix stripped from its
    definition, and the file is detached again, even if copying fails.

    Parameters
    ----------
    con : open connection to the destination database.
    outlets_db_path : path of the DuckDB file holding the outlet definitions.
    """
    create_schemas(con)

    # Double any single quote so the path cannot terminate the SQL string literal.
    escaped_path = str(outlets_db_path).replace("'", "''")
    con.execute(f"ATTACH DATABASE '{escaped_path}' AS outlets_db;")
    try:
        # -- Step 1: copy all tables --
        tables = con.execute("SHOW TABLES FROM outlets_db").fetchall()
        print(f"Tables in the source database: {tables}")

        for table in tables:
            table_name = table[0]
            con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM outlets_db.{table_name}")

        # -- Step 2: copy all views --
        # NOTE(review): the SHOW VIEWS / SHOW CREATE VIEW statements are kept as
        # written; confirm the DuckDB version in use accepts them.
        views = con.execute("SHOW VIEWS FROM outlets_db").fetchall()
        print(f"Views in the source database: {views}")

        for view in views:
            view_name = view[0]
            create_view_sql = con.execute(f"SHOW CREATE VIEW outlets_db.{view_name}").fetchone()[0]
            # Recreate the view locally: drop the source-database qualifier.
            create_view_sql = create_view_sql.replace("outlets_db.", "")
            con.execute(create_view_sql)
    finally:
        # Detach exactly once. The alias is already attached above, so
        # attaching it a second time before detaching would fail.
        con.execute("DETACH outlets_db")
152
+
153
+
154
+ def create_outlets_tables(con: duckdb.DuckDBPyConnection, model_name: str = None):
100
155
  """Create tables in the outlets schema to define outlet-station-reach relationships."""
101
156
  con.execute(sql_loader.get_outlets_schema_sql())
102
157
  con.execute(sql_loader.get_views_outlets_sql())
103
- outlets.build_outlets(con)
104
-
158
+ outlets.build_outlets(con, model_name=model_name)
105
159
 
106
160
  def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
107
161
  """Create a view filtering WISKI data based on specified data codes."""
@@ -101,7 +101,8 @@ def download_wiski_data(
101
101
  if overwrite:
102
102
  warehouse.drop_station_data(con, station_ids, 'wiski')
103
103
  warehouse.add_df_to_table(con, df, 'staging', 'wiski')
104
- warehouse.add_df_to_table(con, df_transformed, 'analytics', 'wiski')
104
+ if not df_transformed.empty:
105
+ warehouse.add_df_to_table(con, df_transformed, 'analytics', 'wiski')
105
106
  warehouse.update_views(con)
106
107
  else:
107
108
  print('No data necessary for HSPF calibration from wiski for:', station_ids)
@@ -351,7 +352,7 @@ def station_reach_pairs(con: duckdb.DuckDBPyConnection):
351
352
  query = '''
352
353
  SELECT *,
353
354
  FROM
354
- reports.station_reach_pairs
355
+ outlets.station_reach_pairs
355
356
  ORDER BY
356
357
  outlet_id,
357
358
  station_id
@@ -336,6 +336,14 @@ def filter_quality_codes(df, data_codes):
336
336
  '''
337
337
  return df.loc[df['quality_code'].isin(data_codes)]
338
338
 
339
def filter_years(df, start_year=1996, end_year=None):
    '''Filter WISKI data to include only records within a certain year range.

    Parameters
    ----------
    df : DataFrame with a datetime-typed ``datetime`` column.
    start_year : first year to keep (inclusive); ``None`` disables the lower bound.
    end_year : last year to keep (inclusive); ``None`` disables the upper bound.

    Returns
    -------
    The rows of ``df`` whose ``datetime`` year lies within the requested bounds.
    An empty ``df`` is returned unchanged.
    '''
    # An empty frame may not carry a datetime dtype on 'datetime', in which
    # case the .dt accessor raises; there is nothing to filter anyway.
    if df.empty:
        return df
    if start_year is not None:
        df = df[df['datetime'].dt.year >= start_year]
    if end_year is not None:
        df = df[df['datetime'].dt.year <= end_year]
    return df
345
+
346
+
339
347
  def average_results(df):
340
348
  #df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
341
349
  df.loc[:,'datetime'] = df.loc[:,'datetime'].dt.round('h')
@@ -392,6 +400,7 @@ def transform(df, filter_qc_codes = True, data_codes = None, baseflow_method = '
392
400
  data_codes = DATA_CODES
393
401
  df = filter_quality_codes(df, data_codes)
394
402
  df = average_results(df)
403
+ df = filter_years(df, start_year=1996)
395
404
  df = calculate_baseflow(df, method = baseflow_method)
396
405
  df['station_origin'] = 'wiski'
397
406
  #df.set_index('datetime',inplace=True)
@@ -0,0 +1,61 @@
1
+ #%% Imports
2
+ from mpcaHydro.data_manager import dataManager
3
+ from pathlib import Path
4
+ import duckdb
5
+ THIS_DIR = Path(__file__).parent
6
+ WISKI_STATIONS = ['E05011002']
7
+ EQUIS_STATIONS = ['S001-235','S005-115']
8
+
9
+ #%%
10
+ def test_build_warehouse():
11
+ dm = dataManager(THIS_DIR)
12
+ dm._build_warehouse()
13
+
14
+ test_build_warehouse()
15
+ # %%
16
def test_equis_data_download():
    """Build a fresh dataManager, connect to Oracle and download the EQuIS test stations.

    Oracle credentials are read from the MPCA_ORACLE_USERNAME and
    MPCA_ORACLE_PASSWORD environment variables so they are never stored in
    the repository; a missing variable raises KeyError.
    """
    import os

    dm = dataManager(THIS_DIR,
                     oracle_username=os.environ['MPCA_ORACLE_USERNAME'],
                     oracle_password=os.environ['MPCA_ORACLE_PASSWORD'],
                     reset=True)

    dm.connect_to_oracle()
    dm._download_equis_data(EQUIS_STATIONS)
24
+
25
+ test_equis_data_download()
26
+ #%%
27
+ def test_wiski_data_download():
28
+ dm = dataManager(THIS_DIR, reset=True)
29
+ dm._download_wiski_data(WISKI_STATIONS)
30
+
31
+
32
+ test_wiski_data_download()
33
+
34
+ #%%
35
+ dm = dataManager(THIS_DIR, reset=False)
36
+ with duckdb.connect(dm.db_path, read_only=True) as con:
37
+ df = con.execute('SELECT * FROM analytics.outlet_observations').fetch_df()
38
+ assert(df['outlet_id'].isnull().sum() == 0)
39
+
40
+ with duckdb.connect(dm.db_path, read_only=True) as con:
41
+ df = con.execute('SELECT * FROM analytics.outlet_observations_with_flow').fetch_df()
42
+ assert(df['outlet_id'].isnull().sum() == 0)
43
+ assert(df['value'].isnull().sum() == 0)
44
+ # %%
45
+ dm = dataManager(THIS_DIR, reset=False)
46
+
47
+
48
+ def test_wiski_download():
49
+ dm = dataManager(THIS_DIR, reset=False)
50
+ wiski_stations = WISKI_STATIONS
51
+ dm._download_wiski_data(wiski_stations)
52
+ return dm
53
+
54
+ test_wiski_download()
55
+
56
+
57
+ with duckdb.connect(dm.db_path, read_only=True) as con:
58
+ df = con.execute('SELECT * FROM analytics.outlet_observations_with_flow').fetch_df()
59
+ assert(df['outlet_id'].isnull().sum() == 0)
60
+
61
+ # %%
@@ -0,0 +1,19 @@
1
+
2
+ #%%
3
+ from mpcaHydro import equis
4
+ from mpcaHydro import outlets
5
+
6
+
7
+
8
+ #%%
9
import os

# Download EQuIS data for every station mapped to the model and check that
# normalization yields the columns the rest of the pipeline expects.
model_name = 'Rum'
equis_stations = outlets.equis_stations(model_name)
# Credentials come from the environment so they are never committed to the
# repository; a missing variable raises KeyError before connecting.
equis.connect(os.environ['MPCA_ORACLE_USERNAME'],
              password=os.environ['MPCA_ORACLE_PASSWORD'])

df = equis.download(equis_stations)

df_normalized = equis.normalize(df.copy())
expected_columns = ['station_id', 'constituent', 'cas_rn', 'datetime', 'value', 'unit']

# Report which columns are absent instead of failing with a bare assertion.
missing_columns = [col for col in expected_columns if col not in df_normalized.columns]
assert not missing_columns, f"Normalized EQuIS data is missing columns: {missing_columns}"
19
+ # %%
@@ -1,25 +0,0 @@
1
- [workspace]
2
- channels = ["https://prefix.dev/conda-forge"]
3
- platforms = ["linux-64", "osx-64", "win-64"]
4
-
5
-
6
- [dependencies]
7
- requests = "*"
8
- pandas = "*"
9
- time = "*"
10
- pathlib = "*"
11
- spyder = "*"
12
- jupyter = "*"
13
-
14
- [package]
15
- name = "mpcaHydro"
16
- version = "0.1.0"
17
-
18
- [package.build]
19
- backend = { name = "pixi-build-python", version = "0.1.*" }
20
-
21
- [package.run-dependencies]
22
- requests = "*"
23
- pandas = "*"
24
- time = "*"
25
- pathlib = "*"
File without changes
File without changes
File without changes