mpcaHydro 2.1.0.tar.gz → 2.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. mpcahydro-2.2.1/ERROR.FIL +6 -0
  2. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/PKG-INFO +3 -1
  3. mpcahydro-2.2.1/demo.py +226 -0
  4. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/pyproject.toml +4 -2
  5. mpcahydro-2.2.1/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv +71 -0
  6. mpcahydro-2.2.1/src/mpcaHydro/data/outlet.duckdb +0 -0
  7. mpcahydro-2.2.1/src/mpcaHydro/data/stations_EQUIS.gpkg +0 -0
  8. mpcahydro-2.2.1/src/mpcaHydro/data/stations_wiski.gpkg +0 -0
  9. mpcahydro-2.2.1/src/mpcaHydro/data_manager.py +292 -0
  10. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/equis.py +31 -22
  11. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/etlSWD.py +21 -15
  12. mpcahydro-2.2.1/src/mpcaHydro/outlets.py +367 -0
  13. mpcahydro-2.2.1/src/mpcaHydro/reports.py +80 -0
  14. mpcahydro-2.2.1/src/mpcaHydro/warehouse.py +711 -0
  15. mpcahydro-2.2.1/src/mpcaHydro/warehouseManager.py +55 -0
  16. mpcahydro-2.1.0/src/mpcaHydro/WISKI.py → mpcahydro-2.2.1/src/mpcaHydro/wiski.py +97 -17
  17. mpcahydro-2.2.1/src/mpcaHydro/xref.py +74 -0
  18. mpcahydro-2.2.1/tests/integration/observations.duckdb +0 -0
  19. mpcahydro-2.2.1/tests/integration/test_dataManager.py +61 -0
  20. mpcahydro-2.2.1/tests/integration/test_warehouse.duckdb +0 -0
  21. mpcahydro-2.2.1/tests/integration/test_warehouse.py +113 -0
  22. mpcahydro-2.2.1/tests/unit/test_equis.py +19 -0
  23. mpcahydro-2.1.0/src/mpcaHydro/data_manager.py +0 -412
  24. mpcahydro-2.1.0/src/mpcaHydro/warehouse.py +0 -203
  25. mpcahydro-2.1.0/tests/pixi.toml +0 -25
  26. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/.gitattributes +0 -0
  27. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/.gitignore +0 -0
  28. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/README.md +0 -0
  29. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/__init__.py +0 -0
  30. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/data/EQUIS_PARAMETER_XREF.csv +0 -0
  31. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/data/WISKI_EQUIS_XREF.csv +0 -0
  32. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/etlCSG.py +0 -0
  33. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/etlWISKI.py +0 -0
  34. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/etlWPLMN.py +0 -0
  35. {mpcahydro-2.1.0 → mpcahydro-2.2.1}/src/mpcaHydro/pywisk.py +0 -0
mpcahydro-2.2.1/ERROR.FIL
@@ -0,0 +1,6 @@
+ 15:21:44.605 : LOG_MSG:ERROR.FIL OPENED
+ 15:21:44.607 : HASS_ENT:F90_WDBOPNR:entr:WDMSFL,RWFLG: 100 1 C:\Users\mfratki\Documents\github\pyHSPF\src\hspf\bin\WinHSPFLt\hspfmsg.wdm
+ 15:21:44.608 : HASS_ENT:F90_WDBOPNR:exit:WDMSFL,RETCOD 100 0
+ FILBLK RETCOD 0
+ wdmfl 0 0 0 0
+ FILBLK RETCOD 0
{mpcahydro-2.1.0 → mpcahydro-2.2.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mpcaHydro
- Version: 2.1.0
+ Version: 2.2.1
  Summary: Python package for downloading MPCA hydrology data
  Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
  Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
@@ -10,7 +10,9 @@ Keywords: Hydrology,MPCA
  Classifier: Development Status :: 3 - Alpha
  Classifier: Programming Language :: Python
  Requires-Python: >=3.8
+ Requires-Dist: baseflow
  Requires-Dist: duckdb
+ Requires-Dist: oracledb
  Requires-Dist: pandas
  Requires-Dist: pathlib
  Requires-Dist: requests
mpcahydro-2.2.1/demo.py
@@ -0,0 +1,226 @@
+ #%%
+ from mpcaHydro.data_manager import dataManager
+ from pyhcal.repository import Repository
+ from mpcaHydro import outlets
+ import duckdb
+ import pandas as pd
+ from mpcaHydro import equis, warehouse, wiski
+ from hspf.hspfModel import hspfModel
+ from hspf.uci import UCI
+ from mpcaHydro import etlSWD
+
+
+ #%%
+ '''
+ New approach. Directly load to warehouse from downloads.
+ Store raw and processed data in the warehouse. For large timeseries I could store
+ them as parquet files. The transformations using pandas take a bit of time; I imagine
+ doing them within duckdb would be faster (see the sketch below).
+ '''
+
+ # with warehouse.connect(db_path) as con:
+ #     df = con.execute("SELECT * FROM staging.wiski").df()
+ #     df = wiski.transform(df,filter_qc_codes = False)
+
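# A minimal sketch of the duckdb-side transformation the docstring above contemplates:
# aggregating a staged timeseries to daily means directly in SQL instead of round-tripping
# through pandas. The staging.wiski column names used here (station_id, datetime, value)
# are assumptions inferred from how the table is queried elsewhere in this demo, and
# db_path is the warehouse path defined in the configuration cell below.
with duckdb.connect(db_path) as con:
    daily = con.execute(
        """
        SELECT station_id,
               date_trunc('day', datetime) AS datetime,
               avg(value) AS value
        FROM staging.wiski
        GROUP BY station_id, date_trunc('day', datetime)
        ORDER BY station_id, datetime
        """
    ).df()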
+ #%%
+ model_name = 'Nemadji'
+ db_path = f'C:/Users/mfratki/Documents/{model_name}.duckdb'
+ start_year = 1996
+ end_year = 2030
+ replace = True
+ filter_qc_codes = True
+ equis_stations = outlets.equis_stations(model_name)
+ wiski_stations = outlets.wiski_stations(model_name)
+ equis.connect('MFRATKI',password = 'DeltaT#MPCA3')
+ warehouse.init_db(db_path,reset = True)
+
+
+ #%% Old approach. Store as individual processed station files then load to warehouse
+ #df_equis = equis.download(equis_stations)
+ #df_wiski = wiski.download(wiski_stations,start_year = start_year, end_year = end_year)
+
+ #%% equis
+
+
+ def download_equis_data(db_path,station_ids,replace = False):
+     with warehouse.connect(db_path,read_only = False) as con:
+         df = equis.download(station_ids)
+         if not df.empty:
+             warehouse.load_df_to_table(con,df, 'staging.equis',replace = replace)
+             warehouse.load_df_to_table(con,equis.transform(df), 'analytics.equis',replace = replace)
+         else:
+             print('No data necessary for HSPF calibration available from equis for stations:',station_ids)
+
+ def download_wiski_data(db_path,station_ids,replace = False):
+     with warehouse.connect(db_path,read_only = False) as con:
+         df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
+         if not df.empty:
+             warehouse.load_df_to_table(con,df, 'staging.wiski', replace = replace)
+             warehouse.load_df_to_table(con,wiski.transform(df), 'analytics.wiski',replace = replace)
+         else:
+             print('No data necessary for HSPF calibration available from wiski for stations:',station_ids)
+
+
+ # Add to warehouse from a custom df. Must contain the required normalized columns.
+ # (Scratch fragment: assumes df and station_id are already defined in the session.)
+ with warehouse.connect(db_path,read_only = False) as con:
+     if replace:
+         warehouse.drop_station_id(con,station_id,station_origin='equis')
+     warehouse.add_to_table(con,df, 'staging','equis_normalized')
+
+     warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
+     df = equis.normalize(df.copy())
+     warehouse.add_to_table(con,df, 'staging','equis_normalized')
+     df = equis.transform(df)
+     warehouse.add_to_table(con,df, 'analytics','equis')
+
+
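# A short usage sketch of download_equis_data / download_wiski_data defined above,
# assuming the configuration cell has been run (db_path, equis_stations, wiski_stations,
# start_year, end_year) and equis.connect has already been called:
download_equis_data(db_path, equis_stations, replace = True)
download_wiski_data(db_path, wiski_stations, replace = True)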
+ #%% swd
+
+ df = etlSWD.download(equis_stations)
+
+ with warehouse.connect(db_path,read_only = False) as con:
+     warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
+     df = equis.normalize(df.copy())
+     warehouse.add_to_table(con,df, 'staging','equis_normalized')
+     df = equis.transform(df)
+     warehouse.add_to_table(con,df, 'analytics','equis')
+ #%% wiski
+
+
+ # Scratch fragment: assumes con, station_origin, and station_ids are defined in the session.
+ if station_origin == 'wiski':
+     df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
+     warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = replace)
+     df = wiski.normalize(df.copy())
+     warehouse.add_to_table(con,df, 'staging','wiski_normalized')
+     df = wiski.transform(df,filter_qc_codes = filter_qc_codes)
+     warehouse.add_to_table(con,df, 'analytics','wiski') # method includes normalization
+
+ if station_origin == 'swd':
+     df = pd.concat([etlSWD.download(station_id) for station_id in station_ids])
+     warehouse.load_df_to_staging(con,df, 'equis_raw', replace = replace)
+     df = etlSWD.transform(df.copy())
+     warehouse.add_to_table(con,df, 'analytics','equis')
+     warehouse.update_views(con)
+
+ with warehouse.connect(db_path) as con:
+     warehouse.update_views(con)
+
+
+ #%%
+
+ import requests
+ url = 'http://ifrshiny.seas.umich.edu/mglp/'
+ requests.get(url)
+
+
+ # Scratch cells below reference modl_db, mod, and operation, which are assumed to be
+ # defined in an earlier session.
+ db_path = 'C:/Users/mfratki/Documents/Rum.duckdb'
+ modl_db.build_outlet_db(db_path)
+ con = duckdb.connect(db_path)
+ con.execute("SELECT * FROM station_reach_pairs").df()
+ con.execute('SELECT * FROM station_reach_pairs WHERE outlet_id = 76').df()
+
+ # Need to remove duplicates from MODL_DB
+ modl_db.MODL_DB.loc[modl_db.MODL_DB.duplicated(['station_id','source'])]
+
+ #%%
+ dm = dataManager('C:/Users/mfratki/Documents/')
+ dm._build_warehouse()
+ equis_stations = modl_db.equis_stations('Nemadji')
+ wiski_stations = modl_db.wiski_stations('Nemadji')
+
+ #%% Old approach. Store as individual processed station files then load to warehouse
+ for station_id in equis_stations:
+     dm._download_station_data(station_id,'equis', True)
+
+ for station_id in wiski_stations:
+     dm._download_station_data(station_id,'wiski', True)
+
+
+ #%% Adding HSPF outputs to warehouse
+
+
+ con = duckdb.connect(db_path)
+
+ model_name = 'Nemadji'
+ outlets = [group for _, group in modl_db.MODL_DB.query('repository_name == @model_name').groupby(by = ['opnids','repository_name'])]
+
+ for outlet in outlets:
+     1+1
+
+
+ dfs = []
+ for constituent in ['Q','TSS','TP','N','OP','TKN']:
+     opnids = modl_db.split_opnids([opnid.split(',') for opnid in set(outlet['opnids'].tolist())])
+     for opnid in opnids:
+         df = mod.hbns.get_reach_constituent(constituent,opnids,time_step='h')
+         df.columns = ['value']
+         df['constituent'] = constituent
+         df['operation'] = operation
+         df['opnid'] = opnid
+         dfs.append(df)
+
+ df = pd.concat(dfs).reset_index()
+ df['model_name'] = model_name
+
+
+ station_ids = ['H05018001','S006-214','S015-102']
+ target_constituent = 'TSS'
+ flow_constituent = 'Q'
+
+ # build placeholders for the IN list (one ? per station id)
+ placeholders = ','.join(['?'] * len(station_ids))
+
+ sql = f'''
+ SELECT o.*, f.datetime AS flow_datetime, f.value AS flow, f.baseflow, f.station_id AS flow_station_id, f.station_origin AS flow_station_origin
+ FROM analytics.observations o
+ JOIN analytics.observations f
+     ON o.datetime = f.datetime
+ WHERE o.constituent = ?
+   AND o.station_id IN ({placeholders})
+   AND f.constituent = ?;
+ '''
+
+ # parameter order must match the ? positions in the query
+ params = [target_constituent] + station_ids + [flow_constituent]
+
+ df = con.execute(sql, params).df()
+
+ # outlet_id: station_ids
+
+ # outlet_id: opnid
+
+
+ outlets = []
+ for index, (_, group) in enumerate(modl_db.MODL_DB.groupby(by = ['opnids','repository_name'])):
+     group['outlet_id'] = index
+     group.reset_index(drop=True, inplace=True)
+     outlets.append(group)
+
+
+ for _, row in group.iterrows():
+     opnids = group.split_opnids(row['opnids'].str.split(',').to_list())
+     row*len(opnids)
{mpcahydro-2.1.0 → mpcahydro-2.2.1}/pyproject.toml
@@ -5,12 +5,14 @@ build-backend = "hatchling.build"
  [project]
  name = "mpcaHydro"
  urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" } # ? Add this!
- version = "2.1.0"
+ version = "2.2.1"
  dependencies = [
  "pandas",
  "requests",
  "pathlib",
- "duckdb"
+ "duckdb",
+ "oracledb",
+ "baseflow"
  ]
  requires-python = ">=3.8"
  authors = [
mpcahydro-2.2.1/src/mpcaHydro/data/WISKI_QUALITY_CODES.csv
@@ -0,0 +1,71 @@
+ quality_code,Text,Description,Active
+ 0,Unchecked,Unchecked data in progress or data that is not quality coded as part of the workup. Default coding for shifts so the quality codes from Level and Ratings are used for discharges.,1
+ 3,Instantaneous,Instantaneous groundwater measurements or sampled date for load stations.,1
+ 5,Excellent,Discharge measurements that are excellent.,1
+ 8,Reliable Interpolation,The value of the data point is an interpolation between adjacent points. Code used for filling gaps less than 4 hours or with no change in data trend likely based on reference information.,1
+ 10,Good,Time series data that tracks well and requires no corrections or corrections of very small magnitude or timeseries data that has been reviewed and accepted for precipitation and groundwater level. Also used for discharge measurements and rating points.,1
+ 15,Fair,Time series data that tracks fairly well and requires some corrections of relatively small magnitude. Also used for discharge measurements and rating points.,1
+ 20,Poor,Time series data that tracks poorly and requires significant or many corrections. Also used for discharge measurements and rating points.,1
+ 27,Questionable,"Timeseries data or discharge measurements that are questionable due to operator error, equipment error, etc. Extra scrutiny should be used for these data.",1
+ 28,Unknown data quality,"Unknown quality of time series data, ratings or discharge measurements.",1
+ 29,Modeled,"Time-series data, rating point or discharge from a reliable mathematical and/or computer model.",1
+ 34,Estimated,"Time-series data estimated from reference traces, models or extrapolation of the rating curve using supporting data and up to two times the maximum measured discharge.",1
+ 35,Unreliable,Time-series data computed with a rating extrapolated without supporting data or beyond two times the maximum measured discharge without a model.,1
+ 36,Threshold Exceedance,"Time-series data may be beyond the measuring limits of the monitoring equipment, or outside the bounds of historical extremes.",1
+ 40,Default import code,WISKI default coding for gaugings.,1
+ 45,Approved Ext Data,"External data that has been graded externally as ""Approved"".",1
+ 48,Unknown Ext Data,External data that has been graded internally as “Unknown”.,1
+ 49,Estimated Ext Data,External data that has been graded externally as “Estimated.” Typically this is finalized ice data.,1
+ 50,Provisional Ext Data,External data that has been graded internally or externally as “Provisional”.,1
+ 80,Ice - Estimated,Ice affected time series data. Discharge computed with ice affected stage data is considered estimated.,1
+ 199,199-Logger Unknown,Initial code for data coming to the system from the logger.,1
+ 200,200,Initial code for data coming to the system from telemetry or default coding for WISKI timeseries.,1
+ 228,Info Parameter,This parameter is collected for informational purposes only. Data has been through a cursory check only. This is stored in the database and available upon request.,1
+ 255,---,System assigned code for gaps in the data set. Records with null values.,1
+ 1,Continuous Data,~Discontinued~ Good TS data that requires no correction.,0
+ 2,Edited Data,~Discontinued~ TS data that has been edited. Typically used when spikes are removed or when points are edited manually for datum corrections.,0
+ 3,Instantaneous Data,Final WQ data.,0
+ 4,Questionable data,~Discontinued~,0
+ 5,Excellent measurement,Used to indicate discharge measurements that are excellent as well as excellent sections of the rating.,0
+ 10,Good measurement,Used to indicate discharge measurements and sections of the rating that are good and time series data that tracks well and requires no corrections or corrections of very small magnitude.,0
+ 12,Modeled measurement,~Discontinued~ Rating point or discharge was obtained from a reliable mathematical and/or computer model. After 3/1/11 use QC148.,0
+ 15,Fair measurement,Used to indicate discharge measurements and sections of the rating that are fair and time series data that tracks fairly well and requires some corrections of relatively small magnitude.,0
+ 20,Poor measurement,Used to indicate discharge measurements and sections of the rating that are poor and time series data that tracks poorly and requires significant or many corrections.,0
+ 25,Unknown measurement,Measurement data not available.,0
+ 27,Questionable data,"Flow measurement is very poor and should be given extra scrutiny or time series data that is questionable due to operator error, equipment error, etc.",0
+ 30,Good Archived Daily Value,This code is used for archived daily value data that is considered “Good”.,0
+ 31,Fair Archived Daily Value,This code is used for archived daily value data that is considered “Fair”.,0
+ 32,Poor Archived Daily Value,This code is used for archived daily value data that is considered “Poor”.,0
+ 33,Unknown Archived Daily Value,This code is used for archived daily value data that has unknown quality based on lack of documentation.,0
+ 34,Estimated Archived Daily Value,This code is used for archived daily value data that has been estimated.,0
+ 35,Unreliable Archived Daily Value,This code is used for archived daily value data that is unreliable based on the quality of the supporting time series data and/or rating.,0
+ 45,Good External Data,This code is used for external data that has been graded internally as “Good”.,0
+ 46,Fair External Data,This code is used for external data that has been graded internally as “Fair”.,0
+ 47,Poor External Data,This code is used for external data that has been graded internally as “Poor”.,0
+ 48,Unknown External Data,This code is used for external data that has been graded internally as “Unknown”.,0
+ 49,Estimated External Data,This code is used for external data that has been graded externally as “Estimated.” Typically this is finalized ice data.,0
+ 50,Provisional External Data,This code is used for external data that has been graded internally as “Provisional”.,0
+ 51,Telemetry data - DCP,This code is used for time-series data when imported into hydstra using an automated telemetry method that accesses a DCP through the GOES network. The “questionable measurement” flag is set through the shef code that accompanies the DCP data.,0
+ 60,Above rating,~Discontinued~,0
+ 70,Estimated Data,Value of the data point is estimated.,0
+ 76,Reliable interpolation,Value of the data point is an interpolation between adjacent points.,0
+ 80,Ice,"(DISCONTINUED) Used to indicate ice conditions when the data should not be exported. Use in conjunction with 80 to code 232.00 values, run USDAY to compute daily flow, then recode 232.00 80 values to 180 so unit value export cannot occur.",0
+ 82,Linear interpolation across a gap in records,~Discontinued~ Points that were added to fill a gap in the data record. The points fall on a straight line between the end points of the gap. This code was changed to 8 in WISKI.,0
+ 103,Provisional Instantaneous Data,Provisional WQ data.,0
+ 130,Good Provisional Daily Value,This code is used for archived daily value data that is considered “Good” but Provisional because there is only one year of gaging measurements.,0
+ 131,Fair Provisional Daily Value,This code is used for archived daily value data that is considered “Fair” but Provisional because there is only one year of gaging measurements.,0
+ 132,Poor Provisional Daily Value,This code is used for archived daily value data that is considered “Poor” but Provisional because there is only one year of gaging measurements.,0
+ 133,Unknown Provisional Archived Daily Value,This code is used for archived daily value data that has unknown quality based on lack of documentation but Provisional because there is only one year of gaging measurements.,0
+ 134,Estimated Provisional Archived Daily Value,This code is used for archived daily value data that has been estimated but Provisional because there is only one year of gaging measurements.,0
+ 135,Unreliable Provisional Archived Daily Value,This code is used for archived daily value data that is unreliable based on the quality of the supporting time series data and/or rating but Provisional because there is only one year of gaging measurements.,0
+ 140,Data not yet checked,This code is used for time-series data when it is initially imported into hydstra using manual import methods.,0
+ 141,Telemetry data - not yet checked,This code is used for time-series data when it is imported into hydstra using an automated telemetry method.,0
+ 148,Modeled measurement,Rating point or discharge was obtained from a reliable mathematical and/or computer model.,0
+ 149,Extrapolated rating point,Rating point accurately extrapolated using supporting data and is less than two times the maximum measured discharge.,0
+ 150,Over-extrapolated rating point,Rating point extrapolated without supporting data or beyond two times the maximum measured discharge without a mathematical model.,0
+ 151,Data Missing,"This code is used to flag the end of a period of missing time-series data, before the next good data value.",0
+ 160,Above rating,~Discontinued~,0
+ 169,Datalogger Hardware Error Code 6999,"This code is used to indicate that a time-series point had a value of 6999 or -6999, a typical hardware error code, and the value was changed.",0
+ 170,Estimated Data,"Used to indicate estimated data when the data should not be exported. Often used in conjunction with 70 to code 232.00 values, run USDAY to compute daily flow, then recode 232.00 70 values to 170 so unit value export cannot occur.",0
+ 180,Ice,Used to indicate ice conditions.,0
+ 255,Data Missing,This code is used when data is exported and does not exist for a given time period.,0
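The quality-code table above ships as package data and pairs with the filter_qc_codes option that wiski.transform and dataManager expose below. A minimal sketch of how it could be used to screen WISKI records, assuming the observation frame carries a quality_code column (the packaged transform may use a different name):

from importlib.resources import files
import pandas as pd

# Load the packaged quality-code table (assumes it is installed as package data).
qc_file = files('mpcaHydro').joinpath('data/WISKI_QUALITY_CODES.csv')
with qc_file.open() as f:
    qc = pd.read_csv(f)

# Codes flagged Active = 1 are still in use; several numeric codes are reused between
# active and discontinued rows, so filter against the active set only.
active_codes = set(qc.loc[qc['Active'] == 1, 'quality_code'])

def screen_by_quality(df):
    # 'quality_code' as the observation column name is an assumption for this sketch.
    return df[df['quality_code'].isin(active_codes)]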
mpcahydro-2.2.1/src/mpcaHydro/data_manager.py
@@ -0,0 +1,292 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Fri Jun 3 10:01:14 2022
+
+ @author: mfratki
+ """
+
+ import pandas as pd
+ #from abc import abstractmethod
+ from pathlib import Path
+ from mpcaHydro import etlSWD
+ from mpcaHydro import equis, wiski, warehouse
+ from mpcaHydro import xref
+ from mpcaHydro import outlets
+ from mpcaHydro.reports import reportManager
+ import duckdb
+
+ AGG_DEFAULTS = {'cfs':'mean',
+                 'mg/l':'mean',
+                 'degf': 'mean',
+                 'lb':'sum'}
+
+ UNIT_DEFAULTS = {'Q': 'cfs',
+                  'QB': 'cfs',
+                  'TSS': 'mg/l',
+                  'TP' : 'mg/l',
+                  'OP' : 'mg/l',
+                  'TKN': 'mg/l',
+                  'N'  : 'mg/l',
+                  'WT' : 'degf',
+                  'WL' : 'ft'}
+
+
+ def validate_constituent(constituent):
+     assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
+ def validate_unit(unit):
+     assert unit in ['mg/l','lb','cfs','degF']
+
+
+ def build_warehouse(folderpath):
+     folderpath = Path(folderpath)
+     db_path = folderpath.joinpath('observations.duckdb').as_posix()
+     warehouse.init_db(db_path)
+
+ def constituent_summary(db_path):
+     with duckdb.connect(db_path) as con:
+         query = '''
+             SELECT
+                 station_id,
+                 station_origin,
+                 constituent,
+                 COUNT(*) AS sample_count,
+                 year(MIN(datetime)) AS start_date,
+                 year(MAX(datetime)) AS end_date
+             FROM
+                 observations
+             GROUP BY
+                 constituent, station_id, station_origin
+             ORDER BY
+                 sample_count;'''
+
+         res = con.execute(query)
+         return res.fetch_df()
+
+
+
+ class dataManager():
+
+     def __init__(self,folderpath, oracle_username = None, oracle_password = None, reset = False):
+
+         self.data = {}
+         self.folderpath = Path(folderpath)
+         self.db_path = self.folderpath.joinpath('observations.duckdb')
+         self.oracle_username = oracle_username
+         self.oracle_password = oracle_password
+
+         if not self.db_path.exists() or reset:
+             self._build_warehouse()
+
+         self.xref = xref #TODO: implement xref manager class
+         self.outlets = outlets #TODO: implement outlets manager class
+         self.reports = reportManager(self.db_path)
+
+
+     def connect_to_oracle(self):
+         assert self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin'
+         equis.connect(user = self.oracle_username, password = self.oracle_password)
+
+     def credentials_exist(self):
+         if (self.oracle_username is not None) & (self.oracle_password is not None):
+             return True
+         else:
+             return False
+
+     def _build_warehouse(self):
+         warehouse.init_db(self.db_path.as_posix(),True)
+
+     def _process_wiski_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
+         with warehouse.connect(self.db_path,read_only = False) as con:
+             df = con.execute("SELECT * FROM staging.wiski").df()
+             df_transformed = wiski.transform(df, filter_qc_codes, data_codes, baseflow_method)
+             warehouse.load_df_to_table(con,df_transformed, 'analytics.wiski')
+             warehouse.update_views(con)
+
+     def _process_equis_data(self):
+         with warehouse.connect(self.db_path,read_only = False) as con:
+             df = con.execute("SELECT * FROM staging.equis").df()
+             df_transformed = equis.transform(df)
+             warehouse.load_df_to_table(con,df_transformed, 'analytics.equis')
+             warehouse.update_views(con)
+
+     def _process_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
+         self._process_wiski_data(filter_qc_codes, data_codes, baseflow_method)
+         self._process_equis_data()
+
+     def _update_views(self):
+         with warehouse.connect(self.db_path,read_only = False) as con:
+             warehouse.update_views(con)
+
+     def _download_wiski_data(self,station_ids,start_year = 1996, end_year = 2030, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
+         with warehouse.connect(self.db_path,read_only = False) as con:
+             df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
+             if not df.empty:
+                 warehouse.load_df_to_table(con,df, 'staging.wiski')
+                 warehouse.load_df_to_table(con,wiski.transform(df, filter_qc_codes,data_codes,baseflow_method), 'analytics.wiski')
+                 warehouse.update_views(con)
+             else:
+                 print('No data necessary for HSPF calibration available from wiski for stations:',station_ids)
+
+     def _download_equis_data(self,station_ids):
+         if self.credentials_exist():
+             self.connect_to_oracle()
+             print('Connected to Oracle database.')
+             with warehouse.connect(self.db_path,read_only = False) as con:
+                 df = equis.download(station_ids)
+                 if not df.empty:
+                     warehouse.load_df_to_table(con,df, 'staging.equis')
+                     warehouse.load_df_to_table(con,equis.transform(df.copy()), 'analytics.equis')
+                     warehouse.update_views(con)
+                 else:
+                     print('No data necessary for HSPF calibration available from equis for stations:',station_ids)
+         else:
+             raise ValueError('Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
+
+
+     def _get_equis_template(self):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             query = '''
+                 SELECT *
+                 FROM staging.equis
+                 LIMIT 0'''
+             df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('equis_template.csv'), index=False)
+             return df
+
+     def _get_wiski_template(self):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             query = '''
+                 SELECT *
+                 FROM staging.wiski
+                 LIMIT 0'''
+             df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('wiski_template.csv'), index=False)
+             return df
+
+     def get_outlets(self,model_name):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             query = '''
+                 SELECT *
+                 FROM outlets.station_reach_pairs
+                 WHERE repository_name = ?
+                 ORDER BY outlet_id'''
+             df = con.execute(query,[model_name]).fetch_df()
+             return df
+
+     def get_station_ids(self,station_origin = None):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             if station_origin is None:
+                 query = '''
+                     SELECT DISTINCT station_id, station_origin
+                     FROM analytics.observations'''
+                 df = con.execute(query).fetch_df()
+             else:
+                 query = '''
+                     SELECT DISTINCT station_id
+                     FROM analytics.observations
+                     WHERE station_origin = ?'''
+                 df = con.execute(query,[station_origin]).fetch_df()
+
+             return df['station_id'].to_list()
+
+
+     def get_observation_data(self,station_ids,constituent,agg_period = None):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             query = '''
+                 SELECT *
+                 FROM analytics.observations
+                 WHERE station_id IN ? AND constituent = ?'''
+             df = con.execute(query,[station_ids,constituent]).fetch_df()
+
+         unit = UNIT_DEFAULTS[constituent]
+         agg_func = AGG_DEFAULTS[unit]
+
+         df.set_index('datetime',inplace=True)
+         df.attrs['unit'] = unit
+         df.attrs['constituent'] = constituent
+         if agg_period is not None:
+             df = df[['value']].resample(agg_period).agg(agg_func)
+             df.attrs['agg_period'] = agg_period
+
+         df.rename(columns={'value': 'observed'}, inplace=True)
+         return df.dropna(subset=['observed'])
+
+     def get_outlet_data(self,outlet_id,constituent,agg_period = 'D',to_csv = False):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             query = '''
+                 SELECT *
+                 FROM analytics.outlet_observations_with_flow
+                 WHERE outlet_id = ? AND constituent = ?'''
+             df = con.execute(query,[outlet_id,constituent]).fetch_df()
+
+         unit = UNIT_DEFAULTS[constituent]
+         agg_func = AGG_DEFAULTS[unit]
+
+         df.set_index('datetime',inplace=True)
+         df.attrs['unit'] = unit
+         df.attrs['constituent'] = constituent
+         if agg_period is not None:
+             df = df[['value','flow_value','baseflow_value']].resample(agg_period).agg(agg_func)
+             df.attrs['agg_period'] = agg_period
+
+         df.rename(columns={'value': 'observed',
+                            'flow_value': 'observed_flow',
+                            'baseflow_value': 'observed_baseflow'}, inplace=True)
+         return df.dropna(subset=['observed'])
+
+     def get_raw_data(self,station_id,station_origin, to_csv = False):
+         with duckdb.connect(self.db_path,read_only=True) as con:
+             if station_origin.lower() == 'equis':
+                 query = '''
+                     SELECT *
+                     FROM staging.equis_raw
+                     WHERE station_id = ?'''
+             elif station_origin.lower() == 'wiski':
+                 query = '''
+                     SELECT *
+                     FROM staging.wiski_raw
+                     WHERE station_id = ?'''
+             else:
+                 raise ValueError(f'Station origin {station_origin} not recognized. Valid options are equis or wiski.')
+
+             df = con.execute(query,[station_id]).fetch_df()
+
+         if to_csv:
+             df.to_csv(self.folderpath.joinpath(f'{station_id}_raw.csv'), index=False)
+         return df
+
+     def to_csv(self,station_id ,station_origin,folderpath = None):
+         if folderpath is None:
+             folderpath = self.folderpath
+         else:
+             folderpath = Path(folderpath)
+         df = self.get_observation_data([station_id],constituent = 'Q',agg_period = None)
+         if len(df) > 0:
+             df.to_csv(folderpath.joinpath(station_id + '.csv'))
+         else:
+             print(f'No calibration data available at station {station_id}')
+
+
+ # class database():
+ #     def __init__(self,db_path):
+ #         self.dbm = MonitoringDatabase(db_path)
+
+
+ #     def get_timeseries(self,station_ds, constituent,agg_period):
+ #         validate_constituent(constituent)
+ #         unit = UNIT_DEFAULTS[constituent]
+ #         agg_func = AGG_DEFAULTS[unit]
+ #         return odm.get_timeseries(station_id,constituent)
+
+
+ #     def get_samples(self,station_ds, constituent,agg_period):
+ #         validate_constituent(constituent)
+ #         unit = UNIT_DEFAULTS[constituent]
+ #         agg_func = AGG_DEFAULTS[unit]
+ #         return odm.get_sample(station_id,constituent)
+
+ #     def get_samples_and_timeseries(self,station_ds, constituent,agg_period)
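The dataManager class above is the new entry point for building and querying the observations warehouse. A minimal usage sketch against the methods added in this hunk (the folder path and station id are illustrative only):

from mpcaHydro.data_manager import dataManager

# Point the manager at a working folder; observations.duckdb is created there if missing.
dm = dataManager('C:/Users/mfratki/Documents/')

# Pull WISKI records into staging/analytics, then query them back as a daily series.
dm._download_wiski_data(['H05018001'], start_year=1996, end_year=2030)
print(dm.get_station_ids(station_origin='wiski'))
daily_q = dm.get_observation_data(['H05018001'], 'Q', agg_period='D')
print(daily_q.attrs['unit'], daily_q.head())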