mpcaHydro 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpcaHydro/data/WISKI_QUALITY_CODES.csv +71 -0
- mpcaHydro/data/outlets.duckdb +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data_manager.py +122 -287
- mpcaHydro/equis.py +31 -22
- mpcaHydro/outlets.py +371 -0
- mpcaHydro/reports.py +80 -0
- mpcaHydro/warehouse.py +389 -11
- mpcaHydro/warehouseManager.py +47 -0
- mpcaHydro/{WISKI.py → wiski.py} +40 -12
- mpcaHydro/xref.py +74 -0
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.0.dist-info}/METADATA +3 -1
- mpcahydro-2.2.0.dist-info/RECORD +23 -0
- mpcahydro-2.1.0.dist-info/RECORD +0 -15
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.0.dist-info}/WHEEL +0 -0
mpcaHydro/data/WISKI_QUALITY_CODES.csv
ADDED
@@ -0,0 +1,71 @@
+quality_code,Text,Description,Active
+0,Unchecked,Unchecked data in progress or data that is not quality coded as part of the workup. Default coding for shifts so the quality codes from Level and Ratings are used for discharges. ,1
+3,Instantaneous,Instantaneous groundwater measurements or sampled date for load stations.,1
+5,Excellent,Discharge measurements that are excellent.,1
+8,Reliable Interpolation,The value of the data point is an interpolation between adjacent points. Code used for filling gaps less than 4 hours or with no change in data trend likely based on reference information.,1
+10,Good,Time series data that tracks well and requires no corrections or corrections of very small magnitude or timeseries data that has been reviewed and accepted for precipitation and groundwater level. Also used for discharge measurements and rating points. ,1
+15,Fair,Time series data that tracks fairly well and requires some corrections of relatively small magnitude. Also used for discharge measurements and rating points. ,1
+20,Poor,Time series data that tracks poorly and requires significant or many corrections. Also used for discharge measurements and rating points. ,1
+27,Questionable,"Timeseries data or discharge measurements that are questionable due to operator error, equipment error, etc). Extra scrutiny should be used for these data. ",1
+28,Unknown data quality,"Unknown quality of time series data, ratings or discharge measurements.",1
+29,Modeled,"Time-series data, rating point or discharge from a reliable mathematical and\or computer model. ",1
+34,Estimated,"Time-series data estimated from reference traces, models or extrapolation of the rating curve using supporting data and up to two times the maximum measured discharge.",1
+35,Unreliable,Time-series data computed with a rating extrapolated without supporting data or beyond two times the maximum measured discharge without a model.,1
+36,Threshold Exceedance,"Time-series data may be beyond the measuring limits of the monitoring equipment, or outside the bounds of historical extremes.",1
+40,Default import code,WISKI default coding for gaugings. ,1
+45,Approved Ext Data,"External data that has been graded externally as ""Approved"".",1
+48,Unknown Ext Data,External data that has been graded internally as “Unknown”.,1
+49,Estimated Ext Data,External data that has been graded externally as “Estimated.” Typically this is finalized ice data.,1
+50,Provisional Ext Data,External data that has been graded internally or externally as “Provisional”.,1
+80,Ice - Estimated,Ice affected time series data. Discharge computed with ice affected stage data is considered estimated.,1
+199,199-Logger Unknown,Initial code for data coming to the system from the logger.,1
+200,200,Initial code for data coming to the system from telemetry or default coding for WISKI timeseries. ,1
+228,Info Parameter,This parameter is collected for informational purposes only. Data has been through a cursory check only. This is stored in the database and available upon request.,1
+255,---,System assigned code for gaps in the data set. Records with null values. ,1
+1,Continuous Data,~Discontinued~ Good TS data that requires no correction.,0
+2,Edited Data,~Discontinued~ TS data that has been edited. Typically used when spikes are removed or when points are edited manual for datum corrections.,0
+3,Instantaneous Data,Final WQ data.,0
+4,Questionable data,~Discontinued~,0
+5,Excellent measurment,Used to indicated discharge measurements that are excellent as well as excellent sections of the rating.,0
+10,Good measurement,Used to indicated discharge measurements and sections of the rating that are good and time series data that tracks well and requires no corrections or corrections of very small magnitude.,0
+12,Modeled measurement,~Discontinued~ Rating point or discharge was obtained from a relizble mathematical and/or computer model. After 3/1/11 use QC148.,0
+15,Fair measurement,Used to indicated discharge measurements and sections of the rating that are fair and time series data that tracks fairly well and requires some corrections of relatively small magnitude.,0
+20,Poor measurement,Used to indicated discharge measurements and sections of the rating that are poor and time series data that tracks poorly and requires significant or many corrections.,0
+25,Unknown measurement,Measurement data not available.,0
+27,Questionable data,"Flow measurement is very poor and should be given extra scrutiny or time series data that is questionable due to operator error, equipment error, etc.",0
+30,Good Archived Daily Value,This code is used for archived daily value data that is considered “Good”.,0
+31,Fair Archived Daily Value,This code is used for archived daily value data that is considered “Fair”.,0
+32,Poor Archived Daily Value,This code is used for archived daily value data that is considered “Poor”.,0
+33,Unknown Archived Daily Value,This code is used for archived daily value data that has unknown quality based on lack of documentation.,0
+34,Estimated Archived Daily Value,This code is used for archived daily value data that has been estimated.,0
+35,Unreliable Archived Daily Value,This code is used for archived daily value data that is unreliable based on the quality of the supporting time series data and/or rating.,0
+45,Good External Data,This code is used for external data that has been graded internally as “Good”.,0
+46,Fair External Data,This code is used for external data that has been graded internally as “Fair”.,0
+47,Poor External Data,This code is used for external data that has been graded internally as “Poor”.,0
+48,Unknown External Data,This code is used for external data that has been graded internally as “Unknown”,0
+49,Estimated External Data,This code is used for external data that has been graded externally as “Estimated.” Typically this is finalized ice data.,0
+50,Provisional External Data,This code is used for external data that has been graded internally as “Provisional”,0
+51,Telemetry data - DCP,This code is used for time-series data when imported into hydstra using an automated telemetry method that accesses a DCP through the GOES network. The “questionable measurement” flag is set through the shef code that accompanies the DCP data.,0
+60,Above rating,~Discontinued~,0
+70,Estimated Data,Value of the data point is estimated.,0
+76,Reliable interpolation,Value of the data point is an interpolation between adjacent points. ,0
+80,Ice,"(DISCONTINUED) Used to indicate ice conditions when the data should not be exported. Use in conjunction with 80 to code 232.00 values, run USDAY to compute daily flow, then recode 232.00 80 values to 180 so unit value export cannot occur.",0
+82,Linear interpolation across a gap in records,~Discontinued~ Points that were added to fill a gap in the data record. The points fall on a straight line between the end points of the gap. This code was changed to 8 in WISKI.,0
+103,Provisional Instantaneous Data,Provisional WQ data.,0
+130,Good Provisional Daily Value,This code is used for archived daily value data that is considered “Good” but Provisional because there is only one year of gaging measurements.,0
+131,Fair Provisional Daily Value,This code is used for archived daily value data that is considered “Fair” but Provisional because there is only one year of gaging measurements.,0
+132,Poor Provisional Daily Value,This code is used for archived daily value data that is considered “Poor” but Provisional because there is only one year of gaging measurements.,0
+133,Unknown Provisional Archived Daily Value,This code is used for archived daily value data that has unknown quality based on lack of documentation but Provisional because there is only one year of gaging measurements.,0
+134,Estimated Provisional Archived Daily Value,This code is used for archived daily value data that has been estimated but Provisional because there is only one year of gaging measurements.,0
+135,Unreliable Provisional Archived Daily Value,This code is used for archived daily value data that is unreliable based on the quality of the supporting time series data and/or rating but Provisional because there is only one year of gaging measurements.,0
+140,Data not yet checked,This code is used for time-series data when it is initially imported into hydstra using manual import methods. ,0
+141,Telemetry data - not yet checked,This code is used for time-series data when it is imported into hydstra using an automated telemetry method.,0
+148,Modeled measurement,Rating point or discharge was obtained from a reliable mathematical and/or computer model.,0
+149,Extrapolated rating point,Rating point accurately extrapolated using supporting data and is less than two times the maxiumum measured discharge.,0
+150,Over-extrapolated rating point,Rating point extrapolated without supporting data or beyone two times the maximum measured discharge without a mathematical model.,0
+151,Data Missing,"This code is used to flag the end of a period of missing time-series data, before the next good data value.",0
+160,Above rating,~Discontinued~,0
+169,Datalogger Hardware Error Code 6999,"This code is used to indicate that a time-series point had a value of 6999 or -6999, a typical hardware error code, and the value was changed.",0
+170,Estimated Data,"Used to indicate estimated data when the data should not be exported. Often used in conjunction with 70 to code 232.00 values, run USDAY to compute daily flow, then recode 232.00 70 values to 170 so unit value export can not occur.",0
+180,Ice,Used to indicate ice conditions.,0
+255,Data Missing,This code is used when data is exported and does not exist for a given time period.,0
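The new CSV gives downstream tools a lookup for WISKI quality codes (code, label, description, and an Active flag). As a minimal sketch only, not part of the package API, the table could be read straight out of the installed wheel with pandas and filtered to the codes still in use:

    from pathlib import Path
    import pandas as pd
    import mpcaHydro

    # Path taken from the file list above; assumes the 2.2.0 wheel is installed.
    codes_path = Path(mpcaHydro.__file__).parent / 'data' / 'WISKI_QUALITY_CODES.csv'
    codes = pd.read_csv(codes_path)

    # Keep only codes currently marked active (Active == 1).
    active = codes.loc[codes['Active'] == 1, ['quality_code', 'Text', 'Description']]
    print(active.head())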
mpcaHydro/data/outlets.duckdb
Binary file
mpcaHydro/data/stations_EQUIS.gpkg
Binary file
mpcaHydro/data/stations_wiski.gpkg
Binary file
mpcaHydro/data_manager.py
CHANGED
@@ -10,15 +10,14 @@ import pandas as pd
 from pathlib import Path
 from mpcaHydro import etlSWD
 from mpcaHydro import equis, wiski, warehouse
+from mpcaHydro import xref
+from mpcaHydro import outlets
+from mpcaHydro.reports import reportManager
 import duckdb
 
-
-WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent/'data/WISKI_EQUIS_XREF.csv')
-#WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
-
 AGG_DEFAULTS = {'cfs':'mean',
                 'mg/l':'mean',
-                '
+                'degf': 'mean',
                 'lb':'sum'}
 
 UNIT_DEFAULTS = {'Q': 'cfs',
@@ -28,29 +27,15 @@ UNIT_DEFAULTS = {'Q': 'cfs',
                  'OP' : 'mg/l',
                  'TKN': 'mg/l',
                  'N' : 'mg/l',
-                 'WT' : '
+                 'WT' : 'degf',
                  'WL' : 'ft'}
 
-def are_lists_identical(nested_list):
-    # Sort each sublist
-    sorted_sublists = [sorted(sublist) for sublist in nested_list]
-    # Compare all sublists to the first one
-    return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
 
-def
-
-
-
-
-    datafiles = folderpath.joinpath('*.csv').as_posix()
-    query = '''
-            CREATE TABLE observations AS SELECT *
-            FROM
-            read_csv_auto(?,
-            union_by_name = true);
-
-            '''
-    con.execute(query,[datafiles])
+def validate_constituent(constituent):
+    assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
+
+def validate_unit(unit):
+    assert(unit in ['mg/l','lb','cfs','degF'])
 
 
 def build_warehouse(folderpath):
@@ -86,8 +71,14 @@ class dataManager():
         self.data = {}
         self.folderpath = Path(folderpath)
         self.db_path = self.folderpath.joinpath('observations.duckdb')
+
         self.oracle_user = oracle_user
         self.oracle_password = oracle_password
+        warehouse.init_db(self.db_path,reset = False)
+        self.xref = xref
+        self.outlets = outlets
+        self.reports = reportManager(self.db_path)
+
 
     def connect_to_oracle(self):
         assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
@@ -99,295 +90,139 @@
         else:
             return False
 
-    def _reconstruct_database(self):
-        construct_database(self.folderpath)
-
     def _build_warehouse(self):
         build_warehouse(self.folderpath)
-
-    def constituent_summary(self,constituents = None):
-        with duckdb.connect(self.db_path) as con:
-            if constituents is None:
-                constituents = con.query('''
-                    SELECT DISTINCT
-                        constituent
-                    FROM observations''').to_df()['constituent'].to_list()
-
-            query = '''
-                SELECT
-                    station_id,
-                    station_origin,
-                    constituent,
-                    COUNT(*) AS sample_count,
-                    year(MIN(datetime)) AS start_date,
-                    year(MAX(datetime)) AS end_date
-                FROM
-                    observations
-                WHERE
-                    constituent in (SELECT UNNEST(?))
-                GROUP BY
-                    constituent,station_id,station_origin
-                ORDER BY
-                    constituent,sample_count;'''
-
-            df = con.execute(query,[constituents]).fetch_df()
-        return df
 
-    def
-
-
-
-
+    def download_station_data(self,station_id,station_origin,overwrite=True,to_csv = False,filter_qc_codes = True, start_year = 1996, end_year = 2030,baseflow_method = 'Boughton'):
+        '''
+        Method to download data for a specific station and load it into the warehouse.
+
+        :param self: Description
+        :param station_id: Station identifier
+        :param station_origin: source of station data: wiski, equis, or swd
+        :param overwrite: Whether to overwrite existing data
+        :param to_csv: Whether to export data to CSV
+        :param filter_qc_codes: Whether to filter quality control codes
+        :param start_year: Start year for data download
+        :param end_year: End year for data download
+        :param baseflow_method: Method for baseflow calculation
+        '''
+        with duckdb.connect(self.db_path,read_only=False) as con:
+            if overwrite:
+                warehouse.drop_station_id(con,station_id,station_origin)
+                warehouse.update_views(con)
+
+            if station_origin == 'wiski':
+                df = wiski.download([station_id],start_year = start_year, end_year = end_year)
+                warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = overwrite)
+                warehouse.load_df_to_analytics(con,wiski.transform(df,filter_qc_codes = filter_qc_codes,baseflow_method = baseflow_method),'wiski') # method includes normalization
+
+            elif station_origin == 'equis':
+                assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
+                df = equis.download([station_id])
+                warehouse.load_df_to_staging(con,df, 'equis_raw',replace = overwrite)
+                warehouse.load_df_to_analytics(con,equis.transform(df),'equis')
+
+            elif station_origin == 'swd':
+                df = etlSWD.download(station_id)
+                warehouse.load_df_to_staging(con,df, 'swd_raw', replace = overwrite)
+                warehouse.load_df_to_analytics(con,etlSWD.transform(df),'swd')
+            else:
+                raise ValueError('station_origin must be wiski, equis, or swd')
 
-
-
-        equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
-        if len(equis_ids) == 0:
-            return []
-        elif len(equis_ids) > 1:
-            print(f'Too Many Equis Stations for {wiski_station_id}')
-            raise
-        else:
-            return equis_ids[0]
+        with duckdb.connect(self.db_path,read_only=False) as con:
+            warehouse.update_views(con)
 
-
-
-        equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
-        if len(equis_ids) == 0:
-            return []
-        else:
-            return equis_ids
-
-    def equis_wiski_associations(self,equis_station_id):
-        wiski_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id,'WISKI_STATION_NO'].unique())
-        wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
-        if len(wiski_ids) == 0:
-            return []
-        else:
-            return wiski_ids
-
-    def equis_wiski_alias(self,equis_station_id):
-        wiski_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,'WISKI_STATION_NO'].to_list()))
-        wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
-        if len(wiski_ids) == 0:
-            return []
-        elif len(wiski_ids) > 1:
-            print(f'Too Many WISKI Stations for {equis_station_id}')
-            raise
-        else:
-            return wiski_ids[0]
-
-    def _equis_wiski_associations(self,equis_station_ids):
-        wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
-        if are_lists_identical(wiski_stations):
-            return wiski_stations[0]
-        else:
-            return []
-
-    def _stations_by_wid(self,wid_no,station_origin):
-        if station_origin in ['wiski','wplmn']:
-            station_col = 'WISKI_STATION_NO'
-        elif station_origin in ['equis','swd']:
-            station_col = 'EQUIS_STATION_ID'
-        else:
-            raise
+        if to_csv:
+            self.to_csv(station_id)
 
-        return
-
+        return df
+
+    def get_outlets(self):
+        with duckdb.connect(self.db_path,read_only=True) as con:
+            query = '''
+                SELECT *
+                FROM outlets.station_reach_pairs
+                ORDER BY outlet_id'''
+            df = con.execute(query).fetch_df()
+        return df
+    def get_station_ids(self,station_origin = None):
+        with duckdb.connect(self.db_path,read_only=True) as con:
+            if station_origin is None:
+                query = '''
+                    SELECT DISTINCT station_id, station_origin
+                    FROM analytics.observations'''
+                df = con.execute(query).fetch_df()
+            else:
+                query = '''
+                    SELECT DISTINCT station_id
+                    FROM analytics.observations
+                    WHERE station_origin = ?'''
+                df = con.execute(query,[station_origin]).fetch_df()
+
+        return df['station_id'].to_list()
 
-    def download_stations_by_wid(self, wid_no,station_origin, folderpath = None, overwrite = False):
-
-        station_ids = self._station_by_wid(wid_no,station_origin)
-
-        if not station_ids.empty:
-            for _, row in station_ids.iterrows():
-                self.download_station_data(row['station_id'],station_origin, folderpath, overwrite)
-
-    def _download_station_data(self,station_id,station_origin,overwrite=False):
-        assert(station_origin in ['wiski','equis','swd','wplmn'])
-        if station_origin == 'wiski':
-            self.download_station_data(station_id,'wiski',overwrite = overwrite)
-        elif station_origin == 'wplmn':
-            self.download_station_data(station_id,'wplmn',overwrite = overwrite)
-        elif station_origin == 'swd':
-            self.download_station_data(station_id,'swd',overwrite = overwrite)
-        else:
-            self.download_station_data(station_id,'equis',overwrite = overwrite)
-
-
-
 
-    def
-        assert(station_origin in ['wiski','equis','swd','wplmn'])
-        station_id = str(station_id)
-        save_name = station_id
-        if station_origin == 'wplmn':
-            save_name = station_id + '_wplmn'
-
-        if folderpath is None:
-            folderpath = self.folderpath
-        else:
-            folderpath = Path(folderpath)
-
-
-        if (folderpath.joinpath(save_name + '.csv').exists()) & (not overwrite):
-            print (f'{station_id} data already downloaded')
-            return
-
-        if station_origin == 'wiski':
-            data = wiski.transform(wiski.download([station_id],wplmn=False, baseflow_method = baseflow_method))
-        elif station_origin == 'swd':
-            data = etlSWD.download(station_id)
-        elif station_origin == 'equis':
-            assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
-            data = equis.transform(equis.download([station_id]))
-        else:
-            data = wiski.transform(wiski.download([station_id],wplmn=True, baseflow_method = baseflow_method))
+    def get_station_data(self,station_ids,constituent,agg_period = None):
 
 
-
-
-        if len(data) > 0:
-            data.to_csv(folderpath.joinpath(save_name + '.csv'))
-            self.data[station_id] = data
-        else:
-            print(f'No {station_origin} calibration cata available at Station {station_id}')
-
-    def _load(self,station_id):
-        with duckdb.connect(self.db_path) as con:
+        with duckdb.connect(self.db_path,read_only=True) as con:
             query = '''
                 SELECT *
                 FROM analytics.observations
-                WHERE station_id = ?'''
-            df = con.execute(query,[
-            df.set_index('datetime',inplace=True)
-        self.data[station_id] = df
-        return df
-
-    def _load2(self,station_id):
-        df = pd.read_csv(self.folderpath.joinpath(station_id + '.csv'),
-                         index_col='datetime',
-                         parse_dates=['datetime'],
-                         #usecols=['Ts Date','Station number','variable', 'value','reach_id'],
-                         dtype={'station_id': str, 'value': float, 'variable': str,'constituent':str,'unit':str})
-        self.data[station_id] = df
-        return df
-
-    def load(self,station_id):
-        try:
-            df = self.data[station_id]
-        except:
-            df = self._load(station_id)
-        return df
-
-    def info(self,constituent):
-        return pd.concat([self._load(file.stem) for file in self.folderpath.iterdir() if file.suffix == '.csv'])[['station_id','constituent','value']].groupby(by = ['station_id','constituent']).count()
-
-    def get_wplmn_data(self,station_id,constituent,unit = 'mg/l', agg_period = 'YE', samples_only = True):
-
-        assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
-        station_id = station_id + '_wplmn'
-        dfsub = self._load(station_id)
+                WHERE station_id IN ? AND constituent = ?'''
+            df = con.execute(query,[station_ids,constituent]).fetch_df()
 
-
-
-        agg_func = 'mean'
-
-        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
-                          (dfsub['unit'] == unit),
-                          ['value','station_origin']]
+        unit = UNIT_DEFAULTS[constituent]
+        agg_func = AGG_DEFAULTS[unit]
 
-
-        df = dfsub[['value']].resample(agg_period).agg(agg_func)
-
-        if df.empty:
-            dfsub = df
-        else:
-
-            df['station_origin'] = dfsub['station_origin'].iloc[0]
-
-        #if (constituent == 'TSS') & (unit == 'lb'): #convert TSS from lbs to us tons
-        #    dfsub['value'] = dfsub['value']/2000
-
-        #dfsub = dfsub.resample('H').mean().dropna()
-
+        df.set_index('datetime',inplace=True)
         df.attrs['unit'] = unit
         df.attrs['constituent'] = constituent
-
-
-
-        return self._get_data([station_id],constituent,agg_period)
-
-    def _get_data(self,station_ids,constituent,agg_period = 'D',tz_offset = '-6'):
-        '''
-
-        Returns the processed observational data associated with the calibration specific id.
-
-
-        Parameters
-        ----------
-        station_id : STR
-            Station ID as a string
-        constituent : TYPE
-            Constituent abbreviation used for calibration. Valid options:
-                'Q',
-                'TSS',
-                'TP',
-                'OP',
-                'TKN',
-                'N',
-                'WT',
-                'DO',
-                'WL']
-        unit : TYPE, optional
-            Units of data. The default is 'mg/l'.
-        sample_flag : TYPE, optional
-            For WPLMN data this flag determines modeled loads are returned. The default is False.
+        if agg_period is not None:
+            df = df[['value']].resample(agg_period).agg(agg_func)
+            df.attrs['agg_period'] = agg_period
 
-
-
-
-
+        df.rename(columns={'value': 'observed'}, inplace=True)
+        return df
+
+    def get_outlet_data(self,outlet_id,constituent,agg_period = 'D'):
+        with duckdb.connect(self.db_path,read_only=True) as con:
+            query = '''
+                SELECT *
+                FROM analytics.outlet_observations_with_flow
+                WHERE outlet_id = ? AND constituent = ?'''
+            df = con.execute(query,[outlet_id,constituent]).fetch_df()
 
-        '''
-
-        assert constituent in ['Q','QB','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
-
         unit = UNIT_DEFAULTS[constituent]
         agg_func = AGG_DEFAULTS[unit]
-
-
-        dfsub.index = dfsub.index.tz_localize(None) # Drop timezone info
-        #dfsub.set_index('datetime',drop=True,inplace=True)
-        dfsub.rename(columns={'source':'station_origin'},inplace=True)
-        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
-                          (dfsub['unit'] == unit),
-                          ['value','station_origin']]
-
-        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+
+        df.set_index('datetime',inplace=True)
         df.attrs['unit'] = unit
         df.attrs['constituent'] = constituent
-
-
-
-            return df
-        else:
-
-            df['station_origin'] = dfsub['station_origin'].iloc[0]
+        if agg_period is not None:
+            df = df[['value','flow_value','baseflow_value']].resample(agg_period).agg(agg_func)
+            df.attrs['agg_period'] = agg_period
 
+        df.rename(columns={'value': 'observed',
+                           'flow_value': 'observed_flow',
+                           'baseflow_value': 'observed_baseflow'}, inplace=True)
+        return df
 
-        # convert to desired timzone before stripping timezone information.
-        #df.index.tz_convert('UTC-06:00').tz_localize(None)
 
-        return df['value'].to_frame().dropna()
-
-
-def validate_constituent(constituent):
-    assert constituent in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']
-
-def validate_unit(unit):
-    assert(unit in ['mg/l','lb','cfs','degF'])
 
+    def to_csv(self,station_id,folderpath = None):
+        if folderpath is None:
+            folderpath = self.folderpath
+        else:
+            folderpath = Path(folderpath)
+        df = self._load(station_id)
+        if len(df) > 0:
+            df.to_csv(folderpath.joinpath(station_id + '.csv'))
+        else:
+            print(f'No {station_id} calibration data available at Station {station_id}')
+
+        df.to_csv(folderpath.joinpath(station_id + '.csv'))
 
 
 # class database():
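Taken together, the data_manager changes replace the CSV/xref helpers with a DuckDB-backed workflow: download_station_data stages raw data and loads it into the analytics schema, while get_station_data and get_outlet_data read it back with default units and aggregation from UNIT_DEFAULTS and AGG_DEFAULTS. The following is only a rough usage sketch based on the signatures visible in this diff; the constructor arguments are not fully shown here, and the project folder and station/outlet identifiers are placeholders:

    from mpcaHydro.data_manager import dataManager

    # Placeholder folder and credentials; __init__ is only partially visible above.
    dm = dataManager('path/to/project', oracle_user=None, oracle_password=None)

    # Pull one WISKI station into the warehouse (staging + analytics).
    dm.download_station_data('EXAMPLE_STATION', 'wiski', overwrite=True, filter_qc_codes=True)

    # Daily flow for a list of stations; unit and aggregation come from the module defaults.
    flow = dm.get_station_data(['EXAMPLE_STATION'], 'Q', agg_period='D')

    # Observed value, flow, and baseflow for an outlet.
    outlet = dm.get_outlet_data(1, 'Q', agg_period='D')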
mpcaHydro/equis.py
CHANGED
@@ -164,26 +164,25 @@ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset:
     aware_src = naive.replace(tzinfo=src_tz)
 
     # convert the instant to fixed UTC-6
-    return aware_src.astimezone(target_offset)
+    return aware_src.astimezone(target_offset).tz_localize(None)
 
 
 def normalize_columns(df):
     '''Select relevant columns from Equis data.'''
     return df[['SYS_LOC_CODE',
+               'constituent',
+               'CAS_RN',
               'datetime',
               'RESULT_NUMERIC',
              'RESULT_UNIT',
-               'constituent'
              ]].rename(columns={
                  'SYS_LOC_CODE':'station_id',
                  'RESULT_NUMERIC':'value',
-                  'RESULT_UNIT':'unit'
+                  'RESULT_UNIT':'unit',
+                  'CAS_RN':'cas_rn'
                  })
 
-
-    '''Replace non-detect results with 0 in Equis data.'''
-    df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
-    return df
+
 
 def normalize_timezone(df):
     '''Normalize datetime to UTC in Equis data.'''
@@ -194,27 +193,27 @@ def normalize_timezone(df):
         except Exception:
             return pd.NaT
 
-    df['datetime'] = df.apply(_conv, axis=1)
+    df.loc[:,'datetime'] = df.apply(_conv, axis=1)
     return df
 
 def convert_units(df):
     '''Convert units in Equis data to standard units.'''
     # Convert ug/L to mg/L
-    df['
+    df['unit'] = df['unit'].str.lower()
 
-    mask_ugL = df['
-    df.loc[mask_ugL, '
-    df.loc[mask_ugL, '
+    mask_ugL = df['unit'] == 'ug/l'
+    df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
+    df.loc[mask_ugL, 'unit'] = 'mg/l'
 
     # Convert mg/g to mg/L (assuming density of 1 g/mL)
-    mask_mgg = df['
-    df.loc[mask_mgg, '
-    df.loc[mask_mgg, '
+    mask_mgg = df['unit'] == 'mg/g'
+    df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
+    df.loc[mask_mgg, 'unit'] = 'mg/l'
 
     # Convert deg C to degF
-    mask_degC = df['
-    df.loc[mask_degC, '
-    df.loc[mask_degC, '
+    mask_degC = df['unit'].isin(['deg c', 'degc'])
+    df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9/5) + 32
+    df.loc[mask_degC, 'unit'] = 'degf'
 
     return df
 
@@ -232,15 +231,25 @@ def average_results(df):
         value=('value', 'mean')
     ).reset_index()
 
+def replace_nondetects(df):
+    '''Replace non-detect results with 0 in Equis data.'''
+    df.loc[df['value'].isna(), 'value'] = 0
+    return df
+
+def normalize(df):
+    '''Normalize Equis data: select relevant columns.'''
+    df = map_constituents(df)
+    df = normalize_timezone(df)
+    df = normalize_columns(df)
+    df = convert_units(df)
+    return df
+
 def transform(df):
     '''Transform Equis data: handle non-detects, convert units, map constituents.'''
 
+    df = normalize(df)
     df = replace_nondetects(df)
     if not df.empty:
-        df = normalize_timezone(df)
-        df = convert_units(df)
-        df = map_constituents(df)
-        df = normalize_columns(df)
         df = average_results(df)
     return df
 
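The equis transform pipeline is now split into a normalize step (map_constituents, normalize_timezone, normalize_columns, convert_units) followed by non-detect handling and averaging. The unit logic added to convert_units can be checked in isolation; this standalone sketch repeats the same three conversions on a toy frame whose value/unit columns match the renames done in normalize_columns:

    import pandas as pd

    def convert_units(df):
        '''Lower-case units, then convert ug/L, mg/g, and deg C to the standard units.'''
        df['unit'] = df['unit'].str.lower()

        mask_ugL = df['unit'] == 'ug/l'                  # ug/L -> mg/L
        df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
        df.loc[mask_ugL, 'unit'] = 'mg/l'

        mask_mgg = df['unit'] == 'mg/g'                  # mg/g -> mg/L (assumes 1 g/mL density)
        df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
        df.loc[mask_mgg, 'unit'] = 'mg/l'

        mask_degC = df['unit'].isin(['deg c', 'degc'])   # deg C -> deg F
        df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9/5) + 32
        df.loc[mask_degC, 'unit'] = 'degf'
        return df

    demo = pd.DataFrame({'value': [250.0, 2.5, 20.0], 'unit': ['ug/L', 'mg/g', 'deg C']})
    print(convert_units(demo))   # -> 0.25 mg/l, 2500.0 mg/l, 68.0 degf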