mpcaHydro 2.2.0__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpcaHydro/data/{outlets.duckdb → outlet.duckdb} +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg-shm +0 -0
- mpcaHydro/data/stations_wiski.gpkg-wal +0 -0
- mpcaHydro/data_manager.py +104 -61
- mpcaHydro/etlSWD.py +21 -15
- mpcaHydro/outlets.py +70 -74
- mpcaHydro/reports.py +1 -1
- mpcaHydro/warehouse.py +276 -146
- mpcaHydro/warehouseManager.py +8 -0
- mpcaHydro/wiski.py +57 -5
- {mpcahydro-2.2.0.dist-info → mpcahydro-2.2.2.dist-info}/METADATA +1 -1
- mpcahydro-2.2.2.dist-info/RECORD +25 -0
- mpcahydro-2.2.0.dist-info/RECORD +0 -23
- {mpcahydro-2.2.0.dist-info → mpcahydro-2.2.2.dist-info}/WHEEL +0 -0
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
mpcaHydro/data_manager.py
CHANGED
|
@@ -5,7 +5,6 @@ Created on Fri Jun 3 10:01:14 2022
|
|
|
5
5
|
@author: mfratki
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
import pandas as pd
|
|
9
8
|
#from abc import abstractmethod
|
|
10
9
|
from pathlib import Path
|
|
11
10
|
from mpcaHydro import etlSWD
|
|
@@ -64,88 +63,115 @@ def constituent_summary(db_path):
|
|
|
64
63
|
return res.fetch_df()
|
|
65
64
|
|
|
66
65
|
|
|
66
|
+
|
|
67
|
+
|
|
67
68
|
class dataManager():
|
|
68
69
|
|
|
69
|
-
def __init__(self,folderpath,
|
|
70
|
+
def __init__(self,folderpath, oracle_username = None, oracle_password =None, reset = False):
|
|
70
71
|
|
|
71
72
|
self.data = {}
|
|
72
73
|
self.folderpath = Path(folderpath)
|
|
73
74
|
self.db_path = self.folderpath.joinpath('observations.duckdb')
|
|
74
|
-
|
|
75
|
-
self.oracle_user = oracle_user
|
|
75
|
+
self.oracle_username = oracle_username
|
|
76
76
|
self.oracle_password = oracle_password
|
|
77
|
-
|
|
78
|
-
self.
|
|
79
|
-
|
|
77
|
+
|
|
78
|
+
if not self.db_path.exists() or reset:
|
|
79
|
+
self._build_warehouse()
|
|
80
|
+
|
|
81
|
+
self.xref = xref #TODO: implement xref manager class
|
|
82
|
+
self.outlets = outlets #TODO: implement outlets manager class
|
|
80
83
|
self.reports = reportManager(self.db_path)
|
|
81
84
|
|
|
82
85
|
|
|
83
86
|
def connect_to_oracle(self):
|
|
84
87
|
assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
|
|
85
|
-
equis.connect(user = self.
|
|
88
|
+
equis.connect(user = self.oracle_username, password = self.oracle_password)
|
|
86
89
|
|
|
87
90
|
def credentials_exist(self):
|
|
88
|
-
if (self.
|
|
91
|
+
if (self.oracle_username is not None) & (self.oracle_password is not None):
|
|
89
92
|
return True
|
|
90
93
|
else:
|
|
91
94
|
return False
|
|
92
95
|
|
|
93
96
|
def _build_warehouse(self):
|
|
94
|
-
|
|
97
|
+
warehouse.init_db(self.db_path.as_posix(),True)
|
|
95
98
|
|
|
96
|
-
def
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
:param station_origin: source of station data: wiski, equis, or swd
|
|
103
|
-
:param overwrite: Whether to overwrite existing data
|
|
104
|
-
:param to_csv: Whether to export data to CSV
|
|
105
|
-
:param filter_qc_codes: Whether to filter quality control codes
|
|
106
|
-
:param start_year: Start year for data download
|
|
107
|
-
:param end_year: End year for data download
|
|
108
|
-
:param baseflow_method: Method for baseflow calculation
|
|
109
|
-
'''
|
|
110
|
-
with duckdb.connect(self.db_path,read_only=False) as con:
|
|
111
|
-
if overwrite:
|
|
112
|
-
warehouse.drop_station_id(con,station_id,station_origin)
|
|
113
|
-
warehouse.update_views(con)
|
|
99
|
+
def _process_wiski_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
|
|
100
|
+
with warehouse.connect(self.db_path,read_only = False) as con:
|
|
101
|
+
df = con.execute("SELECT * FROM staging.wiski").df()
|
|
102
|
+
df_transformed = wiski.transform(df, filter_qc_codes, data_codes, baseflow_method)
|
|
103
|
+
warehouse.load_df_to_table(con,df_transformed, 'analytics.wiski')
|
|
104
|
+
warehouse.update_views(con)
|
|
114
105
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
elif station_origin == 'equis':
|
|
121
|
-
assert (self.credentials_exist(), 'Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
|
|
122
|
-
df = equis.download([station_id])
|
|
123
|
-
warehouse.load_df_to_staging(con,df, 'equis_raw',replace = overwrite)
|
|
124
|
-
warehouse.load_df_to_analytics(con,equis.transform(df),'equis')
|
|
125
|
-
|
|
126
|
-
elif station_origin == 'swd':
|
|
127
|
-
df = etlSWD.download(station_id)
|
|
128
|
-
warehouse.load_df_to_staging(con,df, 'swd_raw', replace = overwrite)
|
|
129
|
-
warehouse.load_df_to_analytics(con,etlSWD.transform(df),'swd')
|
|
130
|
-
else:
|
|
131
|
-
raise ValueError('station_origin must be wiski, equis, or swd')
|
|
132
|
-
|
|
133
|
-
with duckdb.connect(self.db_path,read_only=False) as con:
|
|
106
|
+
def _process_equis_data(self):
|
|
107
|
+
with warehouse.connect(self.db_path,read_only = False) as con:
|
|
108
|
+
df = con.execute("SELECT * FROM staging.equis").df()
|
|
109
|
+
df_transformed = equis.transform(df)
|
|
110
|
+
warehouse.load_df_to_table(con,df_transformed, 'analytics.equis')
|
|
134
111
|
warehouse.update_views(con)
|
|
135
112
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
113
|
+
def _process_data(self,filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
|
|
114
|
+
self._process_wiski_data(filter_qc_codes, data_codes, baseflow_method)
|
|
115
|
+
self._process_equis_data()
|
|
116
|
+
|
|
117
|
+
def _update_views(self):
|
|
118
|
+
with warehouse.connect(self.db_path,read_only = False) as con:
|
|
119
|
+
warehouse.update_views(con)
|
|
120
|
+
|
|
121
|
+
def _download_wiski_data(self,station_ids,start_year = 1996, end_year = 2030, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
|
|
122
|
+
with warehouse.connect(self.db_path,read_only = False) as con:
|
|
123
|
+
df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
|
|
124
|
+
if not df.empty:
|
|
125
|
+
warehouse.load_df_to_table(con,df, 'staging.wiski')
|
|
126
|
+
warehouse.load_df_to_table(con,wiski.transform(df, filter_qc_codes,data_codes,baseflow_method), 'analytics.wiski')
|
|
127
|
+
warehouse.update_views(con)
|
|
128
|
+
else:
|
|
129
|
+
print('No data neccesary for HSPF calibration available from wiski for stations:',station_ids)
|
|
130
|
+
|
|
131
|
+
def _download_equis_data(self,station_ids):
|
|
132
|
+
if self.credentials_exist():
|
|
133
|
+
self.connect_to_oracle()
|
|
134
|
+
print('Connected to Oracle database.')
|
|
135
|
+
with warehouse.connect(self.db_path,read_only = False) as con:
|
|
136
|
+
df = equis.download(station_ids)
|
|
137
|
+
if not df.empty:
|
|
138
|
+
warehouse.load_df_to_table(con,df, 'staging.equis')
|
|
139
|
+
warehouse.load_df_to_table(con,equis.transform(df.copy()), 'analytics.equis')
|
|
140
|
+
warehouse.update_views(con)
|
|
141
|
+
else:
|
|
142
|
+
print('No data neccesary for HSPF calibration available from equis for stations:',station_ids)
|
|
143
|
+
else:
|
|
144
|
+
raise ValueError('Oracle credentials not found. Set ORACLE_USER and ORACLE_PASSWORD environment variables or use swd as station_origin')
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _get_equis_template(self):
|
|
148
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
149
|
+
query = '''
|
|
150
|
+
SELECT *
|
|
151
|
+
FROM staging.equis
|
|
152
|
+
LIMIT 0'''
|
|
153
|
+
df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('equis_template.csv'), index=False)
|
|
139
154
|
return df
|
|
140
155
|
|
|
141
|
-
def
|
|
156
|
+
def _get_wiski_template(self):
|
|
157
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
158
|
+
query = '''
|
|
159
|
+
SELECT *
|
|
160
|
+
FROM staging.wiski
|
|
161
|
+
LIMIT 0'''
|
|
162
|
+
df = con.execute(query).fetch_df().to_csv(self.folderpath.joinpath('wiski_template.csv'), index=False)
|
|
163
|
+
return df
|
|
164
|
+
|
|
165
|
+
def get_outlets(self,model_name):
|
|
142
166
|
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
143
167
|
query = '''
|
|
144
168
|
SELECT *
|
|
145
169
|
FROM outlets.station_reach_pairs
|
|
170
|
+
WHERE repository_name = ?
|
|
146
171
|
ORDER BY outlet_id'''
|
|
147
|
-
df = con.execute(query).fetch_df()
|
|
172
|
+
df = con.execute(query,[model_name]).fetch_df()
|
|
148
173
|
return df
|
|
174
|
+
|
|
149
175
|
def get_station_ids(self,station_origin = None):
|
|
150
176
|
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
151
177
|
if station_origin is None:
|
|
@@ -163,9 +189,7 @@ class dataManager():
|
|
|
163
189
|
return df['station_id'].to_list()
|
|
164
190
|
|
|
165
191
|
|
|
166
|
-
def
|
|
167
|
-
|
|
168
|
-
|
|
192
|
+
def get_observation_data(self,station_ids,constituent,agg_period = None):
|
|
169
193
|
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
170
194
|
query = '''
|
|
171
195
|
SELECT *
|
|
@@ -184,9 +208,9 @@ class dataManager():
|
|
|
184
208
|
df.attrs['agg_period'] = agg_period
|
|
185
209
|
|
|
186
210
|
df.rename(columns={'value': 'observed'}, inplace=True)
|
|
187
|
-
return df
|
|
211
|
+
return df.dropna(subset=['observed'])
|
|
188
212
|
|
|
189
|
-
def get_outlet_data(self,outlet_id,constituent,agg_period = 'D'):
|
|
213
|
+
def get_outlet_data(self,outlet_id,constituent,agg_period = 'D',to_csv = False):
|
|
190
214
|
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
191
215
|
query = '''
|
|
192
216
|
SELECT *
|
|
@@ -207,16 +231,35 @@ class dataManager():
|
|
|
207
231
|
df.rename(columns={'value': 'observed',
|
|
208
232
|
'flow_value': 'observed_flow',
|
|
209
233
|
'baseflow_value': 'observed_baseflow'}, inplace=True)
|
|
210
|
-
return df
|
|
211
|
-
|
|
234
|
+
return df.dropna(subset=['observed'])
|
|
212
235
|
|
|
236
|
+
def get_raw_data(self,station_id,station_origin, to_csv = False):
|
|
237
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
238
|
+
if station_origin.lower() == 'equis':
|
|
239
|
+
query = '''
|
|
240
|
+
SELECT *
|
|
241
|
+
FROM staging.equis_raw
|
|
242
|
+
WHERE station_id = ?'''
|
|
243
|
+
elif station_origin.lower() == 'wiski':
|
|
244
|
+
query = '''
|
|
245
|
+
SELECT *
|
|
246
|
+
FROM staging.wiski_raw
|
|
247
|
+
WHERE station_id = ?'''
|
|
248
|
+
else:
|
|
249
|
+
raise ValueError(f'Station origin {station_origin} not recognized. Valid options are equis or wiski.')
|
|
250
|
+
|
|
251
|
+
df = con.execute(query,[station_id]).fetch_df()
|
|
252
|
+
|
|
253
|
+
if to_csv:
|
|
254
|
+
df.to_csv(self.folderpath.joinpath(f'{station_id}_raw.csv'), index=False)
|
|
255
|
+
return df
|
|
213
256
|
|
|
214
|
-
def to_csv(self,station_id,folderpath = None):
|
|
257
|
+
def to_csv(self,station_id ,station_origin,folderpath = None):
|
|
215
258
|
if folderpath is None:
|
|
216
259
|
folderpath = self.folderpath
|
|
217
260
|
else:
|
|
218
261
|
folderpath = Path(folderpath)
|
|
219
|
-
df = self.
|
|
262
|
+
df = self.get_station_data([station_id],constituent = 'Q',agg_period = None)
|
|
220
263
|
if len(df) > 0:
|
|
221
264
|
df.to_csv(folderpath.joinpath(station_id + '.csv'))
|
|
222
265
|
else:
|
mpcaHydro/etlSWD.py
CHANGED
|
@@ -26,19 +26,21 @@ CONSTITUENT_MAP = {i[0]:i[1] for i in EQUIS_PARAMETER_XREF[['PARAMETER','constit
|
|
|
26
26
|
# return df
|
|
27
27
|
import requests
|
|
28
28
|
|
|
29
|
-
def _download(
|
|
29
|
+
def _download(station_id):
|
|
30
30
|
# Replace {station_no} in the URL with the actual station number
|
|
31
|
-
url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
|
|
32
|
-
|
|
31
|
+
#url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
|
|
32
|
+
url = 'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results'
|
|
33
|
+
|
|
33
34
|
try:
|
|
34
35
|
# Send a GET request to the URL
|
|
35
|
-
|
|
36
|
+
params = {
|
|
37
|
+
'stationId': station_id,
|
|
38
|
+
'format': 'json'
|
|
39
|
+
}
|
|
40
|
+
response = requests.get(url,params = params)
|
|
36
41
|
response.raise_for_status() # Raise exception for HTTP errors
|
|
37
42
|
# Parse the JSON data
|
|
38
|
-
|
|
39
|
-
return pd.DataFrame(columns = response.json()['column_names'])
|
|
40
|
-
else:
|
|
41
|
-
return pd.DataFrame(response.json()['data'])
|
|
43
|
+
return pd.DataFrame(response.json()['data'])
|
|
42
44
|
|
|
43
45
|
except requests.exceptions.RequestException as e:
|
|
44
46
|
print(f"An error occurred: {e}")
|
|
@@ -46,14 +48,18 @@ def _download(station_no):
|
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
|
|
49
|
-
def download(
|
|
51
|
+
def download(station_ids):
|
|
50
52
|
#df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
53
|
+
dfs = []
|
|
54
|
+
for station_id in station_ids:
|
|
55
|
+
df = _download(station_id)
|
|
56
|
+
if not df.empty:
|
|
57
|
+
df['station_id'] = station_id
|
|
58
|
+
dfs.append(df)
|
|
59
|
+
|
|
60
|
+
return pd.concat(dfs, ignore_index=True)
|
|
61
|
+
|
|
62
|
+
|
|
57
63
|
|
|
58
64
|
def info(station_no):
|
|
59
65
|
#df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
|
mpcaHydro/outlets.py
CHANGED
|
@@ -14,17 +14,18 @@ import duckdb
|
|
|
14
14
|
|
|
15
15
|
#stations_wiski = gpd.read_file('C:/Users/mfratki/Documents/GitHub/pyhcal/src/pyhcal/data/stations_wiski.gpkg')
|
|
16
16
|
|
|
17
|
-
|
|
18
17
|
_stations_wiski = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_wiski.gpkg'))
|
|
19
|
-
stations_wiski = _stations_wiski.
|
|
18
|
+
stations_wiski = _stations_wiski.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name','wplmn_flag']]
|
|
20
19
|
stations_wiski['source'] = 'wiski'
|
|
21
20
|
_stations_equis = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_EQUIS.gpkg'))
|
|
22
|
-
stations_equis = _stations_equis.
|
|
21
|
+
stations_equis = _stations_equis.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name']]
|
|
23
22
|
stations_equis['source'] = 'equis'
|
|
24
23
|
stations_equis['wplmn_flag'] = 0
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
DB_PATH = str(Path(__file__).resolve().parent/'data\\outlet.duckdb')
|
|
28
29
|
|
|
29
30
|
MODL_DB = pd.concat([stations_wiski,stations_equis])
|
|
30
31
|
MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
|
|
@@ -34,64 +35,69 @@ MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True
|
|
|
34
35
|
def _reload():
|
|
35
36
|
global _stations_wiski, stations_wiski, _stations_equis, stations_equis, MODL_DB
|
|
36
37
|
_stations_wiski = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_wiski.gpkg'))
|
|
37
|
-
stations_wiski = _stations_wiski.
|
|
38
|
+
stations_wiski = _stations_wiski.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name','wplmn_flag']]
|
|
38
39
|
stations_wiski['source'] = 'wiski'
|
|
39
40
|
_stations_equis = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_EQUIS.gpkg'))
|
|
40
|
-
stations_equis = _stations_equis.
|
|
41
|
+
stations_equis = _stations_equis.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name']]
|
|
41
42
|
stations_equis['source'] = 'equis'
|
|
42
43
|
stations_equis['wplmn_flag'] = 0
|
|
43
44
|
|
|
45
|
+
|
|
44
46
|
MODL_DB = pd.concat([stations_wiski,stations_equis])
|
|
45
47
|
MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
|
|
46
48
|
MODL_DB = MODL_DB.dropna(subset='opnids')
|
|
47
49
|
MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True)
|
|
48
50
|
|
|
49
51
|
|
|
50
|
-
def get_model_db(model_name: str):
|
|
51
|
-
return MODL_DB.query('repository_name == @model_name')
|
|
52
|
-
|
|
53
52
|
def split_opnids(opnids: list):
|
|
54
|
-
return [
|
|
53
|
+
return [int(float(j)) for i in opnids for j in i]
|
|
54
|
+
|
|
55
|
+
def get_model_db(model_name: str):
|
|
56
|
+
return MODL_DB.query('repo_name == @model_name')
|
|
55
57
|
|
|
56
58
|
def valid_models():
|
|
57
|
-
return MODL_DB['
|
|
59
|
+
return MODL_DB['repo_name'].unique().tolist()
|
|
60
|
+
|
|
61
|
+
def equis_stations(model_name):
|
|
62
|
+
return _stations_equis.query('repo_name == @model_name')['station_id'].tolist()
|
|
63
|
+
|
|
64
|
+
def wiski_stations(model_name):
|
|
65
|
+
return _stations_wiski.query('repo_name == @model_name')['station_id'].tolist()
|
|
66
|
+
|
|
67
|
+
def wplmn_stations(model_name):
|
|
68
|
+
return MODL_DB.query('repo_name == @model_name and wplmn_flag == 1 and source == "wiski"')['station_id'].tolist()
|
|
58
69
|
|
|
59
70
|
def wplmn_station_opnids(model_name):
|
|
60
|
-
opnids = MODL_DB.query('
|
|
71
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and wplmn_flag == 1 and source == "wiski"')['opnids'].str.split(',').to_list()
|
|
61
72
|
return split_opnids(opnids)
|
|
62
73
|
|
|
63
74
|
def wiski_station_opnids(model_name):
|
|
64
|
-
opnids = MODL_DB.query('
|
|
75
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "wiski"')['opnids'].str.split(',').to_list()
|
|
65
76
|
return split_opnids(opnids)
|
|
66
77
|
|
|
67
78
|
def equis_station_opnids(model_name):
|
|
68
|
-
opnids = MODL_DB.query('
|
|
79
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "equis"')['opnids'].str.split(',').to_list()
|
|
69
80
|
return split_opnids(opnids)
|
|
70
81
|
|
|
71
82
|
def station_opnids(model_name):
|
|
72
|
-
opnids = MODL_DB.query('
|
|
83
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name')['opnids'].str.split(',').to_list()
|
|
73
84
|
return split_opnids(opnids)
|
|
74
85
|
|
|
75
|
-
def
|
|
76
|
-
return MODL_DB.query('
|
|
77
|
-
|
|
78
|
-
def wiski_stations(model_name):
|
|
79
|
-
return MODL_DB.query('repository_name == @model_name and source == "wiski"')['station_id'].tolist()
|
|
86
|
+
def mapped_equis_stations(model_name):
|
|
87
|
+
return MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "equis"')['station_id'].tolist()
|
|
80
88
|
|
|
81
|
-
def
|
|
82
|
-
return MODL_DB.query('
|
|
89
|
+
def mapped_wiski_stations(model_name):
|
|
90
|
+
return MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "wiski"')['station_id'].tolist()
|
|
83
91
|
|
|
84
92
|
def outlets(model_name):
|
|
85
|
-
return [group for _, group in MODL_DB.query('
|
|
93
|
+
return [group for _, group in MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name').groupby(by = ['opnids','repo_name'])]
|
|
86
94
|
|
|
87
95
|
def outlet_stations(model_name):
|
|
88
|
-
return [group['station_id'].to_list() for _, group in MODL_DB.query('
|
|
96
|
+
return [group['station_id'].to_list() for _, group in MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name').groupby(by = ['opnids','repo_name'])]
|
|
89
97
|
|
|
90
|
-
def _split_opnids(opnids: list):
|
|
91
|
-
return [int(float(j)) for i in opnids for j in i]
|
|
92
98
|
|
|
93
99
|
def connect(db_path, read_only=True):
|
|
94
|
-
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
|
100
|
+
#Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
|
95
101
|
return duckdb.connect(db_path,read_only=read_only)
|
|
96
102
|
|
|
97
103
|
|
|
@@ -103,7 +109,7 @@ def init_db(db_path: str,reset: bool = False):
|
|
|
103
109
|
if reset and db_path.exists():
|
|
104
110
|
db_path.unlink()
|
|
105
111
|
|
|
106
|
-
with connect(db_path.as_posix()) as con:
|
|
112
|
+
with connect(db_path.as_posix(),False) as con:
|
|
107
113
|
con.execute(OUTLETS_SCHEMA)
|
|
108
114
|
|
|
109
115
|
|
|
@@ -202,7 +208,7 @@ def build_outlet_db(db_path: str = None):
|
|
|
202
208
|
if db_path is None:
|
|
203
209
|
db_path = DB_PATH
|
|
204
210
|
init_db(db_path,reset=True)
|
|
205
|
-
with connect(db_path) as con:
|
|
211
|
+
with connect(db_path,False) as con:
|
|
206
212
|
build_outlets(con)
|
|
207
213
|
|
|
208
214
|
|
|
@@ -212,43 +218,35 @@ def build_outlets(con, model_name: str = None):
|
|
|
212
218
|
else:
|
|
213
219
|
modl_db = MODL_DB
|
|
214
220
|
|
|
215
|
-
for index, (_, group) in enumerate(modl_db.drop_duplicates(['station_id','source']).groupby(by = ['opnids','
|
|
216
|
-
repo_name = group['
|
|
221
|
+
for index, (_, group) in enumerate(modl_db.drop_duplicates(['station_id','source']).groupby(by = ['opnids','repo_name'])):
|
|
222
|
+
repo_name = group['repo_name'].iloc[0]
|
|
217
223
|
add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
|
|
218
224
|
|
|
219
|
-
opnids = set(
|
|
225
|
+
opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
|
|
220
226
|
|
|
221
227
|
for opnid in opnids:
|
|
222
|
-
|
|
223
|
-
exclude = 1
|
|
224
|
-
else:
|
|
225
|
-
exclude = 0
|
|
226
|
-
add_reach(con, outlet_id = index, reach_id = abs(opnid),exclude = exclude, repository_name = repo_name)
|
|
228
|
+
add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
|
|
227
229
|
|
|
228
230
|
for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
|
|
229
231
|
add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
|
|
230
232
|
|
|
231
233
|
|
|
232
234
|
def create_outlet_schema(con, model_name : str):
|
|
233
|
-
for index, (_, group) in enumerate(
|
|
234
|
-
repo_name = group['
|
|
235
|
+
for index, (_, group) in enumerate(outlets(model_name)):
|
|
236
|
+
repo_name = group['repo_name'].iloc[0]
|
|
235
237
|
add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
|
|
236
238
|
|
|
237
|
-
opnids = set(
|
|
239
|
+
opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
|
|
238
240
|
|
|
239
241
|
for opnid in opnids:
|
|
240
|
-
|
|
241
|
-
exclude = 1
|
|
242
|
-
else:
|
|
243
|
-
exclude = 0
|
|
244
|
-
add_reach(con, outlet_id = index, reach_id = abs(opnid),exclude = exclude, repository_name = repo_name)
|
|
242
|
+
add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
|
|
245
243
|
|
|
246
244
|
for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
|
|
247
245
|
add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
|
|
248
246
|
|
|
249
247
|
|
|
250
248
|
def add_outlet(con,
|
|
251
|
-
outlet_id:
|
|
249
|
+
outlet_id: int,
|
|
252
250
|
repository_name: str,
|
|
253
251
|
outlet_name = None,
|
|
254
252
|
notes = None):
|
|
@@ -256,15 +254,15 @@ def add_outlet(con,
|
|
|
256
254
|
Insert an outlet. repository_name is required.
|
|
257
255
|
"""
|
|
258
256
|
con.execute(
|
|
259
|
-
"INSERT INTO outlets.
|
|
257
|
+
"INSERT INTO outlets.outlet_groups (outlet_id, repository_name, outlet_name, notes) VALUES (?, ?, ?, ?)",
|
|
260
258
|
[outlet_id, repository_name, outlet_name, notes]
|
|
261
259
|
)
|
|
262
260
|
|
|
263
261
|
def add_station(con,
|
|
264
|
-
outlet_id:
|
|
265
|
-
station_id:
|
|
262
|
+
outlet_id: int,
|
|
263
|
+
station_id: int,
|
|
266
264
|
station_origin: str,
|
|
267
|
-
true_opnid:
|
|
265
|
+
true_opnid: int,
|
|
268
266
|
repository_name: str,
|
|
269
267
|
comments = None):
|
|
270
268
|
"""
|
|
@@ -281,19 +279,17 @@ def add_station(con,
|
|
|
281
279
|
)
|
|
282
280
|
|
|
283
281
|
def add_reach(con,
|
|
284
|
-
outlet_id:
|
|
285
|
-
reach_id:
|
|
286
|
-
repository_name: str
|
|
287
|
-
exclude: int = 0):
|
|
282
|
+
outlet_id: int,
|
|
283
|
+
reach_id: int,
|
|
284
|
+
repository_name: str):
|
|
288
285
|
"""
|
|
289
286
|
Insert a reach membership for an outlet.
|
|
290
287
|
- repository_name is required and participates in the PK (reach_id, repository_name).
|
|
291
|
-
- exclude = 1 to mark a reach as excluded from association views.
|
|
292
288
|
"""
|
|
293
289
|
con.execute(
|
|
294
|
-
"""INSERT INTO outlets.outlet_reaches (outlet_id, reach_id, repository_name
|
|
295
|
-
VALUES (?, ?,
|
|
296
|
-
[outlet_id, reach_id, repository_name
|
|
290
|
+
"""INSERT INTO outlets.outlet_reaches (outlet_id, reach_id, repository_name)
|
|
291
|
+
VALUES (?, ?, ?)""",
|
|
292
|
+
[outlet_id, reach_id, repository_name]
|
|
297
293
|
)
|
|
298
294
|
|
|
299
295
|
|
|
@@ -303,8 +299,10 @@ OUTLETS_SCHEMA = """-- schema.sql
|
|
|
303
299
|
|
|
304
300
|
-- Table 1: outlets
|
|
305
301
|
-- Represents a logical grouping that ties stations and reaches together.
|
|
306
|
-
CREATE
|
|
307
|
-
|
|
302
|
+
CREATE SCHEMA IF NOT EXISTS outlets;
|
|
303
|
+
|
|
304
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_groups (
|
|
305
|
+
outlet_id INTEGER PRIMARY KEY,
|
|
308
306
|
repository_name TEXT NOT NULL,
|
|
309
307
|
outlet_name TEXT,
|
|
310
308
|
notes TEXT -- optional: general notes about the outlet grouping
|
|
@@ -312,42 +310,40 @@ CREATE TABLE IF NOT EXISTS outlets (
|
|
|
312
310
|
|
|
313
311
|
-- Table 2: outlet_stations
|
|
314
312
|
-- One-to-many: outlet -> stations
|
|
315
|
-
CREATE TABLE IF NOT EXISTS outlet_stations (
|
|
316
|
-
outlet_id
|
|
313
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
|
|
314
|
+
outlet_id INTEGER NOT NULL,
|
|
317
315
|
station_id TEXT NOT NULL,
|
|
318
316
|
station_origin TEXT NOT NULL, -- e.g., 'wiski', 'equis'
|
|
319
317
|
repository_name TEXT NOT NULL, -- repository model the station is physically located in
|
|
320
|
-
true_opnid
|
|
318
|
+
true_opnid INTEGER NOT NULL, -- The specific reach the station physically sits on (optional)
|
|
321
319
|
comments TEXT, -- Per-station comments, issues, etc.
|
|
322
320
|
CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
|
|
323
|
-
FOREIGN KEY (outlet_id) REFERENCES outlets(outlet_id)
|
|
321
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlet_groups(outlet_id)
|
|
324
322
|
);
|
|
325
323
|
|
|
326
324
|
-- Table 3: outlet_reaches
|
|
327
325
|
-- One-to-many: outlet -> reaches
|
|
328
326
|
-- A reach can appear in multiple outlets, enabling many-to-many overall.
|
|
329
|
-
CREATE TABLE IF NOT EXISTS outlet_reaches (
|
|
330
|
-
outlet_id
|
|
331
|
-
reach_id
|
|
327
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
|
|
328
|
+
outlet_id INTEGER NOT NULL,
|
|
329
|
+
reach_id INTEGER NOT NULL, -- model reach identifier (aka opind)
|
|
332
330
|
repository_name TEXT NOT NULL, -- optional: where the mapping comes from
|
|
333
|
-
|
|
334
|
-
FOREIGN KEY (outlet_id) REFERENCES outlets(outlet_id)
|
|
331
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlet_groups(outlet_id)
|
|
335
332
|
);
|
|
336
333
|
|
|
337
334
|
-- Useful views:
|
|
338
335
|
|
|
339
336
|
-- View: station_reach_pairs
|
|
340
337
|
-- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
|
|
341
|
-
CREATE
|
|
338
|
+
CREATE OR REPLACE VIEW outlets.station_reach_pairs AS
|
|
342
339
|
SELECT
|
|
343
340
|
s.outlet_id,
|
|
344
341
|
s.station_id,
|
|
345
342
|
s.station_origin,
|
|
346
343
|
r.reach_id,
|
|
347
|
-
r.
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
JOIN outlet_reaches r
|
|
344
|
+
r.repository_name
|
|
345
|
+
FROM outlets.outlet_stations AS s
|
|
346
|
+
JOIN outlets.outlet_reaches AS r
|
|
351
347
|
ON s.outlet_id = r.outlet_id;
|
|
352
348
|
|
|
353
349
|
"""
|