mpcaHydro 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpcaHydro/data/WISKI_QUALITY_CODES.csv +71 -0
- mpcaHydro/data/outlets.duckdb +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data_manager.py +122 -287
- mpcaHydro/equis.py +31 -22
- mpcaHydro/outlets.py +371 -0
- mpcaHydro/reports.py +80 -0
- mpcaHydro/warehouse.py +389 -11
- mpcaHydro/warehouseManager.py +47 -0
- mpcaHydro/{WISKI.py → wiski.py} +40 -12
- mpcaHydro/xref.py +74 -0
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.0.dist-info}/METADATA +3 -1
- mpcahydro-2.2.0.dist-info/RECORD +23 -0
- mpcahydro-2.1.0.dist-info/RECORD +0 -15
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.0.dist-info}/WHEEL +0 -0
mpcaHydro/warehouse.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import duckdb
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from pathlib import Path
|
|
4
|
+
from mpcaHydro import outlets
|
|
4
5
|
|
|
5
6
|
def init_db(db_path: str,reset: bool = False):
|
|
6
7
|
"""
|
|
@@ -14,14 +15,14 @@ def init_db(db_path: str,reset: bool = False):
|
|
|
14
15
|
# Create all schemas
|
|
15
16
|
create_schemas(con)
|
|
16
17
|
|
|
17
|
-
# Create tables
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
create_constituent_summary_report(con)
|
|
22
|
-
except duckdb.CatalogException as e:
|
|
23
|
-
print(f"Could not create observation views, likely because backing tables don't exist yet. This is safe to ignore on first run. Details: {e}")
|
|
18
|
+
# Create tables
|
|
19
|
+
create_outlets_tables(con)
|
|
20
|
+
create_mapping_tables(con)
|
|
21
|
+
create_analytics_tables(con)
|
|
24
22
|
|
|
23
|
+
# Create views
|
|
24
|
+
#update_views(con)
|
|
25
|
+
|
|
25
26
|
|
|
26
27
|
def create_schemas(con: duckdb.DuckDBPyConnection):
|
|
27
28
|
"""
|
|
@@ -30,8 +31,245 @@ def create_schemas(con: duckdb.DuckDBPyConnection):
|
|
|
30
31
|
con.execute("CREATE SCHEMA IF NOT EXISTS staging")
|
|
31
32
|
con.execute("CREATE SCHEMA IF NOT EXISTS analytics")
|
|
32
33
|
con.execute("CREATE SCHEMA IF NOT EXISTS reports")
|
|
33
|
-
con.execute("CREATE SCHEMA IF NOT EXISTS
|
|
34
|
+
con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
|
|
35
|
+
con.execute("CREATE SCHEMA IF NOT EXISTS mappings")
|
|
34
36
|
|
|
37
|
+
def create_analytics_tables(con: duckdb.DuckDBPyConnection):
|
|
38
|
+
"""
|
|
39
|
+
Create necessary tables in the analytics schema.
|
|
40
|
+
"""
|
|
41
|
+
con.execute("""
|
|
42
|
+
CREATE TABLE IF NOT EXISTS analytics.equis (
|
|
43
|
+
datetime TIMESTAMP,
|
|
44
|
+
value DOUBLE,
|
|
45
|
+
station_id TEXT,
|
|
46
|
+
station_origin TEXT,
|
|
47
|
+
constituent TEXT,
|
|
48
|
+
unit TEXT
|
|
49
|
+
);
|
|
50
|
+
""")
|
|
51
|
+
con.execute("""
|
|
52
|
+
CREATE TABLE IF NOT EXISTS analytics.wiski (
|
|
53
|
+
datetime TIMESTAMP,
|
|
54
|
+
value DOUBLE,
|
|
55
|
+
station_id TEXT,
|
|
56
|
+
station_origin TEXT,
|
|
57
|
+
constituent TEXT,
|
|
58
|
+
unit TEXT
|
|
59
|
+
);
|
|
60
|
+
""")
|
|
61
|
+
|
|
62
|
+
def create_mapping_tables(con: duckdb.DuckDBPyConnection):
|
|
63
|
+
"""
|
|
64
|
+
Create and populate tables in the mappings schema from Python dicts and CSVs.
|
|
65
|
+
"""
|
|
66
|
+
# WISKI parametertype_id -> constituent
|
|
67
|
+
wiski_parametertype_map = {
|
|
68
|
+
'11522': 'TP',
|
|
69
|
+
'11531': 'TP',
|
|
70
|
+
'11532': 'TSS',
|
|
71
|
+
'11523': 'TSS',
|
|
72
|
+
'11526': 'N',
|
|
73
|
+
'11519': 'N',
|
|
74
|
+
'11520': 'OP',
|
|
75
|
+
'11528': 'OP',
|
|
76
|
+
'11530': 'TKN',
|
|
77
|
+
'11521': 'TKN',
|
|
78
|
+
'11500': 'Q',
|
|
79
|
+
'11504': 'WT',
|
|
80
|
+
'11533': 'DO',
|
|
81
|
+
'11507': 'WL'
|
|
82
|
+
}
|
|
83
|
+
df_wiski_params = pd.DataFrame(wiski_parametertype_map.items(), columns=['parametertype_id', 'constituent'])
|
|
84
|
+
con.execute("CREATE TABLE IF NOT EXISTS mappings.wiski_parametertype AS SELECT * FROM df_wiski_params")
|
|
85
|
+
|
|
86
|
+
# EQuIS cas_rn -> constituent
|
|
87
|
+
equis_casrn_map = {
|
|
88
|
+
'479-61-8': 'CHLA',
|
|
89
|
+
'CHLA-CORR': 'CHLA',
|
|
90
|
+
'BOD': 'BOD',
|
|
91
|
+
'NO2NO3': 'N',
|
|
92
|
+
'14797-55-8': 'NO3',
|
|
93
|
+
'14797-65-0': 'NO2',
|
|
94
|
+
'14265-44-2': 'OP',
|
|
95
|
+
'N-KJEL': 'TKN',
|
|
96
|
+
'PHOSPHATE-P': 'TP',
|
|
97
|
+
'7723-14-0': 'TP',
|
|
98
|
+
'SOLIDS-TSS': 'TSS',
|
|
99
|
+
'TEMP-W': 'WT',
|
|
100
|
+
'7664-41-7': 'NH3'
|
|
101
|
+
}
|
|
102
|
+
df_equis_cas = pd.DataFrame(equis_casrn_map.items(), columns=['cas_rn', 'constituent'])
|
|
103
|
+
con.execute("CREATE TABLE IF NOT EXISTS mappings.equis_casrn AS SELECT * FROM df_equis_cas")
|
|
104
|
+
|
|
105
|
+
# Load station cross-reference from CSV
|
|
106
|
+
# Assumes this script is run from a location where this relative path is valid
|
|
107
|
+
xref_csv_path = Path(__file__).parent / 'data/WISKI_EQUIS_XREF.csv'
|
|
108
|
+
if xref_csv_path.exists():
|
|
109
|
+
con.execute(f"CREATE TABLE IF NOT EXISTS mappings.station_xref AS SELECT * FROM read_csv_auto('{xref_csv_path.as_posix()}')")
|
|
110
|
+
else:
|
|
111
|
+
print(f"Warning: WISKI_EQUIS_XREF.csv not found at {xref_csv_path}")
|
|
112
|
+
|
|
113
|
+
# Load wiski_quality_codes from CSV
|
|
114
|
+
wiski_qc_csv_path = Path(__file__).parent / 'data/WISKI_QUALITY_CODES.csv'
|
|
115
|
+
if wiski_qc_csv_path.exists():
|
|
116
|
+
con.execute(f"CREATE TABLE IF NOT EXISTS mappings.wiski_quality_codes AS SELECT * FROM read_csv_auto('{wiski_qc_csv_path.as_posix()}')")
|
|
117
|
+
else:
|
|
118
|
+
print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
|
|
119
|
+
|
|
120
|
+
def create_outlets_tables(con: duckdb.DuckDBPyConnection):
|
|
121
|
+
"""
|
|
122
|
+
Create tables in the outlets schema to define outlet-station-reach relationships.
|
|
123
|
+
"""
|
|
124
|
+
con.execute("""-- schema.sql
|
|
125
|
+
-- Simple 3-table design to manage associations between model reaches and observation stations via outlets.
|
|
126
|
+
-- Compatible with DuckDB and SQLite.
|
|
127
|
+
|
|
128
|
+
-- Table 1: outlets
|
|
129
|
+
-- Represents a logical grouping that ties stations and reaches together.
|
|
130
|
+
CREATE TABLE IF NOT EXISTS outlets.outlets (
|
|
131
|
+
outlet_id TEXT PRIMARY KEY,
|
|
132
|
+
repository_name TEXT NOT NULL,
|
|
133
|
+
outlet_name TEXT,
|
|
134
|
+
notes TEXT -- optional: general notes about the outlet grouping
|
|
135
|
+
);
|
|
136
|
+
|
|
137
|
+
-- Table 2: outlet_stations
|
|
138
|
+
-- One-to-many: outlet -> stations
|
|
139
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
|
|
140
|
+
outlet_id TEXT NOT NULL,
|
|
141
|
+
station_id TEXT NOT NULL,
|
|
142
|
+
station_origin TEXT NOT NULL, -- e.g., 'wiski', 'equis'
|
|
143
|
+
repository_name TEXT NOT NULL, -- repository model the station is physically located in
|
|
144
|
+
true_opnid TEXT NOT NULL, -- The specific reach the station physically sits on (optional)
|
|
145
|
+
comments TEXT, -- Per-station comments, issues, etc.
|
|
146
|
+
CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
|
|
147
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
|
|
148
|
+
);
|
|
149
|
+
|
|
150
|
+
-- Table 3: outlet_reaches
|
|
151
|
+
-- One-to-many: outlet -> reaches
|
|
152
|
+
-- A reach can appear in multiple outlets, enabling many-to-many overall.
|
|
153
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
|
|
154
|
+
outlet_id TEXT NOT NULL,
|
|
155
|
+
reach_id TEXT NOT NULL, -- model reach identifier (aka opind)
|
|
156
|
+
repository_name TEXT NOT NULL, -- optional: where the mapping comes from
|
|
157
|
+
exclude INTEGER DEFAULT 0, -- flag to indicate if this reach should be excluded (1) or included (0)
|
|
158
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
|
|
159
|
+
);
|
|
160
|
+
|
|
161
|
+
-- Useful views:
|
|
162
|
+
|
|
163
|
+
-- View: station_reach_pairs
|
|
164
|
+
-- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
|
|
165
|
+
CREATE VIEW IF NOT EXISTS outlets.station_reach_pairs AS
|
|
166
|
+
SELECT
|
|
167
|
+
s.outlet_id,
|
|
168
|
+
s.station_id,
|
|
169
|
+
s.station_origin,
|
|
170
|
+
r.reach_id,
|
|
171
|
+
r.exclude,
|
|
172
|
+
r.repository_name,
|
|
173
|
+
FROM outlets.outlet_stations s
|
|
174
|
+
JOIN outlets.outlet_reaches r
|
|
175
|
+
ON s.outlet_id = r.outlet_id;
|
|
176
|
+
|
|
177
|
+
""")
|
|
178
|
+
|
|
179
|
+
def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
|
|
180
|
+
"""
|
|
181
|
+
Create a view in the database that contains normalized WISKI data.
|
|
182
|
+
Units converted to standard units.
|
|
183
|
+
columns renamed.
|
|
184
|
+
constituents mapped.
|
|
185
|
+
"""
|
|
186
|
+
con.execute("""
|
|
187
|
+
-- Create a single view with all transformations
|
|
188
|
+
CREATE OR REPLACE VIEW analytics.wiski_normalized AS
|
|
189
|
+
SELECT
|
|
190
|
+
|
|
191
|
+
-- Convert °C to °F and keep other values unchanged
|
|
192
|
+
CASE
|
|
193
|
+
WHEN LOWER(ts_unitsymbol) = '°c' THEN (value * 9.0 / 5.0) + 32
|
|
194
|
+
WHEN ts_unitsymbol = 'kg' THEN value * 2.20462 -- Convert kg to lb
|
|
195
|
+
ELSE value
|
|
196
|
+
END AS value,
|
|
197
|
+
|
|
198
|
+
-- Normalize units
|
|
199
|
+
CASE
|
|
200
|
+
WHEN LOWER(ts_unitsymbol) = '°c' THEN 'degf' -- Normalize °C to degF
|
|
201
|
+
WHEN ts_unitsymbol = 'kg' THEN 'lb' -- Normalize kg to lb
|
|
202
|
+
WHEN ts_unitsymbol = 'ft³/s' THEN 'cfs' -- Rename ft³/s to cfs
|
|
203
|
+
ELSE ts_unitsymbol
|
|
204
|
+
END AS unit,
|
|
205
|
+
|
|
206
|
+
-- Normalize column names
|
|
207
|
+
station_no AS station_id, -- Rename station_no to station_id
|
|
208
|
+
Timestamp AS datetime, -- Rename Timestamp to datetime
|
|
209
|
+
"Quality Code" AS quality_code, -- Rename Quality Code to quality_code
|
|
210
|
+
"Quality Code Name" AS quality_code_name, -- Rename Quality Code Name to quality_code_name
|
|
211
|
+
parametertype_id, -- Keeps parametertype_id as is
|
|
212
|
+
constituent -- Keeps constituent as is
|
|
213
|
+
FROM staging.wiski_raw;""")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
|
|
217
|
+
"""
|
|
218
|
+
Create a view in the database that filters WISKI data based on specified data codes.
|
|
219
|
+
"""
|
|
220
|
+
query = f"""
|
|
221
|
+
CREATE OR REPLACE VIEW analytics.wiski_filtered AS
|
|
222
|
+
SELECT *
|
|
223
|
+
FROM analytics.wiski_normalized
|
|
224
|
+
WHERE quality_code IN ({placeholders});
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
placeholders = ', '.join(['?'] * len(data_codes))
|
|
228
|
+
query = query.format(placeholders=placeholders)
|
|
229
|
+
con.execute(query, data_codes)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def create_aggregated_wiski_view(con: duckdb.DuckDBPyConnection):
|
|
233
|
+
"""
|
|
234
|
+
Create a view in the database that aggregates WISKI data by hour, station, and constituent.
|
|
235
|
+
"""
|
|
236
|
+
con.execute("""
|
|
237
|
+
CREATE OR REPLACE Table analytics.wiski_aggregated AS
|
|
238
|
+
SELECT
|
|
239
|
+
station_id,
|
|
240
|
+
constituent,
|
|
241
|
+
time_bucket(INTERVAL '1 hour', datetime) AS hour_start,
|
|
242
|
+
AVG(value) AS value,
|
|
243
|
+
unit
|
|
244
|
+
FROM analytics.wiski_normalized
|
|
245
|
+
GROUP BY
|
|
246
|
+
station_id,
|
|
247
|
+
constituent,
|
|
248
|
+
hour_start,
|
|
249
|
+
unit;
|
|
250
|
+
""")
|
|
251
|
+
|
|
252
|
+
def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
|
|
253
|
+
"""
|
|
254
|
+
Create a view in staging schema that counts quality codes for each station and constituent.
|
|
255
|
+
"""
|
|
256
|
+
con.execute("""
|
|
257
|
+
CREATE OR REPLACE VIEW staging.wiski_qc_count AS (
|
|
258
|
+
SELECT
|
|
259
|
+
w.station_no,
|
|
260
|
+
w.parametertype_name,
|
|
261
|
+
w."Quality Code",
|
|
262
|
+
w."Quality Code Name",
|
|
263
|
+
COUNT(w."Quality Code") AS count
|
|
264
|
+
FROM staging.wiski_raw w
|
|
265
|
+
GROUP BY
|
|
266
|
+
w."Quality Code",w."Quality Code Name",w.parametertype_name, w.station_no
|
|
267
|
+
);
|
|
268
|
+
""")
|
|
269
|
+
# ORDER BY
|
|
270
|
+
# w.station_no,w.parametertype_name, w."Quality Code"
|
|
271
|
+
# )
|
|
272
|
+
# """)
|
|
35
273
|
|
|
36
274
|
def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
|
|
37
275
|
"""
|
|
@@ -47,6 +285,87 @@ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
|
|
|
47
285
|
""")
|
|
48
286
|
|
|
49
287
|
|
|
288
|
+
def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
|
|
289
|
+
"""
|
|
290
|
+
Create a view in analytics schema that links observations to model reaches via outlets.
|
|
291
|
+
"""
|
|
292
|
+
con.execute("""
|
|
293
|
+
CREATE OR REPLACE VIEW analytics.outlet_observations AS
|
|
294
|
+
SELECT
|
|
295
|
+
o.datetime,
|
|
296
|
+
os.outlet_id,
|
|
297
|
+
o.constituent,
|
|
298
|
+
AVG(o.value) AS value,
|
|
299
|
+
COUNT(o.value) AS count
|
|
300
|
+
FROM
|
|
301
|
+
analytics.observations AS o
|
|
302
|
+
LEFT JOIN
|
|
303
|
+
outlets.outlet_stations AS os ON o.station_id = os.station_id AND o.station_origin = os.station_origin
|
|
304
|
+
GROUP BY
|
|
305
|
+
os.outlet_id,
|
|
306
|
+
o.constituent,
|
|
307
|
+
o.datetime; -- Group by the truncated date
|
|
308
|
+
""")
|
|
309
|
+
# ORDER BY
|
|
310
|
+
# os.outlet_id,
|
|
311
|
+
# o.constituent,
|
|
312
|
+
# datetime);
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):
|
|
317
|
+
|
|
318
|
+
con.execute("""
|
|
319
|
+
CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
|
|
320
|
+
WITH baseflow_data AS (
|
|
321
|
+
SELECT
|
|
322
|
+
outlet_id,
|
|
323
|
+
datetime,
|
|
324
|
+
"value" AS baseflow_value
|
|
325
|
+
FROM
|
|
326
|
+
analytics.outlet_observations
|
|
327
|
+
WHERE
|
|
328
|
+
(constituent = 'QB')),
|
|
329
|
+
flow_data AS (
|
|
330
|
+
SELECT
|
|
331
|
+
outlet_id,
|
|
332
|
+
datetime,
|
|
333
|
+
"value" AS flow_value
|
|
334
|
+
FROM
|
|
335
|
+
analytics.outlet_observations
|
|
336
|
+
WHERE
|
|
337
|
+
(constituent = 'Q')),
|
|
338
|
+
constituent_data AS (
|
|
339
|
+
SELECT
|
|
340
|
+
outlet_id,
|
|
341
|
+
datetime,
|
|
342
|
+
constituent,
|
|
343
|
+
"value",
|
|
344
|
+
count
|
|
345
|
+
FROM
|
|
346
|
+
analytics.outlet_observations
|
|
347
|
+
WHERE
|
|
348
|
+
(constituent NOT IN ('Q', 'QB')))
|
|
349
|
+
SELECT
|
|
350
|
+
constituent_data.outlet_id,
|
|
351
|
+
constituent_data.constituent,
|
|
352
|
+
constituent_data.datetime,
|
|
353
|
+
constituent_data."value",
|
|
354
|
+
flow_data.flow_value,
|
|
355
|
+
baseflow_data.baseflow_value
|
|
356
|
+
FROM
|
|
357
|
+
constituent_data
|
|
358
|
+
FULL JOIN flow_data ON
|
|
359
|
+
(((constituent_data.outlet_id = flow_data.outlet_id)
|
|
360
|
+
AND (constituent_data.datetime = flow_data.datetime)))
|
|
361
|
+
LEFT JOIN baseflow_data ON
|
|
362
|
+
(((constituent_data.outlet_id = baseflow_data.outlet_id)
|
|
363
|
+
AND (constituent_data.datetime = baseflow_data.datetime)));""")
|
|
364
|
+
# ORDER BY
|
|
365
|
+
# constituent_data.outlet_id,
|
|
366
|
+
# constituent_data.datetime;
|
|
367
|
+
#
|
|
368
|
+
|
|
50
369
|
def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
|
|
51
370
|
"""
|
|
52
371
|
Create a constituent summary report in the reports schema that groups observations by constituent and station.
|
|
@@ -66,11 +385,54 @@ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
|
|
|
66
385
|
FROM
|
|
67
386
|
analytics.observations
|
|
68
387
|
GROUP BY
|
|
69
|
-
constituent,station_id,station_origin
|
|
70
|
-
|
|
71
|
-
|
|
388
|
+
constituent,station_id,station_origin;
|
|
389
|
+
''')
|
|
390
|
+
|
|
391
|
+
# ORDER BY
|
|
392
|
+
# constituent,sample_count;''')
|
|
393
|
+
|
|
394
|
+
def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
|
|
395
|
+
con.execute("""
|
|
396
|
+
CREATE VIEW reports.outlet_constituent_summary AS
|
|
397
|
+
SELECT
|
|
398
|
+
outlet_id,
|
|
399
|
+
constituent,
|
|
400
|
+
count_star() AS sample_count,
|
|
401
|
+
avg("value") AS average_value,
|
|
402
|
+
min("value") AS min_value,
|
|
403
|
+
max("value") AS max_value,
|
|
404
|
+
"year"(min(datetime)) AS start_date,
|
|
405
|
+
"year"(max(datetime)) AS end_date
|
|
406
|
+
FROM
|
|
407
|
+
analytics.outlet_observations
|
|
408
|
+
GROUP BY
|
|
409
|
+
constituent,
|
|
410
|
+
outlet_id
|
|
411
|
+
""")
|
|
72
412
|
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str,station_origin: str):
|
|
416
|
+
"""
|
|
417
|
+
Drop all data for a specific station from staging and analytics schemas.
|
|
418
|
+
"""
|
|
419
|
+
con.execute(f"DELETE FROM staging.equis_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
|
|
420
|
+
con.execute(f"DELETE FROM staging.wiski_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
|
|
421
|
+
con.execute(f"DELETE FROM analytics.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
|
|
422
|
+
con.execute(f"DELETE FROM analytics.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
|
|
423
|
+
update_views(con)
|
|
73
424
|
|
|
425
|
+
def update_views(con: duckdb.DuckDBPyConnection):
|
|
426
|
+
"""
|
|
427
|
+
Update all views in the database.
|
|
428
|
+
"""
|
|
429
|
+
create_staging_qc_count_view(con)
|
|
430
|
+
create_combined_observations_view(con)
|
|
431
|
+
create_constituent_summary_report(con)
|
|
432
|
+
create_outlet_observations_view(con)
|
|
433
|
+
create_outlet_observations_with_flow_view(con)
|
|
434
|
+
create_outlet_summary_report(con)
|
|
435
|
+
|
|
74
436
|
def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
|
|
75
437
|
"""
|
|
76
438
|
Returns a DuckDB connection to the given database path.
|
|
@@ -106,6 +468,22 @@ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_n
|
|
|
106
468
|
con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
|
|
107
469
|
con.unregister("tmp_df")
|
|
108
470
|
|
|
471
|
+
def add_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
|
|
472
|
+
"""
|
|
473
|
+
Append a pandas DataFrame into a staging table. This will create the staging
|
|
474
|
+
table if it does not exist.
|
|
475
|
+
"""
|
|
476
|
+
# register pandas DF and create table if not exists
|
|
477
|
+
con.register("tmp_df", df)
|
|
478
|
+
con.execute(f"""
|
|
479
|
+
CREATE TABLE IF NOT EXISTS staging.{table_name} AS
|
|
480
|
+
SELECT * FROM tmp_df
|
|
481
|
+
""")
|
|
482
|
+
con.execute(f"""
|
|
483
|
+
INSERT INTO staging.{table_name}
|
|
484
|
+
SELECT * FROM tmp_df
|
|
485
|
+
""")
|
|
486
|
+
con.unregister("tmp_df")
|
|
109
487
|
|
|
110
488
|
def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
|
|
111
489
|
"""
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
|
|
2
|
+
import pandas as pd
|
|
3
|
+
#from abc import abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from mpcaHydro import equis, wiski, warehouse
|
|
6
|
+
import duckdb
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
#%%
|
|
13
|
+
'''
|
|
14
|
+
This modules contains classes and functions to manage data downloads and storage into a local data warehouse.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
'''
|
|
18
|
+
|
|
19
|
+
def get_db_path(warehouse_path:Path,db_name:str = 'observations')->Path:
|
|
20
|
+
'''
|
|
21
|
+
Constructs the full path to the database file within the warehouse directory.
|
|
22
|
+
|
|
23
|
+
Parameters:
|
|
24
|
+
warehouse_path (Path): The path to the warehouse directory.
|
|
25
|
+
db_name (str): The name of the database file.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Path: The full path to the database file.
|
|
29
|
+
'''
|
|
30
|
+
return Path(warehouse_path) / db_name
|
|
31
|
+
|
|
32
|
+
def construct_database(db_path:Path,db_name:str = 'observations')->Path:
|
|
33
|
+
'''
|
|
34
|
+
Constructs the full path to the database file within the warehouse directory.
|
|
35
|
+
|
|
36
|
+
Parameters:
|
|
37
|
+
warehouse_path (Path): The path to the warehouse directory.
|
|
38
|
+
db_name (str): The name of the database file.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
Path: The full path to the database file.
|
|
42
|
+
'''
|
|
43
|
+
db_path = Path(db_path) / db_name
|
|
44
|
+
warehouse.init_db(warehouse_path=db_path)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
mpcaHydro/{WISKI.py → wiski.py}
RENAMED
|
@@ -157,7 +157,7 @@ def download_chunk(ts_id,start_year = 1996,end_year = 2030, interval = 4, as_jso
|
|
|
157
157
|
end = end_year
|
|
158
158
|
df = pywisk.get_ts(ts_id,start_date = f'{start}-01-01',end_date = f'{end}-12-31',as_json = as_json)
|
|
159
159
|
if not df.empty: frames.append(df)
|
|
160
|
-
df
|
|
160
|
+
df['Timestamp'] = pd.to_datetime(df['Timestamp']).dt.tz_localize(None)
|
|
161
161
|
time.sleep(.1)
|
|
162
162
|
return pd.concat(frames)
|
|
163
163
|
|
|
@@ -197,11 +197,8 @@ def tkn(station_nos,start_year = 1996,end_year = 2030):
|
|
|
197
197
|
return _download('TKN',station_nos,start_year,end_year)
|
|
198
198
|
|
|
199
199
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
Filter dataframe by valid quality codes
|
|
203
|
-
'''
|
|
204
|
-
return df.loc[df['Quality Code'].isin(DATA_CODES)]
|
|
200
|
+
|
|
201
|
+
|
|
205
202
|
|
|
206
203
|
def convert_units(df):
|
|
207
204
|
'''
|
|
@@ -233,12 +230,22 @@ def normalize_columns(df):
|
|
|
233
230
|
'station_no':'station_id',
|
|
234
231
|
'Timestamp':'datetime',
|
|
235
232
|
'Value':'value',
|
|
236
|
-
'ts_unitsymbol':'unit'
|
|
233
|
+
'ts_unitsymbol':'unit',
|
|
234
|
+
'Quality Code':'quality_code',
|
|
235
|
+
'Quality Code Name':'quality_code_name'}, inplace=True)
|
|
237
236
|
return df
|
|
238
237
|
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def filter_quality_codes(df, data_codes):
|
|
241
|
+
'''
|
|
242
|
+
Filter dataframe by valid quality codes
|
|
243
|
+
'''
|
|
244
|
+
return df.loc[df['quality_code'].isin(data_codes)]
|
|
245
|
+
|
|
239
246
|
def average_results(df):
|
|
240
|
-
df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
|
|
241
|
-
df['datetime'] = df['datetime'].dt.round('h')
|
|
247
|
+
#df['datetime'] = pd.to_datetime(df.loc[:,'datetime'])
|
|
248
|
+
df.loc[:,'datetime'] = df.loc[:,'datetime'].dt.round('h')
|
|
242
249
|
return df.groupby(['station_id', 'datetime', 'constituent', 'unit']).agg(value=('value', 'mean')).reset_index()
|
|
243
250
|
# Convert units
|
|
244
251
|
|
|
@@ -267,14 +274,35 @@ def calculate_baseflow(df, method = 'Boughton'):
|
|
|
267
274
|
return pd.concat(dfs)
|
|
268
275
|
|
|
269
276
|
|
|
270
|
-
def
|
|
277
|
+
def normalize(df):
|
|
271
278
|
'''
|
|
272
|
-
|
|
279
|
+
Standardize raw WISKI data into standardized format without transformations.
|
|
280
|
+
The standardized format includes normalized column names and units.
|
|
281
|
+
---
|
|
282
|
+
Parameters:
|
|
283
|
+
df (pandas.DataFrame): Raw WISKI data
|
|
284
|
+
Returns:
|
|
285
|
+
pandas.DataFrame: Normalized WISKI data
|
|
273
286
|
'''
|
|
274
|
-
|
|
287
|
+
|
|
275
288
|
df = convert_units(df)
|
|
276
289
|
df = normalize_columns(df)
|
|
290
|
+
return df
|
|
291
|
+
|
|
292
|
+
def transform(df, filter_qc_codes = True, data_codes = None, baseflow_method = 'Boughton'):
|
|
293
|
+
'''
|
|
294
|
+
Transform normalized WISKI data into standardized format
|
|
295
|
+
'''
|
|
296
|
+
df = normalize(df)
|
|
297
|
+
if filter_qc_codes:
|
|
298
|
+
if data_codes is None:
|
|
299
|
+
data_codes = DATA_CODES
|
|
300
|
+
df = filter_quality_codes(df, data_codes)
|
|
277
301
|
df = average_results(df)
|
|
278
302
|
df = calculate_baseflow(df, method = baseflow_method)
|
|
279
303
|
df['station_origin'] = 'wiski'
|
|
304
|
+
#df.set_index('datetime',inplace=True)
|
|
280
305
|
return df
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
|
mpcaHydro/xref.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent/'data/WISKI_EQUIS_XREF.csv')
|
|
5
|
+
#WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def are_lists_identical(nested_list):
|
|
9
|
+
# Sort each sublist
|
|
10
|
+
sorted_sublists = [sorted(sublist) for sublist in nested_list]
|
|
11
|
+
# Compare all sublists to the first one
|
|
12
|
+
return all(sublist == sorted_sublists[0] for sublist in sorted_sublists)
|
|
13
|
+
|
|
14
|
+
def get_wiski_stations():
|
|
15
|
+
return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
|
|
16
|
+
|
|
17
|
+
def get_equis_stations():
|
|
18
|
+
return list(WISKI_EQUIS_XREF['EQUIS_STATION_ID'].unique())
|
|
19
|
+
|
|
20
|
+
def wiski_equis_alias(wiski_station_id):
|
|
21
|
+
equis_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'WISKI_EQUIS_ID'].to_list()))
|
|
22
|
+
equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
|
|
23
|
+
if len(equis_ids) == 0:
|
|
24
|
+
return []
|
|
25
|
+
elif len(equis_ids) > 1:
|
|
26
|
+
print(f'Too Many Equis Stations for {wiski_station_id}')
|
|
27
|
+
raise
|
|
28
|
+
else:
|
|
29
|
+
return equis_ids[0]
|
|
30
|
+
|
|
31
|
+
def wiski_equis_associations(wiski_station_id):
|
|
32
|
+
equis_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'EQUIS_STATION_ID'].unique())
|
|
33
|
+
equis_ids = [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
|
|
34
|
+
if len(equis_ids) == 0:
|
|
35
|
+
return []
|
|
36
|
+
else:
|
|
37
|
+
return equis_ids
|
|
38
|
+
|
|
39
|
+
def equis_wiski_associations(equis_station_id):
|
|
40
|
+
wiski_ids = list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id,'WISKI_STATION_NO'].unique())
|
|
41
|
+
wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
|
|
42
|
+
if len(wiski_ids) == 0:
|
|
43
|
+
return []
|
|
44
|
+
else:
|
|
45
|
+
return wiski_ids
|
|
46
|
+
|
|
47
|
+
def equis_wiski_alias(equis_station_id):
|
|
48
|
+
wiski_ids = list(set(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_EQUIS_ID'] == equis_station_id,'WISKI_STATION_NO'].to_list()))
|
|
49
|
+
wiski_ids = [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
|
|
50
|
+
if len(wiski_ids) == 0:
|
|
51
|
+
return []
|
|
52
|
+
elif len(wiski_ids) > 1:
|
|
53
|
+
print(f'Too Many WISKI Stations for {equis_station_id}')
|
|
54
|
+
raise ValueError(f'Too Many WISKI Stations for {equis_station_id}')
|
|
55
|
+
else:
|
|
56
|
+
return wiski_ids[0]
|
|
57
|
+
|
|
58
|
+
def _equis_wiski_associations(equis_station_ids):
|
|
59
|
+
wiski_stations = [equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
|
|
60
|
+
if are_lists_identical(wiski_stations):
|
|
61
|
+
return wiski_stations[0]
|
|
62
|
+
else:
|
|
63
|
+
return []
|
|
64
|
+
|
|
65
|
+
def _stations_by_wid(wid_no,station_origin):
|
|
66
|
+
if station_origin in ['wiski','wplmn']:
|
|
67
|
+
station_col = 'WISKI_STATION_NO'
|
|
68
|
+
elif station_origin in ['equis','swd']:
|
|
69
|
+
station_col = 'EQUIS_STATION_ID'
|
|
70
|
+
else:
|
|
71
|
+
raise
|
|
72
|
+
|
|
73
|
+
return list(WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WID'] == wid_no,station_col].unique())
|
|
74
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mpcaHydro
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Python package for downloading MPCA hydrology data
|
|
5
5
|
Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
|
|
6
6
|
Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>
|
|
@@ -10,7 +10,9 @@ Keywords: Hydrology,MPCA
|
|
|
10
10
|
Classifier: Development Status :: 3 - Alpha
|
|
11
11
|
Classifier: Programming Language :: Python
|
|
12
12
|
Requires-Python: >=3.8
|
|
13
|
+
Requires-Dist: baseflow
|
|
13
14
|
Requires-Dist: duckdb
|
|
15
|
+
Requires-Dist: oracledb
|
|
14
16
|
Requires-Dist: pandas
|
|
15
17
|
Requires-Dist: pathlib
|
|
16
18
|
Requires-Dist: requests
|