mpcaHydro 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpcaHydro/data/WISKI_QUALITY_CODES.csv +71 -0
- mpcaHydro/data/outlet.duckdb +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data_manager.py +172 -292
- mpcaHydro/equis.py +31 -22
- mpcaHydro/etlSWD.py +21 -15
- mpcaHydro/outlets.py +367 -0
- mpcaHydro/reports.py +80 -0
- mpcaHydro/warehouse.py +525 -17
- mpcaHydro/warehouseManager.py +55 -0
- mpcaHydro/{WISKI.py → wiski.py} +97 -17
- mpcaHydro/xref.py +74 -0
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.1.dist-info}/METADATA +3 -1
- mpcahydro-2.2.1.dist-info/RECORD +23 -0
- mpcahydro-2.1.0.dist-info/RECORD +0 -15
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.1.dist-info}/WHEEL +0 -0
mpcaHydro/warehouse.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import duckdb
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from pathlib import Path
|
|
4
|
+
from mpcaHydro import outlets
|
|
4
5
|
|
|
5
6
|
def init_db(db_path: str,reset: bool = False):
|
|
6
7
|
"""
|
|
@@ -14,13 +15,16 @@ def init_db(db_path: str,reset: bool = False):
|
|
|
14
15
|
# Create all schemas
|
|
15
16
|
create_schemas(con)
|
|
16
17
|
|
|
17
|
-
# Create tables
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
18
|
+
# Create tables
|
|
19
|
+
create_outlets_tables(con)
|
|
20
|
+
create_mapping_tables(con)
|
|
21
|
+
create_staging_tables(con)
|
|
22
|
+
create_analytics_tables(con)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Create views
|
|
26
|
+
update_views(con)
|
|
27
|
+
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
def create_schemas(con: duckdb.DuckDBPyConnection):
|
|
@@ -30,8 +34,331 @@ def create_schemas(con: duckdb.DuckDBPyConnection):
|
|
|
30
34
|
con.execute("CREATE SCHEMA IF NOT EXISTS staging")
|
|
31
35
|
con.execute("CREATE SCHEMA IF NOT EXISTS analytics")
|
|
32
36
|
con.execute("CREATE SCHEMA IF NOT EXISTS reports")
|
|
33
|
-
con.execute("CREATE SCHEMA IF NOT EXISTS
|
|
37
|
+
con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
|
|
38
|
+
con.execute("CREATE SCHEMA IF NOT EXISTS mappings")
|
|
39
|
+
|
|
40
|
+
def create_staging_tables(con: duckdb.DuckDBPyConnection):
    """
    Ensure the raw landing tables ``staging.equis`` and ``staging.wiski`` exist.

    Both statements use ``CREATE TABLE IF NOT EXISTS``, so tables that are
    already present are left untouched. The column lists were copied directly
    from the database DDL and have to be edited by hand if the sources change.
    """
    equis_ddl = """
        CREATE TABLE IF NOT EXISTS staging.equis(
        LATITUDE DOUBLE,
        LONGITUDE DOUBLE,
        WID_LIST VARCHAR,
        SAMPLE_METHOD VARCHAR,
        SAMPLE_REMARK VARCHAR,
        FACILITY_ID BIGINT,
        FACILITY_NAME VARCHAR,
        FACILITY_TYPE VARCHAR,
        SYS_LOC_CODE VARCHAR,
        LOC_NAME VARCHAR,
        LOC_TYPE VARCHAR,
        LOC_TYPE_2 VARCHAR,
        TASK_CODE VARCHAR,
        SAMPLE_ID BIGINT,
        SYS_SAMPLE_CODE VARCHAR,
        TEST_ID BIGINT,
        ANALYTE_TYPE VARCHAR,
        ANALYTE_TYPE_DESC VARCHAR,
        ANALYTIC_METHOD VARCHAR,
        PREFERRED_NAME VARCHAR,
        PARAMETER VARCHAR,
        CAS_RN VARCHAR,
        CHEMICAL_NAME VARCHAR,
        GTLT VARCHAR,
        RESULT_TEXT VARCHAR,
        RESULT_NUMERIC DOUBLE,
        RESULT_UNIT VARCHAR,
        STAT_TYPE INTEGER,
        VALUE_TYPE VARCHAR,
        DETECT_FLAG VARCHAR,
        DETECT_DESC VARCHAR,
        RESULT_REMARK VARCHAR,
        RESULT_TYPE_CODE VARCHAR,
        METHOD_DETECTION_LIMIT VARCHAR,
        REPORTING_DETECTION_LIMIT VARCHAR,
        QUANTITATION_LIMIT INTEGER,
        LAB_QUALIFIERS VARCHAR,
        INTERPRETED_QUALIFIERS VARCHAR,
        REPORTABLE_RESULT VARCHAR,
        APPROVAL_CODE VARCHAR,
        SENSITIVE_NOTPUBLIC VARCHAR,
        TEST_TYPE VARCHAR,
        DILUTION_FACTOR DOUBLE,
        FRACTION VARCHAR,
        BASIS VARCHAR,
        TEMP_BASIS VARCHAR,
        TEST_REMARK VARCHAR,
        ANALYSIS_DATE_TIME TIMESTAMP_NS,
        ANALYSIS_DATE VARCHAR,
        ANALYSIS_TIME VARCHAR,
        ANALYSIS_DATE_TIMEZONE VARCHAR,
        COMPANY_NAME VARCHAR,
        LAB_NAME_CODE VARCHAR,
        LAB_SAMPLE_ID VARCHAR,
        SAMPLE_TYPE_GROUP VARCHAR,
        SAMPLE_TYPE_CODE VARCHAR,
        SAMPLE_TYPE_DESC VARCHAR,
        MEDIUM_CODE VARCHAR,
        MATRIX_CODE VARCHAR,
        START_DEPTH DOUBLE,
        DEPTH_UNIT VARCHAR,
        SAMPLE_DATE_TIME TIMESTAMP_NS,
        SAMPLE_DATE VARCHAR,
        SAMPLE_TIME VARCHAR,
        SAMPLE_DATE_TIMEZONE VARCHAR,
        EBATCH DOUBLE);
    """
    wiski_ddl = """
        CREATE TABLE IF NOT EXISTS staging.wiski(
        "Timestamp" VARCHAR,
        "Value" DOUBLE,
        "Quality Code" BIGINT,
        "Quality Code Name" VARCHAR,
        ts_unitsymbol VARCHAR,
        ts_name VARCHAR,
        ts_id VARCHAR,
        station_no VARCHAR,
        station_name VARCHAR,
        station_latitude VARCHAR,
        station_longitude VARCHAR,
        parametertype_id VARCHAR,
        parametertype_name VARCHAR,
        stationparameter_no VARCHAR,
        stationparameter_name VARCHAR,
        wplmn_flag BIGINT);
    """
    # EQuIS first, then WISKI: the same order the statements were issued in before.
    for ddl in (equis_ddl, wiski_ddl):
        con.execute(ddl)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def create_analytics_tables(con: duckdb.DuckDBPyConnection):
    """
    Ensure ``analytics.equis`` and ``analytics.wiski`` exist.

    The two tables share one column layout (datetime, value, station_id,
    station_origin, constituent, unit), so a single statement template is
    executed once per source. ``IF NOT EXISTS`` leaves existing tables alone.
    """
    for source in ("equis", "wiski"):
        con.execute(f"""
            CREATE TABLE IF NOT EXISTS analytics.{source} (
            datetime TIMESTAMP,
            value DOUBLE,
            station_id TEXT,
            station_origin TEXT,
            constituent TEXT,
            unit TEXT
            );
        """)
|
|
158
|
+
|
|
159
|
+
def create_mapping_tables(con: duckdb.DuckDBPyConnection):
    """
    Build the lookup tables of the ``mappings`` schema.

    Four tables are created, each only if it does not exist yet:

    * ``mappings.wiski_parametertype`` - WISKI parametertype_id -> constituent
    * ``mappings.equis_casrn`` - EQuIS cas_rn -> constituent
    * ``mappings.station_xref`` - read from ``data/WISKI_EQUIS_XREF.csv``
    * ``mappings.wiski_quality_codes`` - read from ``data/WISKI_QUALITY_CODES.csv``

    The CSV files are looked up in the ``data`` folder next to this module.
    A missing file only prints a warning; its table is then not created.
    """
    # WISKI parametertype_id -> constituent
    parametertype_to_constituent = {
        '11522': 'TP',
        '11531': 'TP',
        '11532': 'TSS',
        '11523': 'TSS',
        '11526': 'N',
        '11519': 'N',
        '11520': 'OP',
        '11528': 'OP',
        '11530': 'TKN',
        '11521': 'TKN',
        '11500': 'Q',
        '11504': 'WT',
        '11533': 'DO',
        '11507': 'WL',
    }
    # The SQL below refers to this DataFrame by its Python variable name,
    # so the local must keep the name ``df_wiski_params``.
    df_wiski_params = pd.DataFrame(
        list(parametertype_to_constituent.items()),
        columns=['parametertype_id', 'constituent'],
    )
    con.execute("CREATE TABLE IF NOT EXISTS mappings.wiski_parametertype AS SELECT * FROM df_wiski_params")

    # EQuIS cas_rn -> constituent
    casrn_to_constituent = {
        '479-61-8': 'CHLA',
        'CHLA-CORR': 'CHLA',
        'BOD': 'BOD',
        'NO2NO3': 'N',
        '14797-55-8': 'NO3',
        '14797-65-0': 'NO2',
        '14265-44-2': 'OP',
        'N-KJEL': 'TKN',
        'PHOSPHATE-P': 'TP',
        '7723-14-0': 'TP',
        'SOLIDS-TSS': 'TSS',
        'TEMP-W': 'WT',
        '7664-41-7': 'NH3',
    }
    # Same constraint as above: the SQL names ``df_equis_cas`` directly.
    df_equis_cas = pd.DataFrame(
        list(casrn_to_constituent.items()),
        columns=['cas_rn', 'constituent'],
    )
    con.execute("CREATE TABLE IF NOT EXISTS mappings.equis_casrn AS SELECT * FROM df_equis_cas")

    # CSV-backed tables: (target table, file name inside the data folder).
    module_dir = Path(__file__).parent
    csv_tables = (
        ('station_xref', 'WISKI_EQUIS_XREF.csv'),
        ('wiski_quality_codes', 'WISKI_QUALITY_CODES.csv'),
    )
    for table, csv_name in csv_tables:
        csv_path = module_dir / f'data/{csv_name}'
        if not csv_path.exists():
            print(f"Warning: {csv_name} not found at {csv_path}")
            continue
        con.execute(f"CREATE TABLE IF NOT EXISTS mappings.{table} AS SELECT * FROM read_csv_auto('{csv_path.as_posix()}')")
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
    """
    Copy every table and view of an external outlets DuckDB file into ``con``.

    The file is attached under the alias ``outlets_db``, each of its tables is
    copied with ``CREATE TABLE ... AS SELECT``, each of its views is recreated
    from its stored definition, and the alias is detached again.

    Parameters:
        con: Open connection to the destination database.
        outlets_db_path: Path of the DuckDB file holding the outlet definitions.

    Notes:
        Tables are created with plain ``CREATE TABLE``, so the call fails if a
        table of the same name already exists in the destination.
    """
    create_schemas(con)

    # Double any single quote so the path cannot terminate the SQL literal.
    escaped_path = str(outlets_db_path).replace("'", "''")
    # Attach exactly once: attaching the same alias a second time raises.
    con.execute(f"ATTACH DATABASE '{escaped_path}' AS outlets_db;")
    try:
        # -- Step 1: Copy all tables --
        tables = con.execute("SHOW TABLES FROM outlets_db").fetchall()
        print(f"Tables in the source database: {tables}")

        for row in tables:
            # Quote the identifier so names with spaces or keywords survive.
            quoted_name = '"' + row[0].replace('"', '""') + '"'
            con.execute(f"CREATE TABLE {quoted_name} AS SELECT * FROM outlets_db.{quoted_name}")

        # -- Step 2: Copy all views --
        # duckdb_views() exposes each view's CREATE statement in its ``sql``
        # column; it replaces the SHOW VIEWS / SHOW CREATE VIEW statements.
        view_rows = con.execute(
            "SELECT view_name, sql FROM duckdb_views() "
            "WHERE database_name = 'outlets_db' AND NOT internal"
        ).fetchall()
        views = [(view_name,) for view_name, _ in view_rows]
        print(f"Views in the source database: {views}")

        for _, create_view_sql in view_rows:
            # Drop the ``outlets_db.`` qualifier so the view is recreated in,
            # and reads from, the destination database.
            con.execute(create_view_sql.replace("outlets_db.", ""))
    finally:
        # Always release the source file, even when a copy step failed.
        con.execute("DETACH outlets_db")
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def create_outlets_tables(con: duckdb.DuckDBPyConnection):
    """
    Set up the outlet-station-reach tables of the outlets schema.

    Executes the SQL text held in ``outlets.OUTLETS_SCHEMA`` and then hands
    the connection to ``outlets.build_outlets``; both come from the
    ``outlets`` module.
    """
    con.execute(outlets.OUTLETS_SCHEMA)
    outlets.build_outlets(con)
|
|
262
|
+
|
|
263
|
+
def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
    """
    Create (or replace) the view ``analytics.wiski_normalized``.

    The view reads ``staging.wiski`` and applies three normalizations:

    * values in degrees Celsius are converted to degrees Fahrenheit and
      values in kg to lb; every other value passes through unchanged
    * unit symbols are renamed accordingly (``degf``, ``lb``) and ``ft³/s``
      becomes ``cfs``
    * columns are renamed to the analytics naming (``station_id``,
      ``datetime``, ``quality_code``, ``quality_code_name``)

    The ``constituent`` column comes from ``mappings.wiski_parametertype``
    via ``parametertype_id``; rows without a mapping keep a NULL constituent.
    """
    # NOTE(review): the staging.wiski DDL in this module has no ``constituent``
    # column, so selecting it straight from staging cannot bind. The join below
    # assumes the parametertype mapping is the intended source - confirm.
    con.execute("""
        -- Create a single view with all transformations
        CREATE OR REPLACE VIEW analytics.wiski_normalized AS
        SELECT

            -- Convert °C to °F and kg to lb; keep other values unchanged
            CASE
                WHEN LOWER(w.ts_unitsymbol) = '°c' THEN (w."Value" * 9.0 / 5.0) + 32
                WHEN w.ts_unitsymbol = 'kg' THEN w."Value" * 2.20462
                ELSE w."Value"
            END AS value,

            -- Normalize units
            CASE
                WHEN LOWER(w.ts_unitsymbol) = '°c' THEN 'degf'
                WHEN w.ts_unitsymbol = 'kg' THEN 'lb'
                WHEN w.ts_unitsymbol = 'ft³/s' THEN 'cfs'
                ELSE w.ts_unitsymbol
            END AS unit,

            -- Normalize column names
            w.station_no AS station_id,
            w."Timestamp" AS datetime,
            w."Quality Code" AS quality_code,
            w."Quality Code Name" AS quality_code_name,
            w.parametertype_id,
            m.constituent
        FROM staging.wiski AS w
        LEFT JOIN mappings.wiski_parametertype AS m
            ON w.parametertype_id = m.parametertype_id;""")
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
    """
    Create (or replace) the view ``analytics.wiski_filtered``.

    The view keeps only the rows of ``analytics.wiski_normalized`` whose
    ``quality_code`` is one of ``data_codes``.

    Parameters:
        con: Open connection to the warehouse database.
        data_codes: Quality codes to keep. Each entry must be convertible
            with ``int()``. An empty list produces a view with no rows.

    Raises:
        ValueError / TypeError: If an entry of ``data_codes`` is not an integer.
    """
    # The codes are written into the statement as literals because the view
    # stores its SQL text. int() guarantees that only numeric literals are
    # interpolated, whatever the caller passed in.
    code_literals = ', '.join(str(int(code)) for code in data_codes)

    # ``IN ()`` is not valid SQL, so an empty list becomes an always-false filter.
    predicate = f"quality_code IN ({code_literals})" if code_literals else "FALSE"

    con.execute(f"""
        CREATE OR REPLACE VIEW analytics.wiski_filtered AS
        SELECT *
        FROM analytics.wiski_normalized
        WHERE {predicate};
    """)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def create_aggregated_wiski_view(con: duckdb.DuckDBPyConnection):
    """
    Build ``analytics.wiski_aggregated`` from ``analytics.wiski_normalized``.

    Despite the function name, the statement creates (or replaces) a table,
    not a view. Each row is the average value per station, constituent, unit
    and one-hour bucket (``hour_start``).
    """
    aggregate_sql = """
        CREATE OR REPLACE Table analytics.wiski_aggregated AS
        SELECT
        station_id,
        constituent,
        time_bucket(INTERVAL '1 hour', datetime) AS hour_start,
        AVG(value) AS value,
        unit
        FROM analytics.wiski_normalized
        GROUP BY
        station_id,
        constituent,
        hour_start,
        unit;
    """
    con.execute(aggregate_sql)
|
|
335
|
+
|
|
336
|
+
def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
    """
    Create (or replace) the view ``reports.wiski_qc_count``.

    The view counts the rows of ``staging.wiski`` per station, parameter type
    and quality code, and attaches the code's ``Text`` and ``Description``
    from ``mappings.wiski_quality_codes``. Only codes whose mapping row has
    ``Active = 1`` are reported.
    """
    qc_count_sql = """
        CREATE OR REPLACE VIEW reports.wiski_qc_count AS (
        SELECT
        w.station_no,
        w.parametertype_name,
        w."Quality Code",
        COUNT(w."Quality Code") AS count,
        wqc."Text",
        wqc.Description,

        FROM staging.wiski w
        LEFT JOIN mappings.wiski_quality_codes wqc
        ON w."Quality Code" = wqc.quality_code
        WHERE wqc.Active = 1
        GROUP BY
        w."Quality Code",wqc."Text",wqc.Description,w.parametertype_name, w.station_no
        );
    """
    con.execute(qc_count_sql)
|
|
358
|
+
# ORDER BY
|
|
359
|
+
# w.station_no,w.parametertype_name, w."Quality Code"
|
|
360
|
+
# )
|
|
361
|
+
# """)
|
|
35
362
|
|
|
36
363
|
def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
|
|
37
364
|
"""
|
|
@@ -45,7 +372,103 @@ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
|
|
|
45
372
|
SELECT datetime,value,station_id,station_origin,constituent,unit
|
|
46
373
|
FROM analytics.wiski;
|
|
47
374
|
""")
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
    """
    Create (or replace) the view ``analytics.outlet_observations``.

    Observations from ``analytics.observations`` are matched to
    ``outlets.outlet_stations`` on station id and station origin, then
    averaged and counted per outlet, constituent and datetime. Stations
    without an outlet id are left out.
    """
    outlet_obs_sql = """
        CREATE OR REPLACE VIEW analytics.outlet_observations AS
        SELECT
        o.datetime,
        os.outlet_id,
        o.constituent,
        AVG(o.value) AS value,
        COUNT(o.value) AS count
        FROM
        analytics.observations AS o
        INNER JOIN
        outlets.outlet_stations AS os ON o.station_id = os.station_id AND o.station_origin = os.station_origin
        WHERE os.outlet_id IS NOT NULL
        GROUP BY
        os.outlet_id,
        o.constituent,
        o.datetime; -- Group by the truncated date
    """
    con.execute(outlet_obs_sql)
|
|
399
|
+
# ORDER BY
|
|
400
|
+
# os.outlet_id,
|
|
401
|
+
# o.constituent,
|
|
402
|
+
# datetime);
|
|
403
|
+
|
|
404
|
+
|
|
48
405
|
|
|
406
|
+
def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):
    """
    Create (or replace) the view ``analytics.outlet_observations_with_flow``.

    Every non-flow row of ``analytics.outlet_observations`` (constituent not
    'Q' or 'QB') is paired with the flow ('Q') and baseflow ('QB') values
    recorded for the same outlet and datetime. Both pairings are LEFT JOINs,
    so a constituent row without matching flow or baseflow is still returned,
    with NULL in ``flow_value`` / ``baseflow_value``.
    """
    with_flow_sql = """
        CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
        WITH
        -- Extract baseflow data (constituent = 'QB')
        baseflow_data AS (
        SELECT
        outlet_id,
        datetime,
        "value" AS baseflow_value
        FROM
        analytics.outlet_observations
        WHERE
        constituent = 'QB'
        ),

        -- Extract flow data (constituent = 'Q')
        flow_data AS (
        SELECT
        outlet_id,
        datetime,
        "value" AS flow_value
        FROM
        analytics.outlet_observations
        WHERE
        constituent = 'Q'
        ),

        -- Extract all other constituent data (not 'Q' or 'QB')
        constituent_data AS (
        SELECT
        outlet_id,
        datetime,
        constituent,
        "value",
        count
        FROM
        analytics.outlet_observations
        WHERE
        constituent NOT IN ('Q', 'QB')
        )

        -- Final join: Only include rows that have baseflow, flow, and constituent data
        SELECT
        c.outlet_id,
        c.constituent,
        c.datetime,
        c."value",
        c.count,
        f.flow_value,
        b.baseflow_value
        FROM
        constituent_data AS c
        LEFT JOIN
        flow_data AS f
        ON c.outlet_id = f.outlet_id
        AND c.datetime = f.datetime
        LEFT JOIN
        baseflow_data AS b
        ON c.outlet_id = b.outlet_id
        AND c.datetime = b.datetime;"""
    con.execute(with_flow_sql)
|
|
468
|
+
# ORDER BY
|
|
469
|
+
# constituent_data.outlet_id,
|
|
470
|
+
# constituent_data.datetime;
|
|
471
|
+
#
|
|
49
472
|
|
|
50
473
|
def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
|
|
51
474
|
"""
|
|
@@ -66,11 +489,44 @@ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
|
|
|
66
489
|
FROM
|
|
67
490
|
analytics.observations
|
|
68
491
|
GROUP BY
|
|
69
|
-
constituent,station_id,station_origin
|
|
70
|
-
|
|
71
|
-
|
|
492
|
+
constituent,station_id,station_origin;
|
|
493
|
+
''')
|
|
494
|
+
|
|
495
|
+
# ORDER BY
|
|
496
|
+
# constituent,sample_count;''')
|
|
497
|
+
|
|
498
|
+
def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
    """
    Create (or replace) the view ``reports.outlet_constituent_summary``.

    One row per outlet and constituent of ``analytics.outlet_observations``,
    with the row count, the average, minimum and maximum value, and the year
    of the earliest and latest datetime (``start_date`` / ``end_date``).
    """
    summary_sql = """
        CREATE OR REPLACE VIEW reports.outlet_constituent_summary AS
        SELECT
        outlet_id,
        constituent,
        count_star() AS sample_count,
        avg("value") AS average_value,
        min("value") AS min_value,
        max("value") AS max_value,
        "year"(min(datetime)) AS start_date,
        "year"(max(datetime)) AS end_date
        FROM
        analytics.outlet_observations
        GROUP BY
        constituent,
        outlet_id
    """
    con.execute(summary_sql)
|
|
72
516
|
|
|
517
|
+
|
|
73
518
|
|
|
519
|
+
def update_views(con: duckdb.DuckDBPyConnection):
    """
    Recreate the warehouse views by running each view builder in turn.

    The order matters: the outlet views read ``analytics.observations`` and
    ``analytics.outlet_observations``, so their builders run after the views
    they select from.
    """
    view_builders = (
        create_staging_qc_count_view,
        create_combined_observations_view,
        create_constituent_summary_report,
        create_outlet_observations_view,
        create_outlet_observations_with_flow_view,
        create_outlet_summary_report,
    )
    for build_view in view_builders:
        build_view(con)
|
|
529
|
+
|
|
74
530
|
def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
|
|
75
531
|
"""
|
|
76
532
|
Returns a DuckDB connection to the given database path.
|
|
@@ -82,16 +538,69 @@ def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
|
|
|
82
538
|
return duckdb.connect(database=db_path.as_posix(), read_only=read_only)
|
|
83
539
|
|
|
84
540
|
|
|
85
|
-
def
|
|
541
|
+
def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str):
    """
    Delete all rows of one station from the staging and analytics tables.

    Rows matching both ``station_id`` and ``station_origin`` are removed from
    ``staging.equis``, ``staging.wiski``, ``analytics.equis`` and
    ``analytics.wiski``; afterwards the views are rebuilt via ``update_views``.

    Parameters:
        con: Open connection to the warehouse database.
        station_id: Identifier of the station to remove.
        station_origin: Origin label stored alongside the station id.
    """
    # NOTE(review): the staging DDL in create_staging_tables defines neither
    # ``station_id`` nor ``station_origin`` (staging.equis has SYS_LOC_CODE,
    # staging.wiski has station_no), so the two staging deletes only bind if
    # the staging tables are created elsewhere with those columns - confirm.
    station_key = [station_id, station_origin]
    target_tables = (
        "staging.equis",
        "staging.wiski",
        "analytics.equis",
        "analytics.wiski",
    )
    for table in target_tables:
        # Values are bound as parameters instead of being pasted into the SQL,
        # so quotes inside an id cannot break or alter the statement.
        con.execute(
            f"DELETE FROM {table} WHERE station_id = ? AND station_origin = ?",
            station_key,
        )
    update_views(con)
|
|
550
|
+
|
|
551
|
+
def get_column_names(con: duckdb.DuckDBPyConnection, table_schema: str, table_name: str) -> list:
    """
    Return the column names of ``table_schema.table_name``.

    The names are read from ``information_schema.columns``; schema and table
    are passed as bound parameters. A table that is not found yields an
    empty list.
    """
    rows = con.execute(
        """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = ? AND table_schema = ?
        """,
        [table_name, table_schema],
    ).fetchall()
    return [row[0] for row in rows]
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def add_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_schema: str, table_name: str):
    """
    Append the rows of a pandas DataFrame to an existing DuckDB table.

    Only the columns that the target table defines are inserted; extra
    DataFrame columns are ignored. Columns are matched by name, so the column
    order of ``df`` does not matter. The table is NOT created by this
    function and has to exist already.

    Parameters:
        con: Open connection to the warehouse database.
        df: Rows to append. Must contain every column of the target table.
        table_schema: Schema of the target table.
        table_name: Name of the target table.

    Raises:
        ValueError: If the target table has no columns (it does not exist).
        KeyError: If ``df`` lacks a column of the target table (from pandas).
    """
    existing_columns = get_column_names(con, table_schema, table_name)
    if not existing_columns:
        raise ValueError(
            f"Table {table_schema}.{table_name} does not exist or has no columns"
        )

    # A flat list selects several columns; wrapping it in a second list
    # (``df[[existing_columns]]``) is not a valid column selection.
    subset = df[existing_columns]

    # Name the columns on both sides of the INSERT so the mapping never
    # depends on column order. Embedded double quotes are doubled.
    column_list = ", ".join(
        '"' + name.replace('"', '""') + '"' for name in existing_columns
    )

    con.register("tmp_df", subset)
    try:
        con.execute(f"""
            INSERT INTO {table_schema}.{table_name} ({column_list})
            SELECT {column_list} FROM tmp_df
        """)
    finally:
        # Drop the temporary registration even when the insert fails.
        con.unregister("tmp_df")
|
|
586
|
+
|
|
587
|
+
def add_station_data(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str, table_schema: str, table_name: str, df: pd.DataFrame, replace: bool = False):
    """
    Append one station's data to ``table_schema.table_name``.

    With ``replace=True`` the station is first removed through
    ``drop_station_id``, which clears it from the staging and analytics
    tables, not just from the target table. The rows of ``df`` are then
    appended with ``add_to_table``.
    """
    if replace:
        drop_station_id(con, station_id, station_origin)

    add_to_table(con, df, table_schema, table_name)
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
    """
    Persist a pandas DataFrame into a DuckDB table.

    The table is written with ``CREATE OR REPLACE TABLE``, so an existing
    table of the same name is always overwritten.

    Parameters:
        con: Open connection to the warehouse database.
        df: Data to store.
        table_name: Target table name, optionally schema-qualified.
    """
    # register pandas DF and create table
    con.register("tmp_df", df)
    try:
        con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM tmp_df")
    finally:
        # Do not leave the temporary registration behind if the write fails.
        con.unregister("tmp_df")
|
|
96
605
|
|
|
97
606
|
def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
|
|
@@ -106,7 +615,6 @@ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_n
|
|
|
106
615
|
con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
|
|
107
616
|
con.unregister("tmp_df")
|
|
108
617
|
|
|
109
|
-
|
|
110
618
|
def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
|
|
111
619
|
"""
|
|
112
620
|
Persist a CSV file into a staging table. This will overwrite the staging
|
|
@@ -118,7 +626,7 @@ def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_nam
|
|
|
118
626
|
CREATE TABLE staging.{table_name} AS
|
|
119
627
|
SELECT * FROM read_csv_auto('{csv_path}', {', '.join(f"{k}={repr(v)}" for k, v in read_csv_kwargs.items())})
|
|
120
628
|
""")
|
|
121
|
-
|
|
629
|
+
|
|
122
630
|
def load_parquet_to_staging(con: duckdb.DuckDBPyConnection, parquet_path: str, table_name: str, replace: bool = True):
|
|
123
631
|
"""
|
|
124
632
|
Persist a Parquet file into a staging table. This will overwrite the staging
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
|
|
2
|
+
import pandas as pd
|
|
3
|
+
#from abc import abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from mpcaHydro import equis, wiski, warehouse
|
|
6
|
+
import duckdb
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
#%%
|
|
13
|
+
'''
|
|
14
|
+
This module contains classes and functions to manage data downloads and storage in a local data warehouse.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
'''
|
|
18
|
+
|
|
19
|
+
def get_db_path(warehouse_path: Path, db_name: str = 'observations') -> Path:
    '''
    Build the path of a database file inside the warehouse directory.

    Parameters:
        warehouse_path (Path): The warehouse directory (a string is accepted too).
        db_name (str): File name of the database; defaults to 'observations'.

    Returns:
        Path: ``warehouse_path`` joined with ``db_name``.
    '''
    warehouse_dir = Path(warehouse_path)
    return warehouse_dir.joinpath(db_name)
|
|
31
|
+
|
|
32
|
+
def construct_database(db_path: Path, db_name: str = 'observations') -> Path:
    '''
    Initialise the warehouse database ``db_name`` inside the directory ``db_path``.

    Parameters:
        db_path (Path): The directory that holds the database file.
        db_name (str): The name of the database file.

    Returns:
        Path: The full path of the database that was initialised.
    '''
    db_path = Path(db_path) / db_name
    # init_db names its first parameter ``db_path``; it has no
    # ``warehouse_path`` keyword, so the path is passed positionally.
    warehouse.init_db(db_path)
    return db_path
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
    """
    Create a view in the database that contains normalized WISKI data.

    Issues a ``CREATE OR REPLACE VIEW analytics.normalized_wiski`` statement
    on the given connection.
    """
    # NOTE(review): the statement ends at ``SELECT *`` and has no FROM clause,
    # so it names no source relation. It looks unfinished - confirm the
    # intended source before relying on this function.
    con.execute("""
CREATE OR REPLACE VIEW analytics.normalized_wiski AS
SELECT
*""")
|
|
55
|
+
|