mpcaHydro 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- mpcaHydro/data/{outlets.duckdb → outlet.duckdb} +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data_manager.py +105 -60
- mpcaHydro/etlSWD.py +21 -15
- mpcaHydro/outlets.py +70 -74
- mpcaHydro/reports.py +1 -1
- mpcaHydro/warehouse.py +276 -146
- mpcaHydro/warehouseManager.py +8 -0
- mpcaHydro/wiski.py +57 -5
- {mpcahydro-2.2.0.dist-info → mpcahydro-2.2.1.dist-info}/METADATA +1 -1
- mpcahydro-2.2.1.dist-info/RECORD +23 -0
- mpcahydro-2.2.0.dist-info/RECORD +0 -23
- {mpcahydro-2.2.0.dist-info → mpcahydro-2.2.1.dist-info}/WHEEL +0 -0
mpcaHydro/warehouse.py
CHANGED
@@ -18,11 +18,14 @@ def init_db(db_path: str, reset: bool = False):
     # Create tables
     create_outlets_tables(con)
     create_mapping_tables(con)
+    create_staging_tables(con)
     create_analytics_tables(con)
+

     # Create views
-
-
+    update_views(con)
+
+

 def create_schemas(con: duckdb.DuckDBPyConnection):
     """
@@ -34,6 +37,100 @@ def create_schemas(con: duckdb.DuckDBPyConnection):
     con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
     con.execute("CREATE SCHEMA IF NOT EXISTS mappings")

+def create_staging_tables(con: duckdb.DuckDBPyConnection):
+    '''
+    Create necessary tables in the staging schema. These were copied directly from database DDL. Would need to be updated if sources change.
+    '''
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS staging.equis(
+            LATITUDE DOUBLE,
+            LONGITUDE DOUBLE,
+            WID_LIST VARCHAR,
+            SAMPLE_METHOD VARCHAR,
+            SAMPLE_REMARK VARCHAR,
+            FACILITY_ID BIGINT,
+            FACILITY_NAME VARCHAR,
+            FACILITY_TYPE VARCHAR,
+            SYS_LOC_CODE VARCHAR,
+            LOC_NAME VARCHAR,
+            LOC_TYPE VARCHAR,
+            LOC_TYPE_2 VARCHAR,
+            TASK_CODE VARCHAR,
+            SAMPLE_ID BIGINT,
+            SYS_SAMPLE_CODE VARCHAR,
+            TEST_ID BIGINT,
+            ANALYTE_TYPE VARCHAR,
+            ANALYTE_TYPE_DESC VARCHAR,
+            ANALYTIC_METHOD VARCHAR,
+            PREFERRED_NAME VARCHAR,
+            PARAMETER VARCHAR,
+            CAS_RN VARCHAR,
+            CHEMICAL_NAME VARCHAR,
+            GTLT VARCHAR,
+            RESULT_TEXT VARCHAR,
+            RESULT_NUMERIC DOUBLE,
+            RESULT_UNIT VARCHAR,
+            STAT_TYPE INTEGER,
+            VALUE_TYPE VARCHAR,
+            DETECT_FLAG VARCHAR,
+            DETECT_DESC VARCHAR,
+            RESULT_REMARK VARCHAR,
+            RESULT_TYPE_CODE VARCHAR,
+            METHOD_DETECTION_LIMIT VARCHAR,
+            REPORTING_DETECTION_LIMIT VARCHAR,
+            QUANTITATION_LIMIT INTEGER,
+            LAB_QUALIFIERS VARCHAR,
+            INTERPRETED_QUALIFIERS VARCHAR,
+            REPORTABLE_RESULT VARCHAR,
+            APPROVAL_CODE VARCHAR,
+            SENSITIVE_NOTPUBLIC VARCHAR,
+            TEST_TYPE VARCHAR,
+            DILUTION_FACTOR DOUBLE,
+            FRACTION VARCHAR,
+            BASIS VARCHAR,
+            TEMP_BASIS VARCHAR,
+            TEST_REMARK VARCHAR,
+            ANALYSIS_DATE_TIME TIMESTAMP_NS,
+            ANALYSIS_DATE VARCHAR,
+            ANALYSIS_TIME VARCHAR,
+            ANALYSIS_DATE_TIMEZONE VARCHAR,
+            COMPANY_NAME VARCHAR,
+            LAB_NAME_CODE VARCHAR,
+            LAB_SAMPLE_ID VARCHAR,
+            SAMPLE_TYPE_GROUP VARCHAR,
+            SAMPLE_TYPE_CODE VARCHAR,
+            SAMPLE_TYPE_DESC VARCHAR,
+            MEDIUM_CODE VARCHAR,
+            MATRIX_CODE VARCHAR,
+            START_DEPTH DOUBLE,
+            DEPTH_UNIT VARCHAR,
+            SAMPLE_DATE_TIME TIMESTAMP_NS,
+            SAMPLE_DATE VARCHAR,
+            SAMPLE_TIME VARCHAR,
+            SAMPLE_DATE_TIMEZONE VARCHAR,
+            EBATCH DOUBLE);
+        """)
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS staging.wiski(
+            "Timestamp" VARCHAR,
+            "Value" DOUBLE,
+            "Quality Code" BIGINT,
+            "Quality Code Name" VARCHAR,
+            ts_unitsymbol VARCHAR,
+            ts_name VARCHAR,
+            ts_id VARCHAR,
+            station_no VARCHAR,
+            station_name VARCHAR,
+            station_latitude VARCHAR,
+            station_longitude VARCHAR,
+            parametertype_id VARCHAR,
+            parametertype_name VARCHAR,
+            stationparameter_no VARCHAR,
+            stationparameter_name VARCHAR,
+            wplmn_flag BIGINT);
+        """)
+
+
 def create_analytics_tables(con: duckdb.DuckDBPyConnection):
     """
     Create necessary tables in the analytics schema.
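The staging DDL above mirrors the source exports column-for-column, so appends only succeed when an incoming frame matches it exactly. A minimal sketch of the pattern, assuming the duckdb and pandas packages and using a trimmed three-column stand-in for staging.wiski:

    import duckdb
    import pandas as pd

    con = duckdb.connect(":memory:")
    con.execute("CREATE SCHEMA IF NOT EXISTS staging")
    con.execute("""
        CREATE TABLE IF NOT EXISTS staging.wiski(
            "Timestamp" VARCHAR,
            "Value" DOUBLE,
            station_no VARCHAR);
    """)
    df = pd.DataFrame({"Timestamp": ["2024-01-01"], "Value": [1.5], "station_no": ["H123"]})
    con.execute("INSERT INTO staging.wiski SELECT * FROM df")  # DuckDB resolves the local frame by name
    print(con.execute("SELECT * FROM staging.wiski").fetchall())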
@@ -117,64 +214,51 @@ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
     else:
         print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")

+
+def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
+    """
+    Attach an external DuckDB database containing outlet definitions.
+    """
+    create_schemas(con)
+
+    con.execute(f"ATTACH DATABASE '{outlets_db_path}' AS outlets_db;")
+
+    tables = con.execute("SHOW TABLES FROM outlets_db").fetchall()
+    print(f"Tables in the source database: {tables}")
+
+    for table in tables:
+        table_name = table[0]  # Extract table name
+        con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM outlets_db.{table_name}")  # Copy table contents
+
+    # -- Step 2: Copy all views --
+    # Retrieve the list of views in the source database
+    views = con.execute("SHOW VIEWS FROM outlets_db").fetchall()
+    print(f"Views in the source database: {views}")
+
+    # Copy each view from source to destination
+    for view in views:
+        view_name = view[0]  # Extract view name
+
+        # Get the CREATE VIEW statement for the view
+        create_view_sql = con.execute(f"SHOW CREATE VIEW outlets_db.{view_name}").fetchone()[0]
+
+        # Recreate the view in the destination database (remove the `outlets_db.` prefix if exists)
+        create_view_sql = create_view_sql.replace(f"outlets_db.", "")
+        con.execute(create_view_sql)
+
+
+    con.execute(f"ATTACH DATABASE '{outlets_db_path}' AS outlets_db;")
+    # Optional: Detach the source database
+    con.execute("DETACH 'outlets_db'")
+
+
 def create_outlets_tables(con: duckdb.DuckDBPyConnection):
     """
-    Create tables in the outlets schema to define outlet-station-reach relationships.
-    """
-
-
-
-
-        -- Table 1: outlets
-        -- Represents a logical grouping that ties stations and reaches together.
-        CREATE TABLE IF NOT EXISTS outlets.outlets (
-            outlet_id TEXT PRIMARY KEY,
-            repository_name TEXT NOT NULL,
-            outlet_name TEXT,
-            notes TEXT  -- optional: general notes about the outlet grouping
-        );
-
-        -- Table 2: outlet_stations
-        -- One-to-many: outlet -> stations
-        CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
-            outlet_id TEXT NOT NULL,
-            station_id TEXT NOT NULL,
-            station_origin TEXT NOT NULL,  -- e.g., 'wiski', 'equis'
-            repository_name TEXT NOT NULL,  -- repository model the station is physically located in
-            true_opnid TEXT NOT NULL,  -- The specific reach the station physically sits on (optional)
-            comments TEXT,  -- Per-station comments, issues, etc.
-            CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
-            FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
-        );
-
-        -- Table 3: outlet_reaches
-        -- One-to-many: outlet -> reaches
-        -- A reach can appear in multiple outlets, enabling many-to-many overall.
-        CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
-            outlet_id TEXT NOT NULL,
-            reach_id TEXT NOT NULL,  -- model reach identifier (aka opind)
-            repository_name TEXT NOT NULL,  -- optional: where the mapping comes from
-            exclude INTEGER DEFAULT 0,  -- flag to indicate if this reach should be excluded (1) or included (0)
-            FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
-        );
-
-        -- Useful views:
-
-        -- View: station_reach_pairs
-        -- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
-        CREATE VIEW IF NOT EXISTS outlets.station_reach_pairs AS
-        SELECT
-            s.outlet_id,
-            s.station_id,
-            s.station_origin,
-            r.reach_id,
-            r.exclude,
-            r.repository_name,
-        FROM outlets.outlet_stations s
-        JOIN outlets.outlet_reaches r
-            ON s.outlet_id = r.outlet_id;
-
-    """)
+    Create tables in the outlets schema to define outlet-station-reach relationships. Copies from outlets module.
+    """
+    query = outlets.OUTLETS_SCHEMA
+    con.execute(query)
+    outlets.build_outlets(con)

 def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
     """
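attach_outlets_db copies every table and view out of an attached DuckDB file into the current database. Note that as released it issues a second ATTACH of the same alias just before the DETACH, which DuckDB will reject because the alias is already in use. A minimal sketch of the attach-copy-detach cycle itself, assuming a source.duckdb file exists alongside the script:

    import duckdb

    con = duckdb.connect(":memory:")
    con.execute("ATTACH DATABASE 'source.duckdb' AS src")
    # SHOW TABLES returns one-column rows of table names
    for (name,) in con.execute("SHOW TABLES FROM src").fetchall():
        con.execute(f"CREATE TABLE {name} AS SELECT * FROM src.{name}")
    con.execute("DETACH src")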
@@ -210,7 +294,7 @@ def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
         "Quality Code Name" AS quality_code_name,  -- Rename Quality Code Name to quality_code_name
         parametertype_id,  -- Keeps parametertype_id as is
         constituent  -- Keeps constituent as is
-    FROM staging.
+    FROM staging.wiski;""")


 def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
@@ -254,17 +338,22 @@ def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
     Create a view in staging schema that counts quality codes for each station and constituent.
     """
     con.execute("""
-
+    CREATE OR REPLACE VIEW reports.wiski_qc_count AS (
     SELECT
         w.station_no,
         w.parametertype_name,
         w."Quality Code",
-        w."Quality Code
-
-
+        COUNT(w."Quality Code") AS count,
+        wqc."Text",
+        wqc.Description,
+
+    FROM staging.wiski w
+    LEFT JOIN mappings.wiski_quality_codes wqc
+        ON w."Quality Code" = wqc.quality_code
+    WHERE wqc.Active = 1
     GROUP BY
-        w."Quality Code",
-    );
+        w."Quality Code",wqc."Text",wqc.Description,w.parametertype_name, w.station_no
+    );
     """)
     # ORDER BY
     # w.station_no,w.parametertype_name, w."Quality Code"
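A hypothetical usage sketch of the rewritten view, which now joins each station/constituent quality-code count to the code's text from mappings.wiski_quality_codes (assumes a connection to an initialized warehouse):

    qc = con.execute("""
        SELECT station_no, parametertype_name, "Quality Code", "Text", count
        FROM reports.wiski_qc_count
        ORDER BY station_no, count DESC
    """).df()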
@@ -283,7 +372,7 @@ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
     SELECT datetime,value,station_id,station_origin,constituent,unit
     FROM analytics.wiski;
     """)
-
+

 def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
     """
@@ -299,8 +388,9 @@ def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
         COUNT(o.value) AS count
     FROM
         analytics.observations AS o
-
+    INNER JOIN
         outlets.outlet_stations AS os ON o.station_id = os.station_id AND o.station_origin = os.station_origin
+    WHERE os.outlet_id IS NOT NULL
     GROUP BY
         os.outlet_id,
         o.constituent,
@@ -316,51 +406,65 @@ def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
 def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):

     con.execute("""
-    (45 removed lines: the previous view body is not rendered in the source diff)
+    CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
+    WITH
+    -- Extract baseflow data (constituent = 'QB')
+    baseflow_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            "value" AS baseflow_value
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent = 'QB'
+    ),
+
+    -- Extract flow data (constituent = 'Q')
+    flow_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            "value" AS flow_value
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent = 'Q'
+    ),
+
+    -- Extract all other constituent data (not 'Q' or 'QB')
+    constituent_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            constituent,
+            "value",
+            count
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent NOT IN ('Q', 'QB')
+    )
+
+    -- Final join: Only include rows that have baseflow, flow, and constituent data
+    SELECT
+        c.outlet_id,
+        c.constituent,
+        c.datetime,
+        c."value",
+        c.count,
+        f.flow_value,
+        b.baseflow_value
+    FROM
+        constituent_data AS c
+    LEFT JOIN
+        flow_data AS f
+        ON c.outlet_id = f.outlet_id
+        AND c.datetime = f.datetime
+    LEFT JOIN
+        baseflow_data AS b
+        ON c.outlet_id = b.outlet_id
+        AND c.datetime = b.datetime;""")
     # ORDER BY
     # constituent_data.outlet_id,
     # constituent_data.datetime;
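The rewritten view left-joins each non-flow observation to the flow ('Q') and baseflow ('QB') values recorded at the same outlet and timestamp, so paired rows can feed load calculations directly. A hypothetical usage sketch ('TSS' is an illustrative constituent code):

    paired = con.execute("""
        SELECT outlet_id, datetime, "value" AS concentration, flow_value, baseflow_value
        FROM analytics.outlet_observations_with_flow
        WHERE constituent = 'TSS'
        ORDER BY outlet_id, datetime
    """).df()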
@@ -390,10 +494,10 @@ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):

     # ORDER BY
     # constituent,sample_count;''')
-
+
 def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
     con.execute("""
-    CREATE VIEW reports.outlet_constituent_summary AS
+    CREATE OR REPLACE VIEW reports.outlet_constituent_summary AS
     SELECT
         outlet_id,
         constituent,
@@ -411,16 +515,6 @@ def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
     """)


-
-def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str,station_origin: str):
-    """
-    Drop all data for a specific station from staging and analytics schemas.
-    """
-    con.execute(f"DELETE FROM staging.equis_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM staging.wiski_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM analytics.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM analytics.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    update_views(con)

 def update_views(con: duckdb.DuckDBPyConnection):
     """
@@ -444,16 +538,69 @@ def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
     return duckdb.connect(database=db_path.as_posix(), read_only=read_only)


-def
+def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str,station_origin: str):
+    """
+    Drop all data for a specific station from staging and analytics schemas.
+    """
+    con.execute(f"DELETE FROM staging.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM staging.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM analytics.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM analytics.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    update_views(con)
+
+def get_column_names(con: duckdb.DuckDBPyConnection, table_schema: str, table_name: str) -> list:
+    """
+    Get the column names of a DuckDB table.
+    """
+    #table_schema, table_name = table_name.split('.')
+    query = """
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = ? AND table_schema = ?
+    """
+    result = con.execute(query,[table_name,table_schema]).fetchall()
+    column_names = [row[0] for row in result]
+    return column_names
+
+
+def add_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_schema: str, table_name: str):
+    """
+    Append a pandas DataFrame into a DuckDB table. This will create the table
+    if it does not exist.
+    """
+
+
+    # get existing columns
+    existing_columns = get_column_names(con, table_schema, table_name)
+    df = df[[existing_columns]]
+
+
+    # register pandas DF and create table if not exists
+    con.register("tmp_df", df)
+
+    con.execute(f"""
+        INSERT INTO {table_schema}.{table_name}
+        SELECT * FROM tmp_df
+    """)
+    con.unregister("tmp_df")
+
+def add_station_data(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str, table_schema: str, table_name: str, df: pd.DataFrame, replace: bool = False):
+    """
+    Add station data to the staging and analytics schemas.
+    """
+    if replace:
+        drop_station_id(con, station_id, station_origin)
+    add_to_table(con, df, table_schema, table_name)
+
+
+def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
     """
     Persist a pandas DataFrame into a DuckDB table. This will overwrite the table
     by default (replace=True).
     """
-    if replace:
-        con.execute(f"DROP TABLE IF EXISTS {table_name}")
     # register pandas DF and create table
     con.register("tmp_df", df)
-    con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM tmp_df")
+    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM tmp_df")
     con.unregister("tmp_df")

 def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
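Two details in this hunk are worth flagging. get_column_names binds its parameters with `?`, but drop_station_id interpolates station_id into SQL via f-strings, which the same parameter-binding style would make safer. And add_to_table subsets the frame with df[[existing_columns]], a list nested inside a list, which is not a valid pandas column key and fails at runtime. A corrected sketch of the column-alignment step (the helper name is illustrative):

    import pandas as pd

    def align_to_table(df: pd.DataFrame, existing_columns: list) -> pd.DataFrame:
        # Keep only, and reorder to, the columns the target table already defines.
        # Note df[existing_columns], not df[[existing_columns]] as released.
        return df[existing_columns]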
@@ -468,23 +615,6 @@ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
     con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
     con.unregister("tmp_df")

-def add_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
-    """
-    Append a pandas DataFrame into a staging table. This will create the staging
-    table if it does not exist.
-    """
-    # register pandas DF and create table if not exists
-    con.register("tmp_df", df)
-    con.execute(f"""
-        CREATE TABLE IF NOT EXISTS staging.{table_name} AS
-        SELECT * FROM tmp_df
-    """)
-    con.execute(f"""
-        INSERT INTO staging.{table_name}
-        SELECT * FROM tmp_df
-    """)
-    con.unregister("tmp_df")
-
 def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
     """
     Persist a CSV file into a staging table. This will overwrite the staging
@@ -496,7 +626,7 @@ def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
        CREATE TABLE staging.{table_name} AS
        SELECT * FROM read_csv_auto('{csv_path}', {', '.join(f"{k}={repr(v)}" for k, v in read_csv_kwargs.items())})
    """)
-
+
 def load_parquet_to_staging(con: duckdb.DuckDBPyConnection, parquet_path: str, table_name: str, replace: bool = True):
     """
     Persist a Parquet file into a staging table. This will overwrite the staging
mpcaHydro/warehouseManager.py
CHANGED
@@ -44,4 +44,12 @@ def construct_database(db_path:Path,db_name:str = 'observations')->Path:
     warehouse.init_db(warehouse_path=db_path)


+def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
+    """
+    Create a view in the database that contains normalized WISKI data.
+    """
+    con.execute("""
+    CREATE OR REPLACE VIEW analytics.normalized_wiski AS
+    SELECT
+    *""")

mpcaHydro/wiski.py
CHANGED
@@ -19,12 +19,9 @@ PARAMETERTYPE_MAP ={'11522': 'TP',
                     '11504': 'WT',
                     '11533': 'DO',
                     '11507':'WL'}
-#STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*','5034' ,'5035','5005', '5004','5014' ,'5015','5024' ,'5025','5044' ,'5045']
-STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*']

 DATA_CODES = [1,3,10,12,15,20,29,30,31,32,34,45,46,47,48,49]

-
 TS_NAME_SELECTOR = {'Q':{'Internal':{'daily':'20.Day.Mean.Archive',
                                      'unit': '15.Rated'},
                          'External': {'daily': '20.Day.Mean',
@@ -62,7 +59,8 @@ TS_NAME_SELECTOR = {'Q':{'Internal':{'daily':'20.Day.Mean.Archive',
                          'External': {'daily': '20.Day.Mean',
                                       'unit': '08.Provisional.Edited'}}}

-
+#STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*','5034' ,'5035','5005', '5004','5014' ,'5015','5024' ,'5025','5044' ,'5045']
+STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*']

 CONSTITUENT_NAME_NO = {'Q' :['262*'],#,'263'],
                        'WT' :['450*', '451*'], # '450.42','451.42'],
@@ -74,6 +72,13 @@ CONSTITUENT_NAME_NO = {'Q' :['262*'],#,'263'],
                        'N' :None,
                        'TKN':None}

+STATIONPARAMETER_NOS_MAP = {'262*':'Q',
+                            '450*':'WT',
+                            '451*':'WT',
+                            '863*':'OP',
+                            '866*':'DO',
+                            '811*':'TRB'}
+
 CONSTITUENT_NAME_NO_WPLMN = {'Q' :['262*'],#,'263'],
                              'WT' :['450*', '451*'], # '450.42','451.42'],
                              'OP' :['863*','5034' ,'5035'],
@@ -91,6 +96,38 @@ def test_connection():
     '''
     return pywisk.test_connection()

+def info(station_ids: list,constituent = None):
+    '''
+    Fetch metadata for given station IDs from WISKI database using the KISTERS API.
+    '''
+    if constituent is not None:
+        stationparameter_nos = CONSTITUENT_NAME_NO[constituent]
+    else:
+        stationparameter_nos = STATIONPARAMETER_NOS
+
+    df = pywisk.get_ts_ids(station_nos = station_ids,
+                           stationparameter_no = stationparameter_nos,
+                           ts_name = ['15.Rated','09.Archive','08.Provisional.Edited'])
+
+    df = normalize_columns(df)
+
+    # rows = []
+    # for station_id in df['station_id'].unique():
+    #     for constituent in df.loc[df['station_id'] == station_id,'constituent'].unique():
+    #         df_station_constituent = df.loc[(df['station_id'] == station_id) & (df['constituent'] == constituent) & df['ts_name'].isin(['15.Rated','09.Archive','08.Provisional.Edited'])]
+    #         if not df_station_constituent.empty:
+    #             if station_id.lower().startswith('e'):
+    #                 ts_names = TS_NAME_SELECTOR[constituent]['External']['unit']
+    #             else:
+    #                 ts_names = TS_NAME_SELECTOR[constituent]['Internal']['unit']
+    #             rows.append(df_station_constituent.loc[df_station_constituent['ts_name'] == ts_names,:])
+
+    return df
+
+
+
+
+
 def download(station_ids: list, start_year: int = 1996, end_year: int = 2030,wplmn: bool = False):
     '''
     Fetch data for given station IDs from WISKI database using the KISTERS API.
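A hypothetical usage sketch of the new info helper (the station number is made up; the exact columns depend on what pywisk.get_ts_ids returns):

    from mpcaHydro import wiski

    meta = wiski.info(['H12345'], constituent='Q')  # hypothetical station number
    print(meta.head())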
@@ -219,13 +256,28 @@ def convert_units(df):
     return df


+def map_constituents(df):
+    '''
+    Map stationparameter_no to constituent names
+    '''
+    def map_values(value):
+        for key, replacement in STATIONPARAMETER_NOS_MAP.items():
+            if value.startswith(key.rstrip('*')):  # Match prefix without the wildcard '*'
+                return replacement
+        return value  # If no match, return the original value
+
+    df['constituent'] = df['stationparameter_no'].apply(map_values)
+    return df
+
 def normalize_columns(df):
     '''
     Normalize column names and units
     '''
     # Map parameter numbers to constituent names
-    df['constituent'] = df['
+    #df['constituent'] = df['stationparameter_no'].map(STATIONPARAMETER_NOS_MAP,regex=True)

+    df = map_constituents(df)
+
     df.rename(columns={
         'station_no':'station_id',
         'Timestamp':'datetime',