mpcaHydro 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/warehouse.py CHANGED
@@ -18,11 +18,14 @@ def init_db(db_path: str,reset: bool = False):
     # Create tables
     create_outlets_tables(con)
     create_mapping_tables(con)
+    create_staging_tables(con)
     create_analytics_tables(con)
+
 
     # Create views
-    #update_views(con)
-
+    update_views(con)
+
+
 
 def create_schemas(con: duckdb.DuckDBPyConnection):
     """
@@ -34,6 +37,100 @@ def create_schemas(con: duckdb.DuckDBPyConnection):
     con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
     con.execute("CREATE SCHEMA IF NOT EXISTS mappings")
 
+def create_staging_tables(con: duckdb.DuckDBPyConnection):
+    '''
+    Create necessary tables in the staging schema. These were copied directly from database DDL. Would need to be updated if sources change.
+    '''
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS staging.equis(
+            LATITUDE DOUBLE,
+            LONGITUDE DOUBLE,
+            WID_LIST VARCHAR,
+            SAMPLE_METHOD VARCHAR,
+            SAMPLE_REMARK VARCHAR,
+            FACILITY_ID BIGINT,
+            FACILITY_NAME VARCHAR,
+            FACILITY_TYPE VARCHAR,
+            SYS_LOC_CODE VARCHAR,
+            LOC_NAME VARCHAR,
+            LOC_TYPE VARCHAR,
+            LOC_TYPE_2 VARCHAR,
+            TASK_CODE VARCHAR,
+            SAMPLE_ID BIGINT,
+            SYS_SAMPLE_CODE VARCHAR,
+            TEST_ID BIGINT,
+            ANALYTE_TYPE VARCHAR,
+            ANALYTE_TYPE_DESC VARCHAR,
+            ANALYTIC_METHOD VARCHAR,
+            PREFERRED_NAME VARCHAR,
+            PARAMETER VARCHAR,
+            CAS_RN VARCHAR,
+            CHEMICAL_NAME VARCHAR,
+            GTLT VARCHAR,
+            RESULT_TEXT VARCHAR,
+            RESULT_NUMERIC DOUBLE,
+            RESULT_UNIT VARCHAR,
+            STAT_TYPE INTEGER,
+            VALUE_TYPE VARCHAR,
+            DETECT_FLAG VARCHAR,
+            DETECT_DESC VARCHAR,
+            RESULT_REMARK VARCHAR,
+            RESULT_TYPE_CODE VARCHAR,
+            METHOD_DETECTION_LIMIT VARCHAR,
+            REPORTING_DETECTION_LIMIT VARCHAR,
+            QUANTITATION_LIMIT INTEGER,
+            LAB_QUALIFIERS VARCHAR,
+            INTERPRETED_QUALIFIERS VARCHAR,
+            REPORTABLE_RESULT VARCHAR,
+            APPROVAL_CODE VARCHAR,
+            SENSITIVE_NOTPUBLIC VARCHAR,
+            TEST_TYPE VARCHAR,
+            DILUTION_FACTOR DOUBLE,
+            FRACTION VARCHAR,
+            BASIS VARCHAR,
+            TEMP_BASIS VARCHAR,
+            TEST_REMARK VARCHAR,
+            ANALYSIS_DATE_TIME TIMESTAMP_NS,
+            ANALYSIS_DATE VARCHAR,
+            ANALYSIS_TIME VARCHAR,
+            ANALYSIS_DATE_TIMEZONE VARCHAR,
+            COMPANY_NAME VARCHAR,
+            LAB_NAME_CODE VARCHAR,
+            LAB_SAMPLE_ID VARCHAR,
+            SAMPLE_TYPE_GROUP VARCHAR,
+            SAMPLE_TYPE_CODE VARCHAR,
+            SAMPLE_TYPE_DESC VARCHAR,
+            MEDIUM_CODE VARCHAR,
+            MATRIX_CODE VARCHAR,
+            START_DEPTH DOUBLE,
+            DEPTH_UNIT VARCHAR,
+            SAMPLE_DATE_TIME TIMESTAMP_NS,
+            SAMPLE_DATE VARCHAR,
+            SAMPLE_TIME VARCHAR,
+            SAMPLE_DATE_TIMEZONE VARCHAR,
+            EBATCH DOUBLE);
+        """)
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS staging.wiski(
+            "Timestamp" VARCHAR,
+            "Value" DOUBLE,
+            "Quality Code" BIGINT,
+            "Quality Code Name" VARCHAR,
+            ts_unitsymbol VARCHAR,
+            ts_name VARCHAR,
+            ts_id VARCHAR,
+            station_no VARCHAR,
+            station_name VARCHAR,
+            station_latitude VARCHAR,
+            station_longitude VARCHAR,
+            parametertype_id VARCHAR,
+            parametertype_name VARCHAR,
+            stationparameter_no VARCHAR,
+            stationparameter_name VARCHAR,
+            wplmn_flag BIGINT);
+        """)
+
+
 def create_analytics_tables(con: duckdb.DuckDBPyConnection):
     """
     Create necessary tables in the analytics schema.
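The staging DDL above can be exercised on its own. A minimal sketch, assuming mpcaHydro 2.2.1 and a throwaway in-memory DuckDB database (the schema is created inline here rather than via create_schemas):

    import duckdb
    from mpcaHydro import warehouse

    con = duckdb.connect()  # in-memory database
    con.execute("CREATE SCHEMA IF NOT EXISTS staging")
    warehouse.create_staging_tables(con)  # idempotent: CREATE TABLE IF NOT EXISTS
    print(con.execute("DESCRIBE staging.wiski").fetchall())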
@@ -117,64 +214,51 @@ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
     else:
         print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
 
+
+def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
+    """
+    Attach an external DuckDB database containing outlet definitions.
+    """
+    create_schemas(con)
+
+    con.execute(f"ATTACH DATABASE '{outlets_db_path}' AS outlets_db;")
+
+    tables = con.execute("SHOW TABLES FROM outlets_db").fetchall()
+    print(f"Tables in the source database: {tables}")
+
+    for table in tables:
+        table_name = table[0]  # Extract table name
+        con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM outlets_db.{table_name}")  # Copy table contents
+
+    # -- Step 2: Copy all views --
+    # Retrieve the list of views in the source database
+    views = con.execute("SHOW VIEWS FROM outlets_db").fetchall()
+    print(f"Views in the source database: {views}")
+
+    # Copy each view from source to destination
+    for view in views:
+        view_name = view[0]  # Extract view name
+
+        # Get the CREATE VIEW statement for the view
+        create_view_sql = con.execute(f"SHOW CREATE VIEW outlets_db.{view_name}").fetchone()[0]
+
+        # Recreate the view in the destination database (remove the `outlets_db.` prefix if exists)
+        create_view_sql = create_view_sql.replace(f"outlets_db.", "")
+        con.execute(create_view_sql)
+
+
+    con.execute(f"ATTACH DATABASE '{outlets_db_path}' AS outlets_db;")
+    # Optional: Detach the source database
+    con.execute("DETACH 'outlets_db'")
+
+
 def create_outlets_tables(con: duckdb.DuckDBPyConnection):
     """
-    Create tables in the outlets schema to define outlet-station-reach relationships.
-    """
-    con.execute("""-- schema.sql
-    -- Simple 3-table design to manage associations between model reaches and observation stations via outlets.
-    -- Compatible with DuckDB and SQLite.
-
-    -- Table 1: outlets
-    -- Represents a logical grouping that ties stations and reaches together.
-    CREATE TABLE IF NOT EXISTS outlets.outlets (
-        outlet_id TEXT PRIMARY KEY,
-        repository_name TEXT NOT NULL,
-        outlet_name TEXT,
-        notes TEXT -- optional: general notes about the outlet grouping
-    );
-
-    -- Table 2: outlet_stations
-    -- One-to-many: outlet -> stations
-    CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
-        outlet_id TEXT NOT NULL,
-        station_id TEXT NOT NULL,
-        station_origin TEXT NOT NULL, -- e.g., 'wiski', 'equis'
-        repository_name TEXT NOT NULL, -- repository model the station is physically located in
-        true_opnid TEXT NOT NULL, -- The specific reach the station physically sits on (optional)
-        comments TEXT, -- Per-station comments, issues, etc.
-        CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
-        FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
-    );
-
-    -- Table 3: outlet_reaches
-    -- One-to-many: outlet -> reaches
-    -- A reach can appear in multiple outlets, enabling many-to-many overall.
-    CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
-        outlet_id TEXT NOT NULL,
-        reach_id TEXT NOT NULL, -- model reach identifier (aka opind)
-        repository_name TEXT NOT NULL, -- optional: where the mapping comes from
-        exclude INTEGER DEFAULT 0, -- flag to indicate if this reach should be excluded (1) or included (0)
-        FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
-    );
-
-    -- Useful views:
-
-    -- View: station_reach_pairs
-    -- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
-    CREATE VIEW IF NOT EXISTS outlets.station_reach_pairs AS
-    SELECT
-        s.outlet_id,
-        s.station_id,
-        s.station_origin,
-        r.reach_id,
-        r.exclude,
-        r.repository_name,
-    FROM outlets.outlet_stations s
-    JOIN outlets.outlet_reaches r
-        ON s.outlet_id = r.outlet_id;
-
-    """)
+    Create tables in the outlets schema to define outlet-station-reach relationships. Copies from outlets module.
+    """
+    query = outlets.OUTLETS_SCHEMA
+    con.execute(query)
+    outlets.build_outlets(con)
 
 def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
     """
@@ -210,7 +294,7 @@ def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
             "Quality Code Name" AS quality_code_name, -- Rename Quality Code Name to quality_code_name
             parametertype_id, -- Keeps parametertype_id as is
             constituent -- Keeps constituent as is
-        FROM staging.wiski_raw;""")
+        FROM staging.wiski;""")
 
 
 def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
@@ -254,17 +338,22 @@ def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
     Create a view in staging schema that counts quality codes for each station and constituent.
     """
     con.execute("""
-        CREATE OR REPLACE VIEW staging.wiski_qc_count AS (
+        CREATE OR REPLACE VIEW reports.wiski_qc_count AS (
         SELECT
             w.station_no,
             w.parametertype_name,
             w."Quality Code",
-            w."Quality Code Name",
-            COUNT(w."Quality Code") AS count
-        FROM staging.wiski_raw w
+            COUNT(w."Quality Code") AS count,
+            wqc."Text",
+            wqc.Description,
+
+        FROM staging.wiski w
+        LEFT JOIN mappings.wiski_quality_codes wqc
+            ON w."Quality Code" = wqc.quality_code
+        WHERE wqc.Active = 1
         GROUP BY
-            w."Quality Code",w."Quality Code Name",w.parametertype_name, w.station_no
-        );
+            w."Quality Code",wqc."Text",wqc.Description,w.parametertype_name, w.station_no
+        );
     """)
     # ORDER BY
     #     w.station_no,w.parametertype_name, w."Quality Code"
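With the QC-count view moved to the reports schema and joined to mappings.wiski_quality_codes, it can be queried like any other view. A sketch, reusing a connection con; the station number is hypothetical:

    qc = con.execute(
        "SELECT * FROM reports.wiski_qc_count WHERE station_no = ?",
        ["E77075001"],  # hypothetical station number
    ).fetchdf()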
@@ -283,7 +372,7 @@ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
         SELECT datetime,value,station_id,station_origin,constituent,unit
         FROM analytics.wiski;
     """)
-
+
 
 def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
     """
@@ -299,8 +388,9 @@ def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
             COUNT(o.value) AS count
         FROM
             analytics.observations AS o
-        LEFT JOIN
+        INNER JOIN
             outlets.outlet_stations AS os ON o.station_id = os.station_id AND o.station_origin = os.station_origin
+        WHERE os.outlet_id IS NOT NULL
         GROUP BY
             os.outlet_id,
             o.constituent,
@@ -316,51 +406,65 @@
 def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):
 
     con.execute("""
-    CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
-    WITH baseflow_data AS (
-        SELECT
-            outlet_id,
-            datetime,
-            "value" AS baseflow_value
-        FROM
-            analytics.outlet_observations
-        WHERE
-            (constituent = 'QB')),
-    flow_data AS (
-        SELECT
-            outlet_id,
-            datetime,
-            "value" AS flow_value
-        FROM
-            analytics.outlet_observations
-        WHERE
-            (constituent = 'Q')),
-    constituent_data AS (
-        SELECT
-            outlet_id,
-            datetime,
-            constituent,
-            "value",
-            count
-        FROM
-            analytics.outlet_observations
-        WHERE
-            (constituent NOT IN ('Q', 'QB')))
-    SELECT
-        constituent_data.outlet_id,
-        constituent_data.constituent,
-        constituent_data.datetime,
-        constituent_data."value",
-        flow_data.flow_value,
-        baseflow_data.baseflow_value
-    FROM
-        constituent_data
-    FULL JOIN flow_data ON
-        (((constituent_data.outlet_id = flow_data.outlet_id)
-        AND (constituent_data.datetime = flow_data.datetime)))
-    LEFT JOIN baseflow_data ON
-        (((constituent_data.outlet_id = baseflow_data.outlet_id)
-        AND (constituent_data.datetime = baseflow_data.datetime)));""")
+    CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
+    WITH
+    -- Extract baseflow data (constituent = 'QB')
+    baseflow_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            "value" AS baseflow_value
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent = 'QB'
+    ),
+
+    -- Extract flow data (constituent = 'Q')
+    flow_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            "value" AS flow_value
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent = 'Q'
+    ),
+
+    -- Extract all other constituent data (not 'Q' or 'QB')
+    constituent_data AS (
+        SELECT
+            outlet_id,
+            datetime,
+            constituent,
+            "value",
+            count
+        FROM
+            analytics.outlet_observations
+        WHERE
+            constituent NOT IN ('Q', 'QB')
+    )
+
+    -- Final join: Only include rows that have baseflow, flow, and constituent data
+    SELECT
+        c.outlet_id,
+        c.constituent,
+        c.datetime,
+        c."value",
+        c.count,
+        f.flow_value,
+        b.baseflow_value
+    FROM
+        constituent_data AS c
+    LEFT JOIN
+        flow_data AS f
+        ON c.outlet_id = f.outlet_id
+        AND c.datetime = f.datetime
+    LEFT JOIN
+        baseflow_data AS b
+        ON c.outlet_id = b.outlet_id
+        AND c.datetime = b.datetime;""")
     # ORDER BY
    #     constituent_data.outlet_id,
    #     constituent_data.datetime;
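Note the join change above: the old FULL JOIN could also emit flow-only timestamps, while the rewritten view LEFT JOINs flow_data and baseflow_data onto constituent_data, so every constituent row survives and flow_value/baseflow_value come back NULL when no same-timestamp flow exists. A quick check, reusing a connection con:

    missing = con.execute("""
        SELECT outlet_id, constituent, COUNT(*) AS rows_without_flow
        FROM analytics.outlet_observations_with_flow
        WHERE flow_value IS NULL
        GROUP BY outlet_id, constituent
    """).fetchdf()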
@@ -390,10 +494,10 @@ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
 
     # ORDER BY
     #     constituent,sample_count;'''
-
+
 def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
     con.execute("""
-        CREATE VIEW reports.outlet_constituent_summary AS
+        CREATE OR REPLACE VIEW reports.outlet_constituent_summary AS
         SELECT
             outlet_id,
             constituent,
@@ -411,16 +515,6 @@ def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
     """)
 
 
-
-def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str,station_origin: str):
-    """
-    Drop all data for a specific station from staging and analytics schemas.
-    """
-    con.execute(f"DELETE FROM staging.equis_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM staging.wiski_raw WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM analytics.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    con.execute(f"DELETE FROM analytics.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
-    update_views(con)
 
 def update_views(con: duckdb.DuckDBPyConnection):
     """
@@ -444,16 +538,69 @@ def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
     return duckdb.connect(database=db_path.as_posix(), read_only=read_only)
 
 
-def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
+def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str,station_origin: str):
+    """
+    Drop all data for a specific station from staging and analytics schemas.
+    """
+    con.execute(f"DELETE FROM staging.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM staging.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM analytics.equis WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    con.execute(f"DELETE FROM analytics.wiski WHERE station_id = '{station_id}' AND station_origin = '{station_origin}'")
+    update_views(con)
+
+def get_column_names(con: duckdb.DuckDBPyConnection, table_schema: str, table_name: str) -> list:
+    """
+    Get the column names of a DuckDB table.
+    """
+    #table_schema, table_name = table_name.split('.')
+    query = """
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = ? AND table_schema = ?
+    """
+    result = con.execute(query,[table_name,table_schema]).fetchall()
+    column_names = [row[0] for row in result]
+    return column_names
+
+
+def add_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_schema: str, table_name: str):
+    """
+    Append a pandas DataFrame into a DuckDB table. This will create the table
+    if it does not exist.
+    """
+
+
+    # get existing columns
+    existing_columns = get_column_names(con, table_schema, table_name)
+    df = df[existing_columns]
+
+
+    # register pandas DF and create table if not exists
+    con.register("tmp_df", df)
+
+    con.execute(f"""
+        INSERT INTO {table_schema}.{table_name}
+        SELECT * FROM tmp_df
+    """)
+    con.unregister("tmp_df")
+
+def add_station_data(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str, table_schema: str, table_name: str, df: pd.DataFrame, replace: bool = False):
+    """
+    Add station data to the staging and analytics schemas.
+    """
+    if replace:
+        drop_station_id(con, station_id, station_origin)
+    add_to_table(con, df, table_schema, table_name)
+
+
+def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
     """
     Persist a pandas DataFrame into a DuckDB table. This will overwrite the table
     by default (replace=True).
     """
-    if replace:
-        con.execute(f"DROP TABLE IF EXISTS {table_name}")
     # register pandas DF and create table
     con.register("tmp_df", df)
-    con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM tmp_df")
+    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM tmp_df")
     con.unregister("tmp_df")
 
 def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
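The new loader trio (get_column_names, add_to_table, add_station_data) appends a DataFrame while keeping only the columns that already exist in the target table. A usage sketch, assuming a DataFrame df shaped like staging.wiski; the station ID is hypothetical:

    from mpcaHydro import warehouse

    warehouse.add_station_data(
        con,
        station_id="H12345",      # hypothetical
        station_origin="wiski",
        table_schema="staging",
        table_name="wiski",
        df=df,
    )  # replace defaults to False, so this is a plain append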
@@ -468,23 +615,6 @@ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
     con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
     con.unregister("tmp_df")
 
-def add_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
-    """
-    Append a pandas DataFrame into a staging table. This will create the staging
-    table if it does not exist.
-    """
-    # register pandas DF and create table if not exists
-    con.register("tmp_df", df)
-    con.execute(f"""
-        CREATE TABLE IF NOT EXISTS staging.{table_name} AS
-        SELECT * FROM tmp_df
-    """)
-    con.execute(f"""
-        INSERT INTO staging.{table_name}
-        SELECT * FROM tmp_df
-    """)
-    con.unregister("tmp_df")
-
 def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
     """
     Persist a CSV file into a staging table. This will overwrite the staging
@@ -496,7 +626,7 @@ def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
         CREATE TABLE staging.{table_name} AS
         SELECT * FROM read_csv_auto('{csv_path}', {', '.join(f"{k}={repr(v)}" for k, v in read_csv_kwargs.items())})
     """)
-
+
 def load_parquet_to_staging(con: duckdb.DuckDBPyConnection, parquet_path: str, table_name: str, replace: bool = True):
     """
     Persist a Parquet file into a staging table. This will overwrite the staging
@@ -44,4 +44,12 @@ def construct_database(db_path:Path,db_name:str = 'observations')->Path:
     warehouse.init_db(warehouse_path=db_path)
 
 
+def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
+    """
+    Create a view in the database that contains normalized WISKI data.
+    """
+    con.execute("""
+        CREATE OR REPLACE VIEW analytics.normalized_wiski AS
+        SELECT
+            *""")
 
mpcaHydro/wiski.py CHANGED
@@ -19,12 +19,9 @@ PARAMETERTYPE_MAP ={'11522': 'TP',
                     '11504': 'WT',
                     '11533': 'DO',
                     '11507':'WL'}
-#STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*','5034' ,'5035','5005', '5004','5014' ,'5015','5024' ,'5025','5044' ,'5045']
-STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*']
 
 DATA_CODES = [1,3,10,12,15,20,29,30,31,32,34,45,46,47,48,49]
 
-
 TS_NAME_SELECTOR = {'Q':{'Internal':{'daily':'20.Day.Mean.Archive',
                                      'unit': '15.Rated'},
                          'External': {'daily': '20.Day.Mean',
@@ -62,7 +59,8 @@ TS_NAME_SELECTOR = {'Q':{'Internal':{'daily':'20.Day.Mean.Archive',
                          'External': {'daily': '20.Day.Mean',
                                       'unit': '08.Provisional.Edited'}}}
 
-
+#STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*','5034' ,'5035','5005', '5004','5014' ,'5015','5024' ,'5025','5044' ,'5045']
+STATIONPARAMETER_NOS = ['262*','450*','451*','863*','866*']
 
 CONSTITUENT_NAME_NO = {'Q' :['262*'],#,'263'],
                        'WT' :['450*', '451*'], # '450.42','451.42'],
@@ -74,6 +72,13 @@ CONSTITUENT_NAME_NO = {'Q' :['262*'],#,'263'],
                        'N' :None,
                        'TKN':None}
 
+STATIONPARAMETER_NOS_MAP = {'262*':'Q',
+                            '450*':'WT',
+                            '451*':'WT',
+                            '863*':'OP',
+                            '866*':'DO',
+                            '811*':'TRB'}
+
 CONSTITUENT_NAME_NO_WPLMN = {'Q' :['262*'],#,'263'],
                              'WT' :['450*', '451*'], # '450.42','451.42'],
                              'OP' :['863*','5034' ,'5035'],
@@ -91,6 +96,38 @@ def test_connection():
     '''
     return pywisk.test_connection()
 
+def info(station_ids: list,constituent = None):
+    '''
+    Fetch metadata for given station IDs from WISKI database using the KISTERS API.
+    '''
+    if constituent is not None:
+        stationparameter_nos = CONSTITUENT_NAME_NO[constituent]
+    else:
+        stationparameter_nos = STATIONPARAMETER_NOS
+
+    df = pywisk.get_ts_ids(station_nos = station_ids,
+                           stationparameter_no = stationparameter_nos,
+                           ts_name = ['15.Rated','09.Archive','08.Provisional.Edited'])
+
+    df = normalize_columns(df)
+
+    # rows = []
+    # for station_id in df['station_id'].unique():
+    #     for constituent in df.loc[df['station_id'] == station_id,'constituent'].unique():
+    #         df_station_constituent = df.loc[(df['station_id'] == station_id) & (df['constituent'] == constituent) & df['ts_name'].isin(['15.Rated','09.Archive','08.Provisional.Edited'])]
+    #         if not df_station_constituent.empty:
+    #             if station_id.lower().startswith('e'):
+    #                 ts_names = TS_NAME_SELECTOR[constituent]['External']['unit']
+    #             else:
+    #                 ts_names = TS_NAME_SELECTOR[constituent]['Internal']['unit']
+    #             rows.append(df_station_constituent.loc[df_station_constituent['ts_name'] == ts_names,:])
+
+    return df
+
+
+
+
+
 def download(station_ids: list, start_year: int = 1996, end_year: int = 2030,wplmn: bool = False):
     '''
     Fetch data for given station IDs from WISKI database using the KISTERS API.
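A sketch of calling the new info helper; the station ID is hypothetical, and the call requires a reachable WISKI/KISTERS endpoint via pywisk:

    from mpcaHydro import wiski

    meta = wiski.info(["E77075001"], constituent="Q")  # time-series metadata for flow
    print(meta.columns.tolist())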
@@ -219,13 +256,28 @@ def convert_units(df):
     return df
 
 
+def map_constituents(df):
+    '''
+    Map stationparameter_no to constituent names
+    '''
+    def map_values(value):
+        for key, replacement in STATIONPARAMETER_NOS_MAP.items():
+            if value.startswith(key.rstrip('*')):  # Match prefix without the wildcard '*'
+                return replacement
+        return value  # If no match, return the original value
+
+    df['constituent'] = df['stationparameter_no'].apply(map_values)
+    return df
+
 def normalize_columns(df):
     '''
     Normalize column names and units
     '''
     # Map parameter numbers to constituent names
-    df['constituent'] = df['parametertype_id'].map(PARAMETERTYPE_MAP)
+    #df['constituent'] = df['stationparameter_no'].map(STATIONPARAMETER_NOS_MAP,regex=True)
 
+    df = map_constituents(df)
+
     df.rename(columns={
         'station_no':'station_id',
         'Timestamp':'datetime',
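map_constituents matches on the prefix of stationparameter_no (the trailing '*' in STATIONPARAMETER_NOS_MAP is stripped), so concrete parameter numbers resolve too. A small worked example:

    import pandas as pd
    from mpcaHydro.wiski import map_constituents

    df = pd.DataFrame({"stationparameter_no": ["262.1", "450.42", "9999"]})
    df = map_constituents(df)
    print(df["constituent"].tolist())  # ['Q', 'WT', '9999']; unmatched values pass through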
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mpcaHydro
-Version: 2.2.0
+Version: 2.2.1
 Summary: Python package for downloading MPCA hydrology data
 Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
 Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>