mpcaHydro 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
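For orientation: version 2.2.1 moves all table and view creation into init_db, which now builds the staging, analytics, mappings, and outlets objects directly instead of wrapping view creation in try/except. A minimal usage sketch of that flow (the database filename here is hypothetical):

    from mpcaHydro import warehouse

    warehouse.init_db('observations.duckdb')        # creates schemas, tables, and views
    con = warehouse.connect('observations.duckdb')  # thin wrapper around duckdb.connect()
    print(con.execute('SHOW ALL TABLES').fetchall())
    con.close()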
mpcaHydro/warehouse.py CHANGED
@@ -1,6 +1,7 @@
  import duckdb
  import pandas as pd
  from pathlib import Path
+ from mpcaHydro import outlets
 
  def init_db(db_path: str,reset: bool = False):
      """
@@ -14,13 +15,16 @@ def init_db(db_path: str,reset: bool = False):
      # Create all schemas
      create_schemas(con)
 
-     # Create tables for observational data
-     # Wrapped in try/except as they depend on tables that may not exist yet
-     try:
-         create_combined_observations_view(con)
-         create_constituent_summary_report(con)
-     except duckdb.CatalogException as e:
-         print(f"Could not create observation views, likely because backing tables don't exist yet. This is safe to ignore on first run. Details: {e}")
+     # Create tables
+     create_outlets_tables(con)
+     create_mapping_tables(con)
+     create_staging_tables(con)
+     create_analytics_tables(con)
+
+     # Create views
+     update_views(con)
 
 
  def create_schemas(con: duckdb.DuckDBPyConnection):
@@ -30,8 +34,331 @@ def create_schemas(con: duckdb.DuckDBPyConnection):
      con.execute("CREATE SCHEMA IF NOT EXISTS staging")
      con.execute("CREATE SCHEMA IF NOT EXISTS analytics")
      con.execute("CREATE SCHEMA IF NOT EXISTS reports")
-     con.execute("CREATE SCHEMA IF NOT EXISTS hspf")
+     con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
+     con.execute("CREATE SCHEMA IF NOT EXISTS mappings")
+
+ def create_staging_tables(con: duckdb.DuckDBPyConnection):
+     '''
+     Create necessary tables in the staging schema. These were copied directly
+     from the source database DDL and would need to be updated if the sources change.
+     '''
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS staging.equis(
+             LATITUDE DOUBLE,
+             LONGITUDE DOUBLE,
+             WID_LIST VARCHAR,
+             SAMPLE_METHOD VARCHAR,
+             SAMPLE_REMARK VARCHAR,
+             FACILITY_ID BIGINT,
+             FACILITY_NAME VARCHAR,
+             FACILITY_TYPE VARCHAR,
+             SYS_LOC_CODE VARCHAR,
+             LOC_NAME VARCHAR,
+             LOC_TYPE VARCHAR,
+             LOC_TYPE_2 VARCHAR,
+             TASK_CODE VARCHAR,
+             SAMPLE_ID BIGINT,
+             SYS_SAMPLE_CODE VARCHAR,
+             TEST_ID BIGINT,
+             ANALYTE_TYPE VARCHAR,
+             ANALYTE_TYPE_DESC VARCHAR,
+             ANALYTIC_METHOD VARCHAR,
+             PREFERRED_NAME VARCHAR,
+             PARAMETER VARCHAR,
+             CAS_RN VARCHAR,
+             CHEMICAL_NAME VARCHAR,
+             GTLT VARCHAR,
+             RESULT_TEXT VARCHAR,
+             RESULT_NUMERIC DOUBLE,
+             RESULT_UNIT VARCHAR,
+             STAT_TYPE INTEGER,
+             VALUE_TYPE VARCHAR,
+             DETECT_FLAG VARCHAR,
+             DETECT_DESC VARCHAR,
+             RESULT_REMARK VARCHAR,
+             RESULT_TYPE_CODE VARCHAR,
+             METHOD_DETECTION_LIMIT VARCHAR,
+             REPORTING_DETECTION_LIMIT VARCHAR,
+             QUANTITATION_LIMIT INTEGER,
+             LAB_QUALIFIERS VARCHAR,
+             INTERPRETED_QUALIFIERS VARCHAR,
+             REPORTABLE_RESULT VARCHAR,
+             APPROVAL_CODE VARCHAR,
+             SENSITIVE_NOTPUBLIC VARCHAR,
+             TEST_TYPE VARCHAR,
+             DILUTION_FACTOR DOUBLE,
+             FRACTION VARCHAR,
+             BASIS VARCHAR,
+             TEMP_BASIS VARCHAR,
+             TEST_REMARK VARCHAR,
+             ANALYSIS_DATE_TIME TIMESTAMP_NS,
+             ANALYSIS_DATE VARCHAR,
+             ANALYSIS_TIME VARCHAR,
+             ANALYSIS_DATE_TIMEZONE VARCHAR,
+             COMPANY_NAME VARCHAR,
+             LAB_NAME_CODE VARCHAR,
+             LAB_SAMPLE_ID VARCHAR,
+             SAMPLE_TYPE_GROUP VARCHAR,
+             SAMPLE_TYPE_CODE VARCHAR,
+             SAMPLE_TYPE_DESC VARCHAR,
+             MEDIUM_CODE VARCHAR,
+             MATRIX_CODE VARCHAR,
+             START_DEPTH DOUBLE,
+             DEPTH_UNIT VARCHAR,
+             SAMPLE_DATE_TIME TIMESTAMP_NS,
+             SAMPLE_DATE VARCHAR,
+             SAMPLE_TIME VARCHAR,
+             SAMPLE_DATE_TIMEZONE VARCHAR,
+             EBATCH DOUBLE);
+     """)
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS staging.wiski(
+             "Timestamp" VARCHAR,
+             "Value" DOUBLE,
+             "Quality Code" BIGINT,
+             "Quality Code Name" VARCHAR,
+             ts_unitsymbol VARCHAR,
+             ts_name VARCHAR,
+             ts_id VARCHAR,
+             station_no VARCHAR,
+             station_name VARCHAR,
+             station_latitude VARCHAR,
+             station_longitude VARCHAR,
+             parametertype_id VARCHAR,
+             parametertype_name VARCHAR,
+             stationparameter_no VARCHAR,
+             stationparameter_name VARCHAR,
+             wplmn_flag BIGINT);
+     """)
+
+
+ def create_analytics_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create necessary tables in the analytics schema.
+     """
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS analytics.equis (
+             datetime TIMESTAMP,
+             value DOUBLE,
+             station_id TEXT,
+             station_origin TEXT,
+             constituent TEXT,
+             unit TEXT
+         );
+     """)
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS analytics.wiski (
+             datetime TIMESTAMP,
+             value DOUBLE,
+             station_id TEXT,
+             station_origin TEXT,
+             constituent TEXT,
+             unit TEXT
+         );
+     """)
+
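+ # Both analytics tables share one long format
+ # (datetime, value, station_id, station_origin, constituent, unit),
+ # which is the column set the combined observations view selects below.
+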
+ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create and populate tables in the mappings schema from Python dicts and CSVs.
+     """
+     # WISKI parametertype_id -> constituent
+     wiski_parametertype_map = {
+         '11522': 'TP',
+         '11531': 'TP',
+         '11532': 'TSS',
+         '11523': 'TSS',
+         '11526': 'N',
+         '11519': 'N',
+         '11520': 'OP',
+         '11528': 'OP',
+         '11530': 'TKN',
+         '11521': 'TKN',
+         '11500': 'Q',
+         '11504': 'WT',
+         '11533': 'DO',
+         '11507': 'WL'
+     }
+     df_wiski_params = pd.DataFrame(wiski_parametertype_map.items(), columns=['parametertype_id', 'constituent'])
+     con.execute("CREATE TABLE IF NOT EXISTS mappings.wiski_parametertype AS SELECT * FROM df_wiski_params")
+
+     # EQuIS cas_rn -> constituent
+     equis_casrn_map = {
+         '479-61-8': 'CHLA',
+         'CHLA-CORR': 'CHLA',
+         'BOD': 'BOD',
+         'NO2NO3': 'N',
+         '14797-55-8': 'NO3',
+         '14797-65-0': 'NO2',
+         '14265-44-2': 'OP',
+         'N-KJEL': 'TKN',
+         'PHOSPHATE-P': 'TP',
+         '7723-14-0': 'TP',
+         'SOLIDS-TSS': 'TSS',
+         'TEMP-W': 'WT',
+         '7664-41-7': 'NH3'
+     }
+     df_equis_cas = pd.DataFrame(equis_casrn_map.items(), columns=['cas_rn', 'constituent'])
+     con.execute("CREATE TABLE IF NOT EXISTS mappings.equis_casrn AS SELECT * FROM df_equis_cas")
+
+     # Load station cross-reference from CSV
+     # Assumes the package's data directory ships alongside this module
+     xref_csv_path = Path(__file__).parent / 'data/WISKI_EQUIS_XREF.csv'
+     if xref_csv_path.exists():
+         con.execute(f"CREATE TABLE IF NOT EXISTS mappings.station_xref AS SELECT * FROM read_csv_auto('{xref_csv_path.as_posix()}')")
+     else:
+         print(f"Warning: WISKI_EQUIS_XREF.csv not found at {xref_csv_path}")
+
+     # Load wiski_quality_codes from CSV
+     wiski_qc_csv_path = Path(__file__).parent / 'data/WISKI_QUALITY_CODES.csv'
+     if wiski_qc_csv_path.exists():
+         con.execute(f"CREATE TABLE IF NOT EXISTS mappings.wiski_quality_codes AS SELECT * FROM read_csv_auto('{wiski_qc_csv_path.as_posix()}')")
+     else:
+         print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
+
+
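+ # The mapping tables are joined to staging rows during normalization, e.g.
+ # (illustrative query, not part of the public API):
+ #   SELECT w.*, m.constituent
+ #   FROM staging.wiski w
+ #   LEFT JOIN mappings.wiski_parametertype m ON w.parametertype_id = m.parametertype_id
+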
+ def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
+     """
+     Attach an external DuckDB database containing outlet definitions and copy
+     its tables and views into the current database.
+     """
+     create_schemas(con)
+
+     con.execute(f"ATTACH DATABASE '{outlets_db_path}' AS outlets_db;")
 
+     # -- Step 1: Copy all tables --
+     tables = con.execute("SELECT table_name FROM duckdb_tables() WHERE database_name = 'outlets_db'").fetchall()
+     print(f"Tables in the source database: {tables}")
+
+     for (table_name,) in tables:
+         con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM outlets_db.{table_name}")  # Copy table contents
+
+     # -- Step 2: Copy all views --
+     # duckdb_views() exposes each view's CREATE statement in its sql column
+     views = con.execute("SELECT view_name, sql FROM duckdb_views() WHERE database_name = 'outlets_db'").fetchall()
+     print(f"Views in the source database: {[v[0] for v in views]}")
+
+     for view_name, create_view_sql in views:
+         # Recreate the view in the destination database (strip the `outlets_db.` prefix if present)
+         con.execute(create_view_sql.replace("outlets_db.", ""))
+
+     # Detach the source database now that everything is copied
+     con.execute("DETACH outlets_db")
+
+
+ def create_outlets_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create tables in the outlets schema to define outlet-station-reach
+     relationships. Copies from the outlets module.
+     """
+     query = outlets.OUTLETS_SCHEMA
+     con.execute(query)
+     outlets.build_outlets(con)
+
+ def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the database that contains normalized WISKI data:
+     units converted to standard units, columns renamed, and constituents
+     mapped via mappings.wiski_parametertype.
+     """
+     con.execute("""
+         -- Create a single view with all transformations
+         CREATE OR REPLACE VIEW analytics.wiski_normalized AS
+         SELECT
+             -- Convert °C to °F and kg to lb; keep other values unchanged
+             CASE
+                 WHEN LOWER(w.ts_unitsymbol) = '°c' THEN (w.value * 9.0 / 5.0) + 32
+                 WHEN w.ts_unitsymbol = 'kg' THEN w.value * 2.20462
+                 ELSE w.value
+             END AS value,
+
+             -- Normalize units
+             CASE
+                 WHEN LOWER(w.ts_unitsymbol) = '°c' THEN 'degf'  -- Normalize °C to degF
+                 WHEN w.ts_unitsymbol = 'kg' THEN 'lb'           -- Normalize kg to lb
+                 WHEN w.ts_unitsymbol = 'ft³/s' THEN 'cfs'       -- Rename ft³/s to cfs
+                 ELSE w.ts_unitsymbol
+             END AS unit,
+
+             -- Normalize column names
+             w.station_no AS station_id,
+             CAST(w."Timestamp" AS TIMESTAMP) AS datetime,  -- staging stores the timestamp as text
+             w."Quality Code" AS quality_code,
+             w."Quality Code Name" AS quality_code_name,
+             w.parametertype_id,
+             m.constituent  -- mapped from parametertype_id
+         FROM staging.wiski w
+         LEFT JOIN mappings.wiski_parametertype m
+             ON w.parametertype_id = m.parametertype_id;""")
+
+
+ def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
+     """
+     Create a view in the database that filters WISKI data to the given
+     quality codes.
+     """
+     # Prepared-statement parameters cannot be persisted inside a view
+     # definition, so the (integer) quality codes are inlined as literals.
+     placeholders = ', '.join(str(int(code)) for code in data_codes)
+     con.execute(f"""
+         CREATE OR REPLACE VIEW analytics.wiski_filtered AS
+         SELECT *
+         FROM analytics.wiski_normalized
+         WHERE quality_code IN ({placeholders});
+     """)
+
+
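+ # Usage sketch (hypothetical quality codes; real ones live in
+ # mappings.wiski_quality_codes):
+ #   create_filtered_wiski_view(con, [40, 80, 100])
+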
+ def create_aggregated_wiski_view(con: duckdb.DuckDBPyConnection):
+     """
+     Aggregate WISKI data by hour, station, and constituent. Note that this
+     materializes a table, not a view.
+     """
+     con.execute("""
+         CREATE OR REPLACE TABLE analytics.wiski_aggregated AS
+         SELECT
+             station_id,
+             constituent,
+             time_bucket(INTERVAL '1 hour', datetime) AS hour_start,
+             AVG(value) AS value,
+             unit
+         FROM analytics.wiski_normalized
+         GROUP BY
+             station_id,
+             constituent,
+             hour_start,
+             unit;
+     """)
+
+ def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the reports schema that counts quality codes for each
+     station and parameter type.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW reports.wiski_qc_count AS (
+             SELECT
+                 w.station_no,
+                 w.parametertype_name,
+                 w."Quality Code",
+                 COUNT(w."Quality Code") AS count,
+                 wqc."Text",
+                 wqc.Description
+             FROM staging.wiski w
+             LEFT JOIN mappings.wiski_quality_codes wqc
+                 ON w."Quality Code" = wqc.quality_code
+             WHERE wqc.Active = 1
+             GROUP BY
+                 w."Quality Code", wqc."Text", wqc.Description, w.parametertype_name, w.station_no
+         );
+     """)
 
  def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
      """
@@ -45,7 +372,103 @@ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
          SELECT datetime,value,station_id,station_origin,constituent,unit
          FROM analytics.wiski;
      """)
+
+
+ def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the analytics schema that links observations to model
+     reaches via outlets, averaging across stations at the same outlet.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW analytics.outlet_observations AS
+         SELECT
+             o.datetime,
+             os.outlet_id,
+             o.constituent,
+             AVG(o.value) AS value,
+             COUNT(o.value) AS count
+         FROM analytics.observations AS o
+         INNER JOIN outlets.outlet_stations AS os
+             ON o.station_id = os.station_id AND o.station_origin = os.station_origin
+         WHERE os.outlet_id IS NOT NULL
+         GROUP BY
+             os.outlet_id,
+             o.constituent,
+             o.datetime;
+     """)
+
+
 
+ def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view that pairs each non-flow observation with the flow (Q) and
+     baseflow (QB) values observed at the same outlet and time.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
+         WITH
+         -- Extract baseflow data (constituent = 'QB')
+         baseflow_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 "value" AS baseflow_value
+             FROM analytics.outlet_observations
+             WHERE constituent = 'QB'
+         ),
+
+         -- Extract flow data (constituent = 'Q')
+         flow_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 "value" AS flow_value
+             FROM analytics.outlet_observations
+             WHERE constituent = 'Q'
+         ),
+
+         -- Extract all other constituent data (not 'Q' or 'QB')
+         constituent_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 constituent,
+                 "value",
+                 count
+             FROM analytics.outlet_observations
+             WHERE constituent NOT IN ('Q', 'QB')
+         )
+
+         -- Final join: attach flow and baseflow (when present) to each
+         -- constituent observation
+         SELECT
+             c.outlet_id,
+             c.constituent,
+             c.datetime,
+             c."value",
+             c.count,
+             f.flow_value,
+             b.baseflow_value
+         FROM constituent_data AS c
+         LEFT JOIN flow_data AS f
+             ON c.outlet_id = f.outlet_id
+             AND c.datetime = f.datetime
+         LEFT JOIN baseflow_data AS b
+             ON c.outlet_id = b.outlet_id
+             AND c.datetime = b.datetime;""")
 
  def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
      """
@@ -66,11 +489,44 @@ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
          FROM
              analytics.observations
          GROUP BY
-             constituent,station_id,station_origin
-         ORDER BY
-             constituent,sample_count;''')
+             constituent,station_id,station_origin;
+         ''')
+
+ def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
+     con.execute("""
+         CREATE OR REPLACE VIEW reports.outlet_constituent_summary AS
+         SELECT
+             outlet_id,
+             constituent,
+             count_star() AS sample_count,
+             avg("value") AS average_value,
+             min("value") AS min_value,
+             max("value") AS max_value,
+             "year"(min(datetime)) AS start_date,  -- calendar year of first sample
+             "year"(max(datetime)) AS end_date     -- calendar year of last sample
+         FROM
+             analytics.outlet_observations
+         GROUP BY
+             constituent,
+             outlet_id
+     """)
 
+
 
+ def update_views(con: duckdb.DuckDBPyConnection):
+     """
+     Update all views in the database.
+     """
+     create_staging_qc_count_view(con)
+     create_combined_observations_view(con)
+     create_constituent_summary_report(con)
+     create_outlet_observations_view(con)
+     create_outlet_observations_with_flow_view(con)
+     create_outlet_summary_report(con)
+
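+ # Note: these views are cheap to rebuild, so update_views(con) is re-run after
+ # bulk loads and deletions (see drop_station_id below) to keep reports current.
+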
  def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
      """
      Returns a DuckDB connection to the given database path.
@@ -82,16 +538,69 @@ def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
      return duckdb.connect(database=db_path.as_posix(), read_only=read_only)
 
 
- def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
+ def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str):
+     """
+     Drop all data for a specific station from the staging and analytics schemas.
+     """
+     # The staging tables keep their source-specific column names and have no
+     # station_origin column, so each is keyed on its own station identifier.
+     con.execute("DELETE FROM staging.equis WHERE SYS_LOC_CODE = ?", [station_id])
+     con.execute("DELETE FROM staging.wiski WHERE station_no = ?", [station_id])
+     con.execute("DELETE FROM analytics.equis WHERE station_id = ? AND station_origin = ?", [station_id, station_origin])
+     con.execute("DELETE FROM analytics.wiski WHERE station_id = ? AND station_origin = ?", [station_id, station_origin])
+     update_views(con)
+
+ def get_column_names(con: duckdb.DuckDBPyConnection, table_schema: str, table_name: str) -> list:
+     """
+     Get the column names of a DuckDB table.
+     """
+     query = """
+         SELECT column_name
+         FROM information_schema.columns
+         WHERE table_name = ? AND table_schema = ?
+     """
+     result = con.execute(query, [table_name, table_schema]).fetchall()
+     column_names = [row[0] for row in result]
+     return column_names
+
+
+ def add_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_schema: str, table_name: str):
+     """
+     Append a pandas DataFrame to an existing DuckDB table, aligning the
+     DataFrame's columns to the table's schema first.
+     """
+     # keep only (and order by) the columns that exist in the target table
+     existing_columns = get_column_names(con, table_schema, table_name)
+     df = df[existing_columns]
+
+     # register the pandas DataFrame and append its rows
+     con.register("tmp_df", df)
+     con.execute(f"""
+         INSERT INTO {table_schema}.{table_name}
+         SELECT * FROM tmp_df
+     """)
+     con.unregister("tmp_df")
+
+ def add_station_data(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str, table_schema: str, table_name: str, df: pd.DataFrame, replace: bool = False):
+     """
+     Add station data to the given table, optionally replacing any existing
+     rows for that station first.
+     """
+     if replace:
+         drop_station_id(con, station_id, station_origin)
+     add_to_table(con, df, table_schema, table_name)
+
+
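+ # Usage sketch (hypothetical identifiers):
+ #   add_station_data(con, station_id='H12345', station_origin='wiski',
+ #                    table_schema='analytics', table_name='wiski', df=df, replace=True)
+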
+ def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
      """
      Persist a pandas DataFrame into a DuckDB table. This will overwrite the table
      by default (replace=True).
      """
-     if replace:
-         con.execute(f"DROP TABLE IF EXISTS {table_name}")
      # register pandas DF and create table
      con.register("tmp_df", df)
-     con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM tmp_df")
+     con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM tmp_df")
      con.unregister("tmp_df")
 
  def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
  def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
@@ -106,7 +615,6 @@ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_n
106
615
  con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
107
616
  con.unregister("tmp_df")
108
617
 
109
-
110
618
  def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
111
619
  """
112
620
  Persist a CSV file into a staging table. This will overwrite the staging
@@ -118,7 +626,7 @@ def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_nam
118
626
  CREATE TABLE staging.{table_name} AS
119
627
  SELECT * FROM read_csv_auto('{csv_path}', {', '.join(f"{k}={repr(v)}" for k, v in read_csv_kwargs.items())})
120
628
  """)
121
-
629
+
122
630
  def load_parquet_to_staging(con: duckdb.DuckDBPyConnection, parquet_path: str, table_name: str, replace: bool = True):
123
631
  """
124
632
  Persist a Parquet file into a staging table. This will overwrite the staging
@@ -0,0 +1,55 @@
1
+
2
+ import pandas as pd
3
+ #from abc import abstractmethod
4
+ from pathlib import Path
5
+ from mpcaHydro import equis, wiski, warehouse
6
+ import duckdb
7
+
8
+
9
+
10
+
11
+
12
+ #%%
13
+ '''
14
+ This modules contains classes and functions to manage data downloads and storage into a local data warehouse.
15
+
16
+
17
+ '''
18
+
19
+ def get_db_path(warehouse_path:Path,db_name:str = 'observations')->Path:
20
+ '''
21
+ Constructs the full path to the database file within the warehouse directory.
22
+
23
+ Parameters:
24
+ warehouse_path (Path): The path to the warehouse directory.
25
+ db_name (str): The name of the database file.
26
+
27
+ Returns:
28
+ Path: The full path to the database file.
29
+ '''
30
+ return Path(warehouse_path) / db_name
31
+
32
+ def construct_database(db_path:Path,db_name:str = 'observations')->Path:
33
+ '''
34
+ Constructs the full path to the database file within the warehouse directory.
35
+
36
+ Parameters:
37
+ warehouse_path (Path): The path to the warehouse directory.
38
+ db_name (str): The name of the database file.
39
+
40
+ Returns:
41
+ Path: The full path to the database file.
42
+ '''
43
+ db_path = Path(db_path) / db_name
44
+ warehouse.init_db(warehouse_path=db_path)
45
+
46
+
47
+ def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
48
+ """
49
+ Create a view in the database that contains normalized WISKI data.
50
+ """
51
+ con.execute("""
52
+ CREATE OR REPLACE VIEW analytics.normalized_wiski AS
53
+ SELECT
54
+ *""")
55
+