mpcaHydro 2.0.4__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpcaHydro/warehouse.py ADDED
@@ -0,0 +1,581 @@
+ import duckdb
+ import pandas as pd
+ from pathlib import Path
+ from mpcaHydro import outlets
+
+ def init_db(db_path: str, reset: bool = False):
+     """
+     Initialize the DuckDB database: create schemas and tables.
+     """
+     db_path = Path(db_path)
+     if reset and db_path.exists():
+         db_path.unlink()
+
+     with connect(db_path.as_posix()) as con:
+         # Create all schemas
+         create_schemas(con)
+
+         # Create tables
+         create_outlets_tables(con)
+         create_mapping_tables(con)
+         create_analytics_tables(con)
+
+         # Create views
+         # update_views(con)
+
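For orientation, a minimal usage sketch of the initialization flow (the database filename below is illustrative, not something the package prescribes):

    from mpcaHydro import warehouse

    # Build (or rebuild from scratch) the warehouse schema in a local DuckDB file.
    warehouse.init_db('data/observations.duckdb', reset=True)

    # Open a connection and confirm the schemas exist.
    with warehouse.connect('data/observations.duckdb') as con:
        print(con.execute("SELECT schema_name FROM information_schema.schemata").fetchall())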
+ def create_schemas(con: duckdb.DuckDBPyConnection):
+     """
+     Create the staging, analytics, reports, outlets, and mappings schemas if they do not exist.
+     """
+     con.execute("CREATE SCHEMA IF NOT EXISTS staging")
+     con.execute("CREATE SCHEMA IF NOT EXISTS analytics")
+     con.execute("CREATE SCHEMA IF NOT EXISTS reports")
+     con.execute("CREATE SCHEMA IF NOT EXISTS outlets")
+     con.execute("CREATE SCHEMA IF NOT EXISTS mappings")
+
+ def create_analytics_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create necessary tables in the analytics schema.
+     """
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS analytics.equis (
+             datetime TIMESTAMP,
+             value DOUBLE,
+             station_id TEXT,
+             station_origin TEXT,
+             constituent TEXT,
+             unit TEXT
+         );
+     """)
+     con.execute("""
+         CREATE TABLE IF NOT EXISTS analytics.wiski (
+             datetime TIMESTAMP,
+             value DOUBLE,
+             station_id TEXT,
+             station_origin TEXT,
+             constituent TEXT,
+             unit TEXT
+         );
+     """)
+
+ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create and populate tables in the mappings schema from Python dicts and CSVs.
+     """
+     # WISKI parametertype_id -> constituent
+     wiski_parametertype_map = {
+         '11522': 'TP',
+         '11531': 'TP',
+         '11532': 'TSS',
+         '11523': 'TSS',
+         '11526': 'N',
+         '11519': 'N',
+         '11520': 'OP',
+         '11528': 'OP',
+         '11530': 'TKN',
+         '11521': 'TKN',
+         '11500': 'Q',
+         '11504': 'WT',
+         '11533': 'DO',
+         '11507': 'WL'
+     }
+     df_wiski_params = pd.DataFrame(wiski_parametertype_map.items(), columns=['parametertype_id', 'constituent'])
+     con.execute("CREATE TABLE IF NOT EXISTS mappings.wiski_parametertype AS SELECT * FROM df_wiski_params")
+
+     # EQuIS cas_rn -> constituent
+     equis_casrn_map = {
+         '479-61-8': 'CHLA',
+         'CHLA-CORR': 'CHLA',
+         'BOD': 'BOD',
+         'NO2NO3': 'N',
+         '14797-55-8': 'NO3',
+         '14797-65-0': 'NO2',
+         '14265-44-2': 'OP',
+         'N-KJEL': 'TKN',
+         'PHOSPHATE-P': 'TP',
+         '7723-14-0': 'TP',
+         'SOLIDS-TSS': 'TSS',
+         'TEMP-W': 'WT',
+         '7664-41-7': 'NH3'
+     }
+     df_equis_cas = pd.DataFrame(equis_casrn_map.items(), columns=['cas_rn', 'constituent'])
+     con.execute("CREATE TABLE IF NOT EXISTS mappings.equis_casrn AS SELECT * FROM df_equis_cas")
+
+     # Load the station cross-reference from a CSV shipped alongside this module
+     xref_csv_path = Path(__file__).parent / 'data/WISKI_EQUIS_XREF.csv'
+     if xref_csv_path.exists():
+         con.execute(f"CREATE TABLE IF NOT EXISTS mappings.station_xref AS SELECT * FROM read_csv_auto('{xref_csv_path.as_posix()}')")
+     else:
+         print(f"Warning: WISKI_EQUIS_XREF.csv not found at {xref_csv_path}")
+
+     # Load wiski_quality_codes from CSV
+     wiski_qc_csv_path = Path(__file__).parent / 'data/WISKI_QUALITY_CODES.csv'
+     if wiski_qc_csv_path.exists():
+         con.execute(f"CREATE TABLE IF NOT EXISTS mappings.wiski_quality_codes AS SELECT * FROM read_csv_auto('{wiski_qc_csv_path.as_posix()}')")
+     else:
+         print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
+
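As a sketch of how these lookup tables are consumed downstream (the join assumes the raw WISKI staging table carries a parametertype_id column, as the views later in this module do):

    # Label raw WISKI rows with their mapped constituent.
    labeled = con.execute("""
        SELECT w.station_no, w.value, m.constituent
        FROM staging.wiski_raw w
        JOIN mappings.wiski_parametertype m
          ON w.parametertype_id = m.parametertype_id
    """).df()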
+ def create_outlets_tables(con: duckdb.DuckDBPyConnection):
+     """
+     Create tables in the outlets schema to define outlet-station-reach relationships.
+     """
+     con.execute("""-- schema.sql
+     -- Simple 3-table design to manage associations between model reaches and observation stations via outlets.
+     -- Compatible with DuckDB and SQLite.
+
+     -- Table 1: outlets
+     -- Represents a logical grouping that ties stations and reaches together.
+     CREATE TABLE IF NOT EXISTS outlets.outlets (
+         outlet_id TEXT PRIMARY KEY,
+         repository_name TEXT NOT NULL,
+         outlet_name TEXT,
+         notes TEXT -- optional: general notes about the outlet grouping
+     );
+
+     -- Table 2: outlet_stations
+     -- One-to-many: outlet -> stations
+     CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
+         outlet_id TEXT NOT NULL,
+         station_id TEXT NOT NULL,
+         station_origin TEXT NOT NULL, -- e.g., 'wiski', 'equis'
+         repository_name TEXT NOT NULL, -- repository model the station is physically located in
+         true_opnid TEXT NOT NULL, -- the specific reach the station physically sits on
+         comments TEXT, -- per-station comments, issues, etc.
+         CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
+         FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
+     );
+
+     -- Table 3: outlet_reaches
+     -- One-to-many: outlet -> reaches
+     -- A reach can appear in multiple outlets, enabling many-to-many overall.
+     CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
+         outlet_id TEXT NOT NULL,
+         reach_id TEXT NOT NULL, -- model reach identifier (aka opnid)
+         repository_name TEXT NOT NULL, -- where the mapping comes from
+         exclude INTEGER DEFAULT 0, -- flag to indicate if this reach should be excluded (1) or included (0)
+         FOREIGN KEY (outlet_id) REFERENCES outlets.outlets(outlet_id)
+     );
+
+     -- Useful views:
+
+     -- View: station_reach_pairs
+     -- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
+     CREATE VIEW IF NOT EXISTS outlets.station_reach_pairs AS
+     SELECT
+         s.outlet_id,
+         s.station_id,
+         s.station_origin,
+         r.reach_id,
+         r.exclude,
+         r.repository_name
+     FROM outlets.outlet_stations s
+     JOIN outlets.outlet_reaches r
+         ON s.outlet_id = r.outlet_id;
+     """)
+
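A hedged example of wiring up one outlet by hand; the IDs are hypothetical, and the inserts follow the column order defined above:

    con.execute("INSERT INTO outlets.outlets VALUES ('OUT-01', 'repoA', 'Example outlet', NULL)")
    con.execute("INSERT INTO outlets.outlet_stations VALUES ('OUT-01', 'S001', 'wiski', 'repoA', 'R100', NULL)")
    con.execute("INSERT INTO outlets.outlet_reaches VALUES ('OUT-01', 'R100', 'repoA', 0)")
    con.execute("INSERT INTO outlets.outlet_reaches VALUES ('OUT-01', 'R101', 'repoA', 1)")

    # The shared outlet_id expands into station <-> reach pairs.
    pairs = con.execute("SELECT * FROM outlets.station_reach_pairs").df()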
+ def create_normalized_wiski_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the database that contains normalized WISKI data:
+     units converted to standard units, columns renamed, constituents mapped.
+     """
+     con.execute("""
+     -- Create a single view with all transformations
+     CREATE OR REPLACE VIEW analytics.wiski_normalized AS
+     SELECT
+
+         -- Convert units, leaving other values unchanged
+         CASE
+             WHEN LOWER(ts_unitsymbol) = '°c' THEN (value * 9.0 / 5.0) + 32 -- convert °C to °F
+             WHEN ts_unitsymbol = 'kg' THEN value * 2.20462 -- convert kg to lb
+             ELSE value
+         END AS value,
+
+         -- Normalize unit labels
+         CASE
+             WHEN LOWER(ts_unitsymbol) = '°c' THEN 'degf' -- normalize °C to degF
+             WHEN ts_unitsymbol = 'kg' THEN 'lb' -- normalize kg to lb
+             WHEN ts_unitsymbol = 'ft³/s' THEN 'cfs' -- rename ft³/s to cfs
+             ELSE ts_unitsymbol
+         END AS unit,
+
+         -- Normalize column names
+         station_no AS station_id, -- rename station_no to station_id
+         Timestamp AS datetime, -- rename Timestamp to datetime
+         "Quality Code" AS quality_code, -- rename Quality Code to quality_code
+         "Quality Code Name" AS quality_code_name, -- rename Quality Code Name to quality_code_name
+         parametertype_id, -- keep parametertype_id as is
+         constituent -- keep constituent as is
+     FROM staging.wiski_raw;""")
+
+
+ def create_filtered_wiski_view(con: duckdb.DuckDBPyConnection, data_codes: list):
+     """
+     Create a view in the database that filters WISKI data based on the specified quality codes.
+     """
+     # A view definition is persisted with the database, so the codes are
+     # inlined as literals rather than bound as prepared-statement parameters.
+     code_list = ', '.join(repr(code) for code in data_codes)
+     con.execute(f"""
+         CREATE OR REPLACE VIEW analytics.wiski_filtered AS
+         SELECT *
+         FROM analytics.wiski_normalized
+         WHERE quality_code IN ({code_list});
+     """)
+
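Usage sketch; the quality codes shown are placeholders rather than MPCA's actual conventions (mappings.wiski_quality_codes holds the real ones):

    create_filtered_wiski_view(con, data_codes=[200, 210])
    n_good = con.execute("SELECT COUNT(*) FROM analytics.wiski_filtered").fetchone()[0]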
+
+ def create_aggregated_wiski_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a table that aggregates WISKI data to hourly means by station and constituent.
+     """
+     con.execute("""
+         CREATE OR REPLACE TABLE analytics.wiski_aggregated AS
+         SELECT
+             station_id,
+             constituent,
+             time_bucket(INTERVAL '1 hour', datetime) AS hour_start,
+             AVG(value) AS value,
+             unit
+         FROM analytics.wiski_normalized
+         GROUP BY
+             station_id,
+             constituent,
+             hour_start,
+             unit;
+     """)
+
+ def create_staging_qc_count_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the staging schema that counts quality codes for each station and constituent.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW staging.wiski_qc_count AS (
+             SELECT
+                 w.station_no,
+                 w.parametertype_name,
+                 w."Quality Code",
+                 w."Quality Code Name",
+                 COUNT(w."Quality Code") AS count
+             FROM staging.wiski_raw w
+             GROUP BY
+                 w."Quality Code", w."Quality Code Name", w.parametertype_name, w.station_no
+         );
+     """)
+
+ def create_combined_observations_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the analytics schema that combines observations from the equis and wiski tables.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW analytics.observations AS
+         SELECT datetime, value, station_id, station_origin, constituent, unit
+         FROM analytics.equis
+         UNION ALL
+         SELECT datetime, value, station_id, station_origin, constituent, unit
+         FROM analytics.wiski;
+     """)
+
+
+ def create_outlet_observations_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view in the analytics schema that links observations to model reaches via outlets,
+     averaging across an outlet's stations at each timestamp.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW analytics.outlet_observations AS
+         SELECT
+             o.datetime,
+             os.outlet_id,
+             o.constituent,
+             AVG(o.value) AS value,
+             COUNT(o.value) AS count
+         FROM
+             analytics.observations AS o
+         LEFT JOIN
+             outlets.outlet_stations AS os ON o.station_id = os.station_id AND o.station_origin = os.station_origin
+         GROUP BY
+             os.outlet_id,
+             o.constituent,
+             o.datetime;
+     """)
+
+
+ def create_outlet_observations_with_flow_view(con: duckdb.DuckDBPyConnection):
+     """
+     Create a view that pairs each non-flow observation with the concurrent
+     flow (Q) and baseflow (QB) observations at the same outlet and timestamp.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW analytics.outlet_observations_with_flow AS
+         WITH baseflow_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 "value" AS baseflow_value
+             FROM analytics.outlet_observations
+             WHERE constituent = 'QB'
+         ),
+         flow_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 "value" AS flow_value
+             FROM analytics.outlet_observations
+             WHERE constituent = 'Q'
+         ),
+         constituent_data AS (
+             SELECT
+                 outlet_id,
+                 datetime,
+                 constituent,
+                 "value",
+                 count
+             FROM analytics.outlet_observations
+             WHERE constituent NOT IN ('Q', 'QB')
+         )
+         SELECT
+             constituent_data.outlet_id,
+             constituent_data.constituent,
+             constituent_data.datetime,
+             constituent_data."value",
+             flow_data.flow_value,
+             baseflow_data.baseflow_value
+         FROM constituent_data
+         FULL JOIN flow_data
+             ON constituent_data.outlet_id = flow_data.outlet_id
+             AND constituent_data.datetime = flow_data.datetime
+         LEFT JOIN baseflow_data
+             ON constituent_data.outlet_id = baseflow_data.outlet_id
+             AND constituent_data.datetime = baseflow_data.datetime;""")
+
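Pairing concentrations with concurrent flow is what enables load estimates. A sketch, assuming value is a concentration in mg/L and flow_value is discharge in cfs (5.39 is the usual mg/L × cfs to lb/day conversion factor):

    loads = con.execute("""
        SELECT outlet_id, constituent, datetime,
               value * flow_value * 5.39 AS load_lb_per_day
        FROM analytics.outlet_observations_with_flow
        WHERE flow_value IS NOT NULL
    """).df()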
+ def create_constituent_summary_report(con: duckdb.DuckDBPyConnection):
+     """
+     Create a constituent summary report in the reports schema that groups observations by constituent and station.
+     """
+     con.execute('''
+         CREATE OR REPLACE VIEW reports.constituent_summary AS
+         SELECT
+             station_id,
+             station_origin,
+             constituent,
+             COUNT(*) AS sample_count,
+             AVG(value) AS average_value,
+             MIN(value) AS min_value,
+             MAX(value) AS max_value,
+             year(MIN(datetime)) AS start_year,
+             year(MAX(datetime)) AS end_year
+         FROM analytics.observations
+         GROUP BY constituent, station_id, station_origin;
+     ''')
+
+ def create_outlet_summary_report(con: duckdb.DuckDBPyConnection):
+     """
+     Create an outlet-level constituent summary report in the reports schema.
+     """
+     con.execute("""
+         CREATE OR REPLACE VIEW reports.outlet_constituent_summary AS
+         SELECT
+             outlet_id,
+             constituent,
+             COUNT(*) AS sample_count,
+             AVG("value") AS average_value,
+             MIN("value") AS min_value,
+             MAX("value") AS max_value,
+             year(MIN(datetime)) AS start_year,
+             year(MAX(datetime)) AS end_year
+         FROM analytics.outlet_observations
+         GROUP BY
+             constituent,
+             outlet_id
+     """)
+
+
+ def drop_station_id(con: duckdb.DuckDBPyConnection, station_id: str, station_origin: str):
+     """
+     Drop all data for a specific station from the staging and analytics schemas,
+     then rebuild the derived views.
+     """
+     con.execute("DELETE FROM staging.equis_raw WHERE station_id = ? AND station_origin = ?", [station_id, station_origin])
+     # The raw WISKI table keeps the source schema, which keys stations by station_no
+     con.execute("DELETE FROM staging.wiski_raw WHERE station_no = ?", [station_id])
+     con.execute("DELETE FROM analytics.equis WHERE station_id = ? AND station_origin = ?", [station_id, station_origin])
+     con.execute("DELETE FROM analytics.wiski WHERE station_id = ? AND station_origin = ?", [station_id, station_origin])
+     update_views(con)
+
+ def update_views(con: duckdb.DuckDBPyConnection):
+     """
+     Update all views in the database.
+     """
+     create_staging_qc_count_view(con)
+     create_combined_observations_view(con)
+     create_constituent_summary_report(con)
+     create_outlet_observations_view(con)
+     create_outlet_observations_with_flow_view(con)
+     create_outlet_summary_report(con)
+
+ def connect(db_path: str, read_only: bool = False) -> duckdb.DuckDBPyConnection:
+     """
+     Return a DuckDB connection to the given database path.
+     Ensures the parent directory exists.
+     """
+     db_path = Path(db_path)
+     db_path.parent.mkdir(parents=True, exist_ok=True)
+     return duckdb.connect(database=db_path.as_posix(), read_only=read_only)
+
+
+ def load_df_to_table(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
+     """
+     Persist a pandas DataFrame into a DuckDB table. This will overwrite the table
+     by default (replace=True).
+     """
+     if replace:
+         con.execute(f"DROP TABLE IF EXISTS {table_name}")
+     # register the pandas DataFrame and create the table from it
+     con.register("tmp_df", df)
+     con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM tmp_df")
+     con.unregister("tmp_df")
+
+ def load_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str, replace: bool = True):
+     """
+     Persist a pandas DataFrame into a staging table. This will overwrite the staging
+     table by default (replace=True).
+     """
+     if replace:
+         con.execute(f"DROP TABLE IF EXISTS staging.{table_name}")
+     # register the pandas DataFrame and create the table from it
+     con.register("tmp_df", df)
+     con.execute(f"CREATE TABLE staging.{table_name} AS SELECT * FROM tmp_df")
+     con.unregister("tmp_df")
+
+ def add_df_to_staging(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
+     """
+     Append a pandas DataFrame to a staging table, creating the table first
+     if it does not exist.
+     """
+     con.register("tmp_df", df)
+     # Create an empty table with the DataFrame's schema if needed (LIMIT 0),
+     # so the INSERT below does not double-load the first batch.
+     con.execute(f"""
+         CREATE TABLE IF NOT EXISTS staging.{table_name} AS
+         SELECT * FROM tmp_df LIMIT 0
+     """)
+     con.execute(f"""
+         INSERT INTO staging.{table_name}
+         SELECT * FROM tmp_df
+     """)
+     con.unregister("tmp_df")
+
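A quick sketch of the append semantics (the table name and columns here are hypothetical; the table is created empty on first use, so re-running appends rather than double-loading):

    import pandas as pd

    batch = pd.DataFrame({'station_no': ['S001'], 'value': [1.2]})
    add_df_to_staging(con, batch, 'example_raw')   # first call creates the table
    add_df_to_staging(con, batch, 'example_raw')   # later calls append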
+ def load_csv_to_staging(con: duckdb.DuckDBPyConnection, csv_path: str, table_name: str, replace: bool = True, **read_csv_kwargs):
+     """
+     Persist a CSV file into a staging table. This will overwrite the staging
+     table by default (replace=True).
+     """
+     if replace:
+         con.execute(f"DROP TABLE IF EXISTS staging.{table_name}")
+     # Pass any extra keyword arguments through to read_csv_auto as options
+     options = ''.join(f", {k}={repr(v)}" for k, v in read_csv_kwargs.items())
+     con.execute(f"""
+         CREATE TABLE staging.{table_name} AS
+         SELECT * FROM read_csv_auto('{csv_path}'{options})
+     """)
+
+ def load_parquet_to_staging(con: duckdb.DuckDBPyConnection, parquet_path: str, table_name: str, replace: bool = True):
+     """
+     Persist a Parquet file into a staging table. This will overwrite the staging
+     table by default (replace=True).
+     """
+     if replace:
+         con.execute(f"DROP TABLE IF EXISTS staging.{table_name}")
+     con.execute(f"""
+         CREATE TABLE staging.{table_name} AS
+         SELECT * FROM read_parquet('{parquet_path}')
+     """)
+
+
+ def write_table_to_parquet(con: duckdb.DuckDBPyConnection, table_name: str, path: str, compression="snappy"):
+     """
+     Persist a DuckDB table into a Parquet file.
+     """
+     con.execute(f"COPY (SELECT * FROM {table_name}) TO '{path}' (FORMAT PARQUET, COMPRESSION '{compression}')")
+
+
+ def write_table_to_csv(con: duckdb.DuckDBPyConnection, table_name: str, path: str, header: bool = True, sep: str = ',', **kwargs):
+     """
+     Persist a DuckDB table into a CSV file. Extra keyword arguments are passed
+     through as additional COPY options.
+     """
+     options = ''.join(f", {k} {repr(v)}" for k, v in kwargs.items())
+     con.execute(f"COPY (SELECT * FROM {table_name}) TO '{path}' (FORMAT CSV, HEADER {str(header).upper()}, DELIMITER '{sep}'{options})")
+
+ def load_df_to_analytics(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
+     """
+     Persist a pandas DataFrame into an analytics table.
+     """
+     con.execute(f"DROP TABLE IF EXISTS analytics.{table_name}")
+     con.register("tmp_df", df)
+     con.execute(f"CREATE TABLE analytics.{table_name} AS SELECT * FROM tmp_df")
+     con.unregister("tmp_df")
+
+ def migrate_staging_to_analytics(con: duckdb.DuckDBPyConnection, staging_table: str, analytics_table: str):
+     """
+     Migrate data from a staging table to an analytics table.
+     """
+     con.execute(f"DROP TABLE IF EXISTS analytics.{analytics_table}")
+     con.execute(f"""
+         CREATE TABLE analytics.{analytics_table} AS
+         SELECT * FROM staging.{staging_table}
+     """)
+
+
+ def load_to_analytics(con: duckdb.DuckDBPyConnection, table_name: str):
+     """
+     Aggregate processed EQuIS data to hourly means and load the result into
+     an analytics table.
+     """
+     con.execute(f"""
+         CREATE OR REPLACE TABLE analytics.{table_name} AS
+         SELECT
+             station_id,
+             constituent,
+             time_bucket(INTERVAL '1 hour', datetime) AS datetime,
+             AVG(value) AS value
+         FROM staging.equis_processed
+         GROUP BY
+             time_bucket(INTERVAL '1 hour', datetime),
+             constituent,
+             station_id
+         ORDER BY
+             station_id,
+             constituent
+     """)
+
+ def dataframe_to_parquet(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, path, compression="snappy"):
+     """
+     Write a pandas DataFrame straight to a Parquet file.
+     path should be a filename like 'data/raw/equis/equis-20251118.parquet'
+     """
+     con.register("tmp_df", df)
+     con.execute(f"COPY (SELECT * FROM tmp_df) TO '{path}' (FORMAT PARQUET, COMPRESSION '{compression}')")
+     con.unregister("tmp_df")
@@ -0,0 +1,47 @@
+
+ import pandas as pd
+ #from abc import abstractmethod
+ from pathlib import Path
+ from mpcaHydro import equis, wiski, warehouse
+ import duckdb
+
+ #%%
+ '''
+ This module contains functions to manage data downloads and storage in a
+ local data warehouse.
+ '''
+
+ def get_db_path(warehouse_path: Path, db_name: str = 'observations') -> Path:
+     '''
+     Constructs the full path to the database file within the warehouse directory.
+
+     Parameters:
+         warehouse_path (Path): The path to the warehouse directory.
+         db_name (str): The name of the database file.
+
+     Returns:
+         Path: The full path to the database file.
+     '''
+     return Path(warehouse_path) / db_name
+
+ def construct_database(db_path: Path, db_name: str = 'observations') -> Path:
+     '''
+     Creates and initializes the database file within the warehouse directory.
+
+     Parameters:
+         db_path (Path): The path to the warehouse directory.
+         db_name (str): The name of the database file.
+
+     Returns:
+         Path: The full path to the initialized database file.
+     '''
+     db_path = Path(db_path) / db_name
+     warehouse.init_db(db_path=db_path)
+     return db_path
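A usage sketch tying the two helpers together (the warehouse directory is illustrative):

    from pathlib import Path

    warehouse_dir = Path('data/warehouse')
    db_file = get_db_path(warehouse_dir)   # -> data/warehouse/observations
    construct_database(warehouse_dir)      # creates the file and initializes schemas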