mpcaHydro 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mpcaHydro/data/WISKI_QUALITY_CODES.csv +71 -0
- mpcaHydro/data/outlet.duckdb +0 -0
- mpcaHydro/data/stations_EQUIS.gpkg +0 -0
- mpcaHydro/data/stations_wiski.gpkg +0 -0
- mpcaHydro/data_manager.py +172 -292
- mpcaHydro/equis.py +31 -22
- mpcaHydro/etlSWD.py +21 -15
- mpcaHydro/outlets.py +367 -0
- mpcaHydro/reports.py +80 -0
- mpcaHydro/warehouse.py +525 -17
- mpcaHydro/warehouseManager.py +55 -0
- mpcaHydro/{WISKI.py → wiski.py} +97 -17
- mpcaHydro/xref.py +74 -0
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.1.dist-info}/METADATA +3 -1
- mpcahydro-2.2.1.dist-info/RECORD +23 -0
- mpcahydro-2.1.0.dist-info/RECORD +0 -15
- {mpcahydro-2.1.0.dist-info → mpcahydro-2.2.1.dist-info}/WHEEL +0 -0
mpcaHydro/equis.py
CHANGED
|
@@ -164,26 +164,25 @@ def as_utc_offset(naive_dt: Union[datetime, str], tz_label: str, target_offset:
|
|
|
164
164
|
aware_src = naive.replace(tzinfo=src_tz)
|
|
165
165
|
|
|
166
166
|
# convert the instant to fixed UTC-6
|
|
167
|
-
return aware_src.astimezone(target_offset)
|
|
167
|
+
return aware_src.astimezone(target_offset).tz_localize(None)
|
|
168
168
|
|
|
169
169
|
|
|
170
170
|
def normalize_columns(df):
|
|
171
171
|
'''Select relevant columns from Equis data.'''
|
|
172
172
|
return df[['SYS_LOC_CODE',
|
|
173
|
+
'constituent',
|
|
174
|
+
'CAS_RN',
|
|
173
175
|
'datetime',
|
|
174
176
|
'RESULT_NUMERIC',
|
|
175
177
|
'RESULT_UNIT',
|
|
176
|
-
'constituent'
|
|
177
178
|
]].rename(columns={
|
|
178
179
|
'SYS_LOC_CODE':'station_id',
|
|
179
180
|
'RESULT_NUMERIC':'value',
|
|
180
|
-
'RESULT_UNIT':'unit'
|
|
181
|
+
'RESULT_UNIT':'unit',
|
|
182
|
+
'CAS_RN':'cas_rn'
|
|
181
183
|
})
|
|
182
184
|
|
|
183
|
-
|
|
184
|
-
'''Replace non-detect results with 0 in Equis data.'''
|
|
185
|
-
df.loc[df['RESULT_NUMERIC'].isna(), 'RESULT_NUMERIC'] = 0
|
|
186
|
-
return df
|
|
185
|
+
|
|
187
186
|
|
|
188
187
|
def normalize_timezone(df):
|
|
189
188
|
'''Normalize datetime to UTC in Equis data.'''
|
|
@@ -194,27 +193,27 @@ def normalize_timezone(df):
|
|
|
194
193
|
except Exception:
|
|
195
194
|
return pd.NaT
|
|
196
195
|
|
|
197
|
-
df['datetime'] = df.apply(_conv, axis=1)
|
|
196
|
+
df.loc[:,'datetime'] = df.apply(_conv, axis=1)
|
|
198
197
|
return df
|
|
199
198
|
|
|
200
199
|
def convert_units(df):
|
|
201
200
|
'''Convert units in Equis data to standard units.'''
|
|
202
201
|
# Convert ug/L to mg/L
|
|
203
|
-
df['
|
|
202
|
+
df['unit'] = df['unit'].str.lower()
|
|
204
203
|
|
|
205
|
-
mask_ugL = df['
|
|
206
|
-
df.loc[mask_ugL, '
|
|
207
|
-
df.loc[mask_ugL, '
|
|
204
|
+
mask_ugL = df['unit'] == 'ug/l'
|
|
205
|
+
df.loc[mask_ugL, 'value'] = df.loc[mask_ugL, 'value'] / 1000
|
|
206
|
+
df.loc[mask_ugL, 'unit'] = 'mg/l'
|
|
208
207
|
|
|
209
208
|
# Convert mg/g to mg/L (assuming density of 1 g/mL)
|
|
210
|
-
mask_mgg = df['
|
|
211
|
-
df.loc[mask_mgg, '
|
|
212
|
-
df.loc[mask_mgg, '
|
|
209
|
+
mask_mgg = df['unit'] == 'mg/g'
|
|
210
|
+
df.loc[mask_mgg, 'value'] = df.loc[mask_mgg, 'value'] * 1000
|
|
211
|
+
df.loc[mask_mgg, 'unit'] = 'mg/l'
|
|
213
212
|
|
|
214
213
|
# Convert deg C to degF
|
|
215
|
-
mask_degC = df['
|
|
216
|
-
df.loc[mask_degC, '
|
|
217
|
-
df.loc[mask_degC, '
|
|
214
|
+
mask_degC = df['unit'].isin(['deg c', 'degc'])
|
|
215
|
+
df.loc[mask_degC, 'value'] = (df.loc[mask_degC, 'value'] * 9/5) + 32
|
|
216
|
+
df.loc[mask_degC, 'unit'] = 'degf'
|
|
218
217
|
|
|
219
218
|
return df
|
|
220
219
|
|
|
@@ -232,15 +231,25 @@ def average_results(df):
|
|
|
232
231
|
value=('value', 'mean')
|
|
233
232
|
).reset_index()
|
|
234
233
|
|
|
234
|
+
def replace_nondetects(df):
|
|
235
|
+
'''Replace non-detect results with 0 in Equis data.'''
|
|
236
|
+
df.loc[df['value'].isna(), 'value'] = 0
|
|
237
|
+
return df
|
|
238
|
+
|
|
239
|
+
def normalize(df):
|
|
240
|
+
'''Normalize Equis data: select relevant columns.'''
|
|
241
|
+
df = map_constituents(df)
|
|
242
|
+
df = normalize_timezone(df)
|
|
243
|
+
df = normalize_columns(df)
|
|
244
|
+
df = convert_units(df)
|
|
245
|
+
return df
|
|
246
|
+
|
|
235
247
|
def transform(df):
|
|
236
248
|
'''Transform Equis data: handle non-detects, convert units, map constituents.'''
|
|
237
249
|
|
|
250
|
+
df = normalize(df)
|
|
238
251
|
df = replace_nondetects(df)
|
|
239
252
|
if not df.empty:
|
|
240
|
-
df = normalize_timezone(df)
|
|
241
|
-
df = convert_units(df)
|
|
242
|
-
df = map_constituents(df)
|
|
243
|
-
df = normalize_columns(df)
|
|
244
253
|
df = average_results(df)
|
|
245
254
|
return df
|
|
246
255
|
|
mpcaHydro/etlSWD.py
CHANGED
|
@@ -26,19 +26,21 @@ CONSTITUENT_MAP = {i[0]:i[1] for i in EQUIS_PARAMETER_XREF[['PARAMETER','constit
|
|
|
26
26
|
# return df
|
|
27
27
|
import requests
|
|
28
28
|
|
|
29
|
-
def _download(
|
|
29
|
+
def _download(station_id):
|
|
30
30
|
# Replace {station_no} in the URL with the actual station number
|
|
31
|
-
url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
|
|
32
|
-
|
|
31
|
+
#url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
|
|
32
|
+
url = 'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results'
|
|
33
|
+
|
|
33
34
|
try:
|
|
34
35
|
# Send a GET request to the URL
|
|
35
|
-
|
|
36
|
+
params = {
|
|
37
|
+
'stationId': station_id,
|
|
38
|
+
'format': 'json'
|
|
39
|
+
}
|
|
40
|
+
response = requests.get(url,params = params)
|
|
36
41
|
response.raise_for_status() # Raise exception for HTTP errors
|
|
37
42
|
# Parse the JSON data
|
|
38
|
-
|
|
39
|
-
return pd.DataFrame(columns = response.json()['column_names'])
|
|
40
|
-
else:
|
|
41
|
-
return pd.DataFrame(response.json()['data'])
|
|
43
|
+
return pd.DataFrame(response.json()['data'])
|
|
42
44
|
|
|
43
45
|
except requests.exceptions.RequestException as e:
|
|
44
46
|
print(f"An error occurred: {e}")
|
|
@@ -46,14 +48,18 @@ def _download(station_no):
|
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
|
|
49
|
-
def download(
|
|
51
|
+
def download(station_ids):
|
|
50
52
|
#df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
53
|
+
dfs = []
|
|
54
|
+
for station_id in station_ids:
|
|
55
|
+
df = _download(station_id)
|
|
56
|
+
if not df.empty:
|
|
57
|
+
df['station_id'] = station_id
|
|
58
|
+
dfs.append(df)
|
|
59
|
+
|
|
60
|
+
return pd.concat(dfs, ignore_index=True)
|
|
61
|
+
|
|
62
|
+
|
|
57
63
|
|
|
58
64
|
def info(station_no):
|
|
59
65
|
#df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
|
mpcaHydro/outlets.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Created on Thu May 1 09:51:51 2025
|
|
4
|
+
|
|
5
|
+
@author: mfratki
|
|
6
|
+
"""
|
|
7
|
+
#import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import geopandas as gpd
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import duckdb
|
|
12
|
+
#from hspf_tools.calibrator import etlWISKI, etlSWD
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
#stations_wiski = gpd.read_file('C:/Users/mfratki/Documents/GitHub/pyhcal/src/pyhcal/data/stations_wiski.gpkg')
|
|
16
|
+
|
|
17
|
+
_stations_wiski = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_wiski.gpkg'))
|
|
18
|
+
stations_wiski = _stations_wiski.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name','wplmn_flag']]
|
|
19
|
+
stations_wiski['source'] = 'wiski'
|
|
20
|
+
_stations_equis = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_EQUIS.gpkg'))
|
|
21
|
+
stations_equis = _stations_equis.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name']]
|
|
22
|
+
stations_equis['source'] = 'equis'
|
|
23
|
+
stations_equis['wplmn_flag'] = 0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
DB_PATH = str(Path(__file__).resolve().parent/'data\\outlet.duckdb')
|
|
29
|
+
|
|
30
|
+
MODL_DB = pd.concat([stations_wiski,stations_equis])
|
|
31
|
+
MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
|
|
32
|
+
MODL_DB = MODL_DB.dropna(subset='opnids')
|
|
33
|
+
MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True)
|
|
34
|
+
|
|
35
|
+
def _reload():
|
|
36
|
+
global _stations_wiski, stations_wiski, _stations_equis, stations_equis, MODL_DB
|
|
37
|
+
_stations_wiski = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_wiski.gpkg'))
|
|
38
|
+
stations_wiski = _stations_wiski.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name','wplmn_flag']]
|
|
39
|
+
stations_wiski['source'] = 'wiski'
|
|
40
|
+
_stations_equis = gpd.read_file(str(Path(__file__).resolve().parent/'data\\stations_EQUIS.gpkg'))
|
|
41
|
+
stations_equis = _stations_equis.loc[:,['station_id','true_opnid','opnids','comments','modeled','repo_name']]
|
|
42
|
+
stations_equis['source'] = 'equis'
|
|
43
|
+
stations_equis['wplmn_flag'] = 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
MODL_DB = pd.concat([stations_wiski,stations_equis])
|
|
47
|
+
MODL_DB['opnids'] = MODL_DB['opnids'].str.strip().replace('',pd.NA)
|
|
48
|
+
MODL_DB = MODL_DB.dropna(subset='opnids')
|
|
49
|
+
MODL_DB = MODL_DB.drop_duplicates(['station_id','source']).reset_index(drop=True)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def split_opnids(opnids: list):
|
|
53
|
+
return [int(float(j)) for i in opnids for j in i]
|
|
54
|
+
|
|
55
|
+
def get_model_db(model_name: str):
|
|
56
|
+
return MODL_DB.query('repo_name == @model_name')
|
|
57
|
+
|
|
58
|
+
def valid_models():
|
|
59
|
+
return MODL_DB['repo_name'].unique().tolist()
|
|
60
|
+
|
|
61
|
+
def equis_stations(model_name):
|
|
62
|
+
return _stations_equis.query('repo_name == @model_name')['station_id'].tolist()
|
|
63
|
+
|
|
64
|
+
def wiski_stations(model_name):
|
|
65
|
+
return _stations_wiski.query('repo_name == @model_name')['station_id'].tolist()
|
|
66
|
+
|
|
67
|
+
def wplmn_stations(model_name):
|
|
68
|
+
return MODL_DB.query('repo_name == @model_name and wplmn_flag == 1 and source == "wiski"')['station_id'].tolist()
|
|
69
|
+
|
|
70
|
+
def wplmn_station_opnids(model_name):
|
|
71
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and wplmn_flag == 1 and source == "wiski"')['opnids'].str.split(',').to_list()
|
|
72
|
+
return split_opnids(opnids)
|
|
73
|
+
|
|
74
|
+
def wiski_station_opnids(model_name):
|
|
75
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "wiski"')['opnids'].str.split(',').to_list()
|
|
76
|
+
return split_opnids(opnids)
|
|
77
|
+
|
|
78
|
+
def equis_station_opnids(model_name):
|
|
79
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "equis"')['opnids'].str.split(',').to_list()
|
|
80
|
+
return split_opnids(opnids)
|
|
81
|
+
|
|
82
|
+
def station_opnids(model_name):
|
|
83
|
+
opnids = MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name')['opnids'].str.split(',').to_list()
|
|
84
|
+
return split_opnids(opnids)
|
|
85
|
+
|
|
86
|
+
def mapped_equis_stations(model_name):
|
|
87
|
+
return MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "equis"')['station_id'].tolist()
|
|
88
|
+
|
|
89
|
+
def mapped_wiski_stations(model_name):
|
|
90
|
+
return MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name and source == "wiski"')['station_id'].tolist()
|
|
91
|
+
|
|
92
|
+
def outlets(model_name):
|
|
93
|
+
return [group for _, group in MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name').groupby(by = ['opnids','repo_name'])]
|
|
94
|
+
|
|
95
|
+
def outlet_stations(model_name):
|
|
96
|
+
return [group['station_id'].to_list() for _, group in MODL_DB.dropna(subset=['opnids']).query('repo_name == @model_name').groupby(by = ['opnids','repo_name'])]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def connect(db_path, read_only=True):
|
|
100
|
+
#Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
|
101
|
+
return duckdb.connect(db_path,read_only=read_only)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def init_db(db_path: str,reset: bool = False):
|
|
105
|
+
"""
|
|
106
|
+
Initialize the DuckDB database: create staging and analytics schemas
|
|
107
|
+
"""
|
|
108
|
+
db_path = Path(db_path)
|
|
109
|
+
if reset and db_path.exists():
|
|
110
|
+
db_path.unlink()
|
|
111
|
+
|
|
112
|
+
with connect(db_path.as_posix(),False) as con:
|
|
113
|
+
con.execute(OUTLETS_SCHEMA)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Accessors:
|
|
118
|
+
def get_outlets_by_model(model_name: str):
|
|
119
|
+
with connect(DB_PATH) as con:
|
|
120
|
+
df = con.execute(
|
|
121
|
+
"""
|
|
122
|
+
SELECT r.*
|
|
123
|
+
FROM outlets.station_reach_pairs r
|
|
124
|
+
WHERE r.repository_name = ?
|
|
125
|
+
""",
|
|
126
|
+
[model_name]
|
|
127
|
+
).fetchdf()
|
|
128
|
+
return df
|
|
129
|
+
|
|
130
|
+
def get_outlets_by_reach(reach_id: int, model_name: str):
|
|
131
|
+
"""
|
|
132
|
+
Return all outlet rows for outlets that include the given reach_id in the given model_name.
|
|
133
|
+
"""
|
|
134
|
+
with connect(DB_PATH) as con:
|
|
135
|
+
df = con.execute(
|
|
136
|
+
"""
|
|
137
|
+
SELECT r.*
|
|
138
|
+
FROM outlets.station_reach_pairs r
|
|
139
|
+
WHERE r.reach_id = ? AND r.repository_name = ?
|
|
140
|
+
""",
|
|
141
|
+
[reach_id, model_name]).fetchdf()
|
|
142
|
+
return df
|
|
143
|
+
|
|
144
|
+
def get_outlets_by_station(station_id: str, station_origin: str):
|
|
145
|
+
"""
|
|
146
|
+
Return all outlet rows for outlets that include the given station_id and station_origin.
|
|
147
|
+
"""
|
|
148
|
+
with connect(DB_PATH) as con:
|
|
149
|
+
|
|
150
|
+
df = con.execute(
|
|
151
|
+
"""
|
|
152
|
+
SELECT r.*
|
|
153
|
+
FROM outlets.station_reach_pairs r
|
|
154
|
+
WHERE r.station_id = ? AND r.station_origin = ?
|
|
155
|
+
""",
|
|
156
|
+
[station_id, station_origin]).fetchdf()
|
|
157
|
+
return df
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class OutletGateway:
|
|
162
|
+
def __init__(self, model_name: str):
|
|
163
|
+
self.model_name = model_name
|
|
164
|
+
self.db_path = DB_PATH
|
|
165
|
+
self.modl_db = get_model_db(model_name)
|
|
166
|
+
|
|
167
|
+
# Legacy methods to access functions
|
|
168
|
+
def wplmn_station_opnids(self):
|
|
169
|
+
return wplmn_station_opnids(self.model_name)
|
|
170
|
+
|
|
171
|
+
def wiski_station_opnids(self):
|
|
172
|
+
return wiski_station_opnids(self.model_name)
|
|
173
|
+
|
|
174
|
+
def equis_station_opnids(self):
|
|
175
|
+
return equis_station_opnids(self.model_name)
|
|
176
|
+
|
|
177
|
+
def station_opnids(self):
|
|
178
|
+
return station_opnids(self.model_name)
|
|
179
|
+
|
|
180
|
+
def equis_stations(self):
|
|
181
|
+
return equis_stations(self.model_name)
|
|
182
|
+
|
|
183
|
+
def wiski_stations(self):
|
|
184
|
+
return wiski_stations(self.model_name)
|
|
185
|
+
|
|
186
|
+
def wplmn_stations(self):
|
|
187
|
+
return wplmn_stations(self.model_name)
|
|
188
|
+
|
|
189
|
+
def outlets(self):
|
|
190
|
+
return outlets(self.model_name)
|
|
191
|
+
|
|
192
|
+
def outlet_stations(self):
|
|
193
|
+
return outlet_stations(self.model_name)
|
|
194
|
+
|
|
195
|
+
# Accessors for outlets
|
|
196
|
+
def get_outlets(self):
|
|
197
|
+
return get_outlets_by_model(self.model_name)
|
|
198
|
+
|
|
199
|
+
def get_outlets_by_reach(self, reach_id: int):
|
|
200
|
+
return get_outlets_by_reach(reach_id, self.model_name)
|
|
201
|
+
|
|
202
|
+
def get_outlets_by_station(self, station_id: str, station_origin: str):
|
|
203
|
+
assert(station_id in self.wiski_stations() + self.equis_stations()), f"Station ID {station_id} not found in model {self.model_name}"
|
|
204
|
+
return get_outlets_by_station(station_id, station_origin)
|
|
205
|
+
|
|
206
|
+
# constructors:
|
|
207
|
+
def build_outlet_db(db_path: str = None):
|
|
208
|
+
if db_path is None:
|
|
209
|
+
db_path = DB_PATH
|
|
210
|
+
init_db(db_path,reset=True)
|
|
211
|
+
with connect(db_path,False) as con:
|
|
212
|
+
build_outlets(con)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def build_outlets(con, model_name: str = None):
|
|
216
|
+
if model_name is not None:
|
|
217
|
+
modl_db = get_model_db(model_name)
|
|
218
|
+
else:
|
|
219
|
+
modl_db = MODL_DB
|
|
220
|
+
|
|
221
|
+
for index, (_, group) in enumerate(modl_db.drop_duplicates(['station_id','source']).groupby(by = ['opnids','repo_name'])):
|
|
222
|
+
repo_name = group['repo_name'].iloc[0]
|
|
223
|
+
add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
|
|
224
|
+
|
|
225
|
+
opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
|
|
226
|
+
|
|
227
|
+
for opnid in opnids:
|
|
228
|
+
add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
|
|
229
|
+
|
|
230
|
+
for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
|
|
231
|
+
add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def create_outlet_schema(con, model_name : str):
|
|
235
|
+
for index, (_, group) in enumerate(outlets(model_name)):
|
|
236
|
+
repo_name = group['repo_name'].iloc[0]
|
|
237
|
+
add_outlet(con, outlet_id = index, outlet_name = None, repository_name = repo_name, notes = None)
|
|
238
|
+
|
|
239
|
+
opnids = set(split_opnids(group['opnids'].str.split(',').to_list()))
|
|
240
|
+
|
|
241
|
+
for opnid in opnids:
|
|
242
|
+
add_reach(con, outlet_id = index, reach_id = int(opnid), repository_name = repo_name)
|
|
243
|
+
|
|
244
|
+
for _, row in group.drop_duplicates(subset=['station_id', 'source']).iterrows():
|
|
245
|
+
add_station(con, outlet_id = index, station_id = row['station_id'], station_origin = row['source'], true_opnid = row['true_opnid'], repository_name= repo_name, comments = row['comments'])
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def add_outlet(con,
|
|
249
|
+
outlet_id: int,
|
|
250
|
+
repository_name: str,
|
|
251
|
+
outlet_name = None,
|
|
252
|
+
notes = None):
|
|
253
|
+
"""
|
|
254
|
+
Insert an outlet. repository_name is required.
|
|
255
|
+
"""
|
|
256
|
+
con.execute(
|
|
257
|
+
"INSERT INTO outlets.outlet_groups (outlet_id, repository_name, outlet_name, notes) VALUES (?, ?, ?, ?)",
|
|
258
|
+
[outlet_id, repository_name, outlet_name, notes]
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
def add_station(con,
|
|
262
|
+
outlet_id: int,
|
|
263
|
+
station_id: int,
|
|
264
|
+
station_origin: str,
|
|
265
|
+
true_opnid: int,
|
|
266
|
+
repository_name: str,
|
|
267
|
+
comments = None):
|
|
268
|
+
"""
|
|
269
|
+
Insert a station membership for an outlet.
|
|
270
|
+
Constraints:
|
|
271
|
+
- PRIMARY KEY (station_id, station_origin): unique per origin across all outlets.
|
|
272
|
+
- true_opnid and repository_name are required per schema.
|
|
273
|
+
"""
|
|
274
|
+
con.execute(
|
|
275
|
+
"""INSERT INTO outlets.outlet_stations
|
|
276
|
+
(outlet_id, station_id, station_origin, true_opnid, repository_name, comments)
|
|
277
|
+
VALUES (?, ?, ?, ?, ?, ?)""",
|
|
278
|
+
[outlet_id, station_id, station_origin, true_opnid, repository_name, comments]
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
def add_reach(con,
|
|
282
|
+
outlet_id: int,
|
|
283
|
+
reach_id: int,
|
|
284
|
+
repository_name: str):
|
|
285
|
+
"""
|
|
286
|
+
Insert a reach membership for an outlet.
|
|
287
|
+
- repository_name is required and participates in the PK (reach_id, repository_name).
|
|
288
|
+
"""
|
|
289
|
+
con.execute(
|
|
290
|
+
"""INSERT INTO outlets.outlet_reaches (outlet_id, reach_id, repository_name)
|
|
291
|
+
VALUES (?, ?, ?)""",
|
|
292
|
+
[outlet_id, reach_id, repository_name]
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
OUTLETS_SCHEMA = """-- schema.sql
|
|
297
|
+
-- Simple 3-table design to manage associations between model reaches and observation stations via outlets.
|
|
298
|
+
-- Compatible with DuckDB and SQLite.
|
|
299
|
+
|
|
300
|
+
-- Table 1: outlets
|
|
301
|
+
-- Represents a logical grouping that ties stations and reaches together.
|
|
302
|
+
CREATE SCHEMA IF NOT EXISTS outlets;
|
|
303
|
+
|
|
304
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_groups (
|
|
305
|
+
outlet_id INTEGER PRIMARY KEY,
|
|
306
|
+
repository_name TEXT NOT NULL,
|
|
307
|
+
outlet_name TEXT,
|
|
308
|
+
notes TEXT -- optional: general notes about the outlet grouping
|
|
309
|
+
);
|
|
310
|
+
|
|
311
|
+
-- Table 2: outlet_stations
|
|
312
|
+
-- One-to-many: outlet -> stations
|
|
313
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_stations (
|
|
314
|
+
outlet_id INTEGER NOT NULL,
|
|
315
|
+
station_id TEXT NOT NULL,
|
|
316
|
+
station_origin TEXT NOT NULL, -- e.g., 'wiski', 'equis'
|
|
317
|
+
repository_name TEXT NOT NULL, -- repository model the station is physically located in
|
|
318
|
+
true_opnid INTEGER NOT NULL, -- The specific reach the station physically sits on (optional)
|
|
319
|
+
comments TEXT, -- Per-station comments, issues, etc.
|
|
320
|
+
CONSTRAINT uq_station_origin UNIQUE (station_id, station_origin),
|
|
321
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlet_groups(outlet_id)
|
|
322
|
+
);
|
|
323
|
+
|
|
324
|
+
-- Table 3: outlet_reaches
|
|
325
|
+
-- One-to-many: outlet -> reaches
|
|
326
|
+
-- A reach can appear in multiple outlets, enabling many-to-many overall.
|
|
327
|
+
CREATE TABLE IF NOT EXISTS outlets.outlet_reaches (
|
|
328
|
+
outlet_id INTEGER NOT NULL,
|
|
329
|
+
reach_id INTEGER NOT NULL, -- model reach identifier (aka opnid)
|
|
330
|
+
repository_name TEXT NOT NULL, -- required: where the mapping comes from
|
|
331
|
+
FOREIGN KEY (outlet_id) REFERENCES outlets.outlet_groups(outlet_id)
|
|
332
|
+
);
|
|
333
|
+
|
|
334
|
+
-- Useful views:
|
|
335
|
+
|
|
336
|
+
-- View: station_reach_pairs
|
|
337
|
+
-- Derives the implicit many-to-many station <-> reach relationship via shared outlet_id
|
|
338
|
+
CREATE OR REPLACE VIEW outlets.station_reach_pairs AS
|
|
339
|
+
SELECT
|
|
340
|
+
s.outlet_id,
|
|
341
|
+
s.station_id,
|
|
342
|
+
s.station_origin,
|
|
343
|
+
r.reach_id,
|
|
344
|
+
r.repository_name
|
|
345
|
+
FROM outlets.outlet_stations AS s
|
|
346
|
+
JOIN outlets.outlet_reaches AS r
|
|
347
|
+
ON s.outlet_id = r.outlet_id;
|
|
348
|
+
|
|
349
|
+
"""
|
|
350
|
+
|
|
351
|
+
#row = modl_db.MODL_DB.iloc[0]
|
|
352
|
+
|
|
353
|
+
#info = etlWISKI.info(row['station_id'])
|
|
354
|
+
|
|
355
|
+
#modl_db.MODL_DB.query('source == "equis"')
|
|
356
|
+
|
|
357
|
+
# outlet_dict = {'stations': {'wiski': ['E66050001'],
|
|
358
|
+
# 'equis': ['S002-118']},
|
|
359
|
+
# 'reaches': {'Clearwater': [650]}
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# station_ids = ['S002-118']
|
|
365
|
+
# #station_ids = ['E66050001']
|
|
366
|
+
# reach_ids = [650]
|
|
367
|
+
# flow_station_ids = ['E66050001']
|
mpcaHydro/reports.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import duckdb
|
|
3
|
+
import glob
|
|
4
|
+
|
|
5
|
+
#TODO ensure all reports are actually in the reports schema
|
|
6
|
+
|
|
7
|
+
class reportManager():
|
|
8
|
+
def __init__(self,db_path:Path):
|
|
9
|
+
self.db_path = db_path
|
|
10
|
+
|
|
11
|
+
def wiski_qc_counts(self):
|
|
12
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
13
|
+
return wiski_qc_counts(con)
|
|
14
|
+
|
|
15
|
+
def constituent_summary(self,constituent: str = None):
|
|
16
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
17
|
+
return constituent_summary(con,constituent)
|
|
18
|
+
|
|
19
|
+
def station_reach_pairs(self):
|
|
20
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
21
|
+
return station_reach_pairs(con)
|
|
22
|
+
|
|
23
|
+
def outlet_summary(self):
|
|
24
|
+
with duckdb.connect(self.db_path,read_only=True) as con:
|
|
25
|
+
return outlet_summary(con)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def outlet_summary(con: duckdb.DuckDBPyConnection):
|
|
30
|
+
query = '''
|
|
31
|
+
SELECT *,
|
|
32
|
+
FROM
|
|
33
|
+
reports.outlet_constituent_summary
|
|
34
|
+
ORDER BY
|
|
35
|
+
outlet_id,
|
|
36
|
+
constituent
|
|
37
|
+
'''
|
|
38
|
+
df = con.execute(query).fetch_df()
|
|
39
|
+
return df
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def wiski_qc_counts(con: duckdb.DuckDBPyConnection):
|
|
43
|
+
query = '''
|
|
44
|
+
SELECT *,
|
|
45
|
+
FROM
|
|
46
|
+
reports.wiski_qc_count
|
|
47
|
+
ORDER BY
|
|
48
|
+
station_no,
|
|
49
|
+
parametertype_name
|
|
50
|
+
'''
|
|
51
|
+
df = con.execute(query).fetch_df()
|
|
52
|
+
return df
|
|
53
|
+
|
|
54
|
+
def constituent_summary(con: duckdb.DuckDBPyConnection,constituent: str = None):
|
|
55
|
+
|
|
56
|
+
query = '''
|
|
57
|
+
SELECT *,
|
|
58
|
+
FROM
|
|
59
|
+
reports.constituent_summary
|
|
60
|
+
ORDER BY
|
|
61
|
+
station_id,
|
|
62
|
+
station_origin,
|
|
63
|
+
constituent
|
|
64
|
+
'''
|
|
65
|
+
df = con.execute(query).fetch_df()
|
|
66
|
+
if constituent is not None:
|
|
67
|
+
df = df[df['constituent'] == constituent]
|
|
68
|
+
return df
|
|
69
|
+
|
|
70
|
+
def station_reach_pairs(con: duckdb.DuckDBPyConnection):
|
|
71
|
+
query = '''
|
|
72
|
+
SELECT *,
|
|
73
|
+
FROM
|
|
74
|
+
reports.station_reach_pairs
|
|
75
|
+
ORDER BY
|
|
76
|
+
outlet_id,
|
|
77
|
+
station_id
|
|
78
|
+
'''
|
|
79
|
+
df = con.execute(query).fetch_df()
|
|
80
|
+
return df
|