PyPI - mpcaHydro - Versions diffs - 2.2.6__tar.gz → 2.2.8__tar.gz - Mend

mpcaHydro 2.2.6tar.gz → 2.2.8tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mpcaHydro
-Version: 2.2.6
+Version: 2.2.8
 Summary: Python package for downloading MPCA hydrology data
 Project-URL: Homepage, https://github.com/mfratkin1/mpcaHydro
 Author-email: Mulu Fratkin <michael.fratkin@state.mn.us>

mpcahydro-2.2.8/demo.py ADDED Viewed

@@ -0,0 +1,226 @@
+#%%
+from mpcaHydro.data_manager import dataManager
+from pyhcal.repository import Repository
+from mpcaHydro import outlets
+import duckdb
+from mpcaHydro import equis, warehouse, wiski
+from hspf.hspfModel import hspfModel
+from hspf.uci import UCI
+from mpcaHydro import etlSWD
+#%%
+'''
+New approach. Directly load to warehouse from downloads.
+Store raw and processed data in warehouse. For large timeseries I could store
+as parquet files. The transformations using pandas take a bit of time. I imagine doing them
+within duckdb would be faster.
+'''
+# with warehouse.connect(db_path) as con:
+#    df = con.execute("SELECT * FROM staging.wiski").df()
+#    df = wiski.transform(df,filter_qc_codes = False)
+#%%
+# Run configuration for this demo cell: which model to load, where the DuckDB
+# warehouse file lives, and the year range later passed to wiski.download.
+model_name = 'Nemadji'
+db_path = f'C:/Users/mfratki/Documents/{model_name}.duckdb'
+start_year = 1996
+end_year = 2030
+replace = True
+filter_qc_codes = True
+equis_stations = outlets.equis_stations(model_name)
+wiski_stations = outlets.wiski_stations(model_name)
+# Credentials must never be committed to a published package. Read them from
+# the environment instead (EQUIS_USERNAME is optional, EQUIS_PASSWORD is
+# required and raises KeyError when unset). The password that previously
+# appeared here has been published and should be rotated.
+import os
+equis.connect(os.environ.get('EQUIS_USERNAME', 'MFRATKI'), password = os.environ['EQUIS_PASSWORD'])
+warehouse.init_db(db_path,reset = True)
+#%% Old approach. Store as individual processed station files, then load to warehouse
+#df_equis = equis.download(equis_stations)
+#df_wiski = wiski.download(wiski_stations,start_year = start_year, end_year = end_year)
+#%% equis
+def download_equis_data(db_path, station_ids, replace=False):
+    """Download EQuIS data for ``station_ids`` and load it into the warehouse.
+
+    The raw download is written to ``staging.equis`` and its transformed form
+    to ``analytics.equis``. When the download comes back empty nothing is
+    written and a message is printed instead.
+    """
+    with warehouse.connect(db_path, read_only=False) as con:
+        raw = equis.download(station_ids)
+        if raw.empty:
+            print('No data neccesary for HSPF calibration available from equis for stations:', station_ids)
+            return
+        # Raw rows first, then the transformed rows derived from them.
+        warehouse.load_df_to_table(con, raw, 'staging.equis', replace=replace)
+        transformed = equis.transform(raw)
+        warehouse.load_df_to_table(con, transformed, 'analytics.equis', replace=replace)
+def download_wiski_data(db_path, station_ids, replace=False):
+    """Download WISKI data for ``station_ids`` and load it into the warehouse.
+
+    The year range comes from the module-level ``start_year`` / ``end_year``
+    set earlier in this script. The raw download is written to
+    ``staging.wiski`` and its transformed form to ``analytics.wiski``; an
+    empty download only prints a message.
+    """
+    with warehouse.connect(db_path, read_only=False) as con:
+        raw = wiski.download(station_ids, start_year=start_year, end_year=end_year)
+        if raw.empty:
+            print('No data neccesary for HSPF calibration available from wiski for stations:', station_ids)
+            return
+        # Raw rows first, then the transformed rows derived from them.
+        warehouse.load_df_to_table(con, raw, 'staging.wiski', replace=replace)
+        transformed = wiski.transform(raw)
+        warehouse.load_df_to_table(con, transformed, 'analytics.wiski', replace=replace)
+# Add to warehouse from custom df. Must contain required normalized columns.
+with warehouse.connect(db_path,read_only = False) as con:
+  if replace:
+     warehouse.drop_station_id(con,station_id,station_origin='equis')
+  warehouse.add_to_table(con,df, 'staging','equis_normalized')
+  warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
+  df = equis.normalize(df.copy())
+  warehouse.add_to_table(con,df, 'staging','equis_normalized')
+  df = equis.transform(df)
+  warehouse.add_to_table(con,df, 'analytics','equis')
+#%% swd
+df = etlSWD.download(equis_stations)
+with warehouse.connect(db_path,read_only = False) as con:
+  warehouse.load_df_to_staging(con,df, 'equis_raw',replace = replace)
+  df = equis.normalize(df.copy())
+  warehouse.add_to_table(con,df, 'staging','equis_normalized')
+  df = equis.transform(df)
+  warehouse.add_to_table(con,df, 'analytics','equis')
+#%% wiski
+      if station_origin == 'wiski':
+          df = wiski.download(station_ids,start_year = start_year, end_year = end_year)
+          warehouse.load_df_to_staging(con,df, 'wiski_raw', replace = replace)
+          df = wiski.normalize(df.copy())
+          warehouse.add_to_table(con,df, 'staging','wiski_normalized')
+          df = wiski.transform(df,filter_qc_codes = filter_qc_codes)
+          warehouse.add_to_table(con,df, 'analytics','wiski') # method includes normalization
+      if station_origin == 'swd':
+          df = pd.concat([etlSWD.download(station_id) for station_id in station_ids])
+          warehouse.load_df_to_staging(con,df, 'equis_raw', replace = replace)
+          df = etlSWD.transform(df.copy())
+          warehouse.add_to_table(con,df, 'analytics','equis')
+      warehouse.update_views(con)
+with warehouse.connect(db_path) as con:
+  warehouse.update_views(con)
+#%%
+import requests
+url = 'http://ifrshiny.seas.umich.edu/mglp/'
+requests.get(url)
+db_path = 'C:/Users/mfratki/Documents/Rum.duckdb'
+modl_db.build_outlet_db(db_path)
+con = duckdb.connect(db_path)
+con.execute("SELECT * FROM station_reach_pairs").df()
+con.execute('SELECT * FROM station_reach_pairs WHERE outlet_id = 76').df()
+# Need to remove duplicates from MODL_DB
+modl_db.MODL_DB.loc[modl_db.MODL_DB.duplicated(['station_id','source'])]
+#%%
+dm = dataManager('C:/Users/mfratki/Documents/')
+dm._build_warehouse()
+equis_stations = modl_db.equis_stations('Nemadji')
+wiski_stations = modl_db.wiski_stations('Nemadji')
+#%% Old approach. Store as individual processed station files, then load to warehouse
+for station_id in equis_stations:
+    dm._download_station_data(station_id,'equis', True)
+for station_id in wiski_stations:
+    dm._download_station_data(station_id,'wiski', True)
+#%% Adding HSPF outputs to warehouse
+con = duckdb.connect(db_path)
+model_name = 'Nemadji'
+outlets = [group for _, group in modl_db.MODL_DB.query('repository_name == @model_name').groupby(by = ['opnids','repository_name'])]
+for outlet in outlets:
+    1+1
+dfs = []
+for constituent in ['Q','TSS','TP','N','OP','TKN']:
+    opnids = modl_db.split_opnids([opnid.split(',') for opnid in set(outlet['opnids'].tolist())])
+    for opnid in opnids:
+        df = mod.hbns.get_reach_constituent(constituent,opnids,time_step='h')
+        df.columns = ['value']
+        df['constituent'] = constituent
+        df['operation'] = operation
+        df['opnid'] = opnid
+        dfs.append(df)
+df = pd.concat(dfs).reset_index()
+df['model_name'] = model_name
+station_ids = ['H05018001','S006-214','S015-102']
+target_constituent = 'TSS'
+flow_constituent = 'Q'
+# build placeholders for the IN list (one ? per station id)
+placeholders = ','.join(['?'] * len(station_ids))
+sql = f'''
+SELECT o.*, f.datetime AS flow_datetime, f.value AS flow, f.baseflow, f.station_id AS flow_station_id, f.station_origin AS flow_station_origin
+FROM analytics.observations o
+JOIN analytics.observations f
+  ON o.datetime = f.datetime
+WHERE o.constituent = ?
+  AND o.station_id IN ({placeholders})
+  AND f.constituent = ?;
+'''
+# parameter order must match the ? positions in the query
+params = [target_constituent] + station_ids + [flow_constituent]
+df = con.execute(sql, params).df()
+outlet_id: station_ids
+outlet_id: opnid
+outlets = []
+for index, (_, group) in enumerate(modl_db.MODL_DB.groupby(by = ['opnids','repository_name'])):
+    group['outlet_id'] = index
+    group.reset_index(drop=True, inplace=True)
+    outlets.append(group)
+    for _, row in group.iterrows():
+        opnids = group.split_opnids(row['opnids'].str.split(',').to_list())
+        row*len(opnids)

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "mpcaHydro"
 urls = { "Homepage" = "https://github.com/mfratkin1/mpcaHydro" }  # ? Add this!
-version = "2.2.6"
+version = "2.2.8"
 dependencies = [
   "pandas",
   "requests",

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/src/mpcaHydro/outlets.py RENAMED Viewed

@@ -144,7 +144,7 @@ def get_outlets_by_reach(reach_id: int, model_name: str):
             """,
         [reach_id, model_name]).fetchdf()
     return df
 def get_outlets_by_station(station_id: str, station_origin: str):
     """
     Return all outlet rows for outlets that include the given reach_id in the given model_name.
@@ -160,6 +160,19 @@ def get_outlets_by_station(station_id: str, station_origin: str):
         [station_id, station_origin]).fetchdf()
     return df
+def get_station_opnids(station_id: str, station_origin: str):
+    """
+    Look up the reach IDs paired with a station in ``outlets.station_reach_pairs``.
+
+    Rows are matched on both ``station_id`` and ``station_origin``; the
+    ``reach_id`` column of the result is returned as a plain list.
+    """
+    query = """
+        SELECT r.reach_id
+        FROM outlets.station_reach_pairs r
+        WHERE r.station_id = ? AND r.station_origin = ?
+        """
+    bind_values = [station_id, station_origin]
+    with connect(DB_PATH) as con:
+        pairs = con.execute(query, bind_values).fetchdf()
+    reach_ids = pairs['reach_id']
+    return reach_ids.tolist()
 class OutletGateway:

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/src/mpcaHydro/pywisk.py RENAMED Viewed

@@ -13,7 +13,7 @@ import time
 CERT_PATH = str(Path(__file__).resolve().parent/'data\\wiskiweb01.pca.state.mn.us.crt')
 #TODO: Use this url to make sure web service is working https://wiskiweb01.pca.state.mn.us/
 class Service():
-    base_url = 'https://wiskiweb01.pca.state.mn.us/KiWIS/KiWIS?'
+    base_url = 'http://wiskiweb01.pca.state.mn.us/KiWIS/KiWIS?'
     base_dict = {
         'datasource': '0',
         'service': 'kisters',
@@ -30,7 +30,7 @@ class Service():
         try:
             # Using requests.head() to fetch headers is faster than requests.get()
             # as it doesn't download the full content
-            response = requests.head('https://wiskiweb01.pca.state.mn.us', timeout=timeout)
+            response = requests.head('http://wiskiweb01.pca.state.mn.us', timeout=timeout)
             # raise_for_status() raises an HTTPError for 4xx or 5xx status codes
             response.raise_for_status()
@@ -53,7 +53,7 @@ class Service():
     def _requestTypes(self):
         url = self.url({'request': 'getrequestinfo'})
-        return requests.get(url,verify=CERT_PATH).json()[0]
+        return requests.get(url).json()[0]
     def getRequests(self):
         return list(self._requestTypes()['Requests'].keys())
@@ -72,7 +72,7 @@ class Service():
     def info(self,request_type):
         url = self.url({'request': 'getrequestinfo'})
-        response = requests.get(url, verify=CERT_PATH)
+        response = requests.get(url)
         get_requests = response.json()
         return get_requests[0]['Requests'].keys()
@@ -95,7 +95,7 @@ class Service():
     def get_json(self,args_dict):
         # Download request
-        self.response = requests.get(self.url(args_dict), verify=CERT_PATH)
+        self.response = requests.get(self.url(args_dict))
         if self.response.status_code != 200:
             print('Error: ' + self.response.json()['message'])
             self.response.raise_for_status()  # raises exception when not a 2xx response

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/src/mpcaHydro/swd.py RENAMED Viewed

@@ -26,19 +26,21 @@ CONSTITUENT_MAP = {i[0]:i[1] for i in EQUIS_PARAMETER_XREF[['PARAMETER','constit
 #     return df
 import requests
-def _download(station_no):
+def _download(station_id):
     # Replace {station_no} in the URL with the actual station number
-    url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
+    #url = f"https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=json"
+    url = 'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results'
     try:
         # Send a GET request to the URL
-        response = requests.get(url)
+        params = {
+            'stationId': station_id,
+            'format': 'json'
+        }
+        response = requests.get(url,params = params)
         response.raise_for_status()  # Raise exception for HTTP errors
         # Parse the JSON data
-        if response.json()['recordCount'] == 0:
-            return pd.DataFrame(columns = response.json()['column_names'])
-        else:
-            return pd.DataFrame(response.json()['data'])
+        return pd.DataFrame(response.json()['data'])
     except requests.exceptions.RequestException as e:
         print(f"An error occurred: {e}")
@@ -46,14 +48,18 @@ def _download(station_no):
-def download(station_no):
+def download(station_ids):
+    """Download raw results for one or more stations and stack them into one frame.
+
+    ``station_ids`` may be an iterable of station ids or a single id string.
+    Each non-empty per-station frame is tagged with a ``station_id`` column
+    before concatenation. An empty DataFrame is returned when no station
+    yields any data.
+    """
     #df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')
-    df = _download(station_no)
-    if df.empty:
-        return df
-    else:
-        df['station_id'] = station_no
-        return transform(df)
+    # A bare string would otherwise be iterated character by character, which
+    # would break callers that still pass one station id as before.
+    if isinstance(station_ids, str):
+        station_ids = [station_ids]
+    dfs = []
+    for station_id in station_ids:
+        df = _download(station_id)
+        # NOTE(review): _download appears to return None when the request fails
+        # (its except branch only prints) - confirm; such stations are skipped.
+        if df is None or df.empty:
+            continue
+        df['station_id'] = station_id
+        dfs.append(df)
+    # pd.concat raises ValueError on an empty list, so hand back an empty frame.
+    if not dfs:
+        return pd.DataFrame()
+    return pd.concat(dfs, ignore_index=True)
 def info(station_no):
     #df = pd.read_csv(f'https://services.pca.state.mn.us/api/v1/surfacewater/monitoring-stations/results?stationId={station_no}&format=csv')

{mpcahydro-2.2.6 → mpcahydro-2.2.8}/src/mpcaHydro/warehouse.py RENAMED Viewed

@@ -28,6 +28,23 @@ def init_db(db_path: str,reset: bool = False):
+def validate_schemas(con: duckdb.DuckDBPyConnection):
+    """Raise ``ValueError`` when a schema the warehouse relies on is absent.
+
+    The required schemas are staging, analytics, mappings, outlets and
+    reports; the names present are read from ``information_schema.schemata``.
+    """
+    required = {'staging', 'analytics', 'mappings', 'outlets', 'reports'}
+    rows = con.execute("SELECT schema_name FROM information_schema.schemata").fetchall()
+    present = set()
+    for row in rows:
+        present.add(row[0])
+    missing_schemas = required.difference(present)
+    if not missing_schemas:
+        return
+    raise ValueError(f"Missing schemas: {missing_schemas}")
+def validate_tables(con: duckdb.DuckDBPyConnection, schema: str, expected_tables: set):
+    """Raise ``ValueError`` when ``schema`` lacks any of ``expected_tables``.
+
+    Table names are read from ``information_schema.tables`` with the schema
+    name passed as a bound parameter.
+    """
+    query = "SELECT table_name FROM information_schema.tables WHERE table_schema = ?"
+    rows = con.execute(query, [schema]).fetchall()
+    present = set()
+    for row in rows:
+        present.add(row[0])
+    missing_tables = expected_tables - present
+    if not missing_tables:
+        return
+    raise ValueError(f"Missing tables in {schema} schema: {missing_tables}")
 def create_schemas(con: duckdb.DuckDBPyConnection):
     """Create staging, analytics, hspf, and reports schemas if they do not exist."""
     con.execute(sql_loader.get_schemas_sql())
@@ -96,6 +113,44 @@ def create_mapping_tables(con: duckdb.DuckDBPyConnection):
     else:
         print(f"Warning: WISKI_QUALITY_CODES.csv not found at {wiski_qc_csv_path}")
+def attach_outlets_db(con: duckdb.DuckDBPyConnection, outlets_db_path: str):
+    """
+    Copy the tables and views of an external outlets DuckDB file into ``con``.
+
+    The file is attached under the alias ``outlets_db``, every user table is
+    copied with ``CREATE TABLE ... AS SELECT``, every user view is recreated
+    from its stored definition, and the file is detached again even when a
+    copy step fails.
+
+    Parameters:
+        con: open, writable connection to the destination database.
+        outlets_db_path: path of the DuckDB file holding the outlet definitions.
+    """
+    create_schemas(con)
+    # Double any single quote so the path cannot terminate the SQL string literal.
+    escaped_path = str(outlets_db_path).replace("'", "''")
+    # Attach exactly once: attaching the same alias a second time raises.
+    con.execute(f"ATTACH DATABASE '{escaped_path}' AS outlets_db;")
+    try:
+        # -- Step 1: Copy all tables --
+        # duckdb_tables() lists tables of every attached database, so filter on
+        # the alias and skip internal catalog entries.
+        tables = con.execute(
+            "SELECT schema_name, table_name FROM duckdb_tables() "
+            "WHERE database_name = 'outlets_db' AND NOT internal"
+        ).fetchall()
+        print(f"Tables in the source database: {tables}")
+        for schema_name, table_name in tables:
+            # Keep each table in the schema it had in the source; for tables in
+            # the default schema this matches the previous unqualified copy.
+            con.execute(f'CREATE SCHEMA IF NOT EXISTS "{schema_name}"')
+            con.execute(
+                f'CREATE TABLE "{schema_name}"."{table_name}" AS '
+                f'SELECT * FROM outlets_db."{schema_name}"."{table_name}"'
+            )
+        # -- Step 2: Copy all views --
+        # duckdb_views() exposes the CREATE VIEW text in its ``sql`` column.
+        views = con.execute(
+            "SELECT view_name, sql FROM duckdb_views() "
+            "WHERE database_name = 'outlets_db' AND NOT internal"
+        ).fetchall()
+        print(f"Views in the source database: {views}")
+        for _view_name, create_view_sql in views:
+            if not create_view_sql:
+                continue
+            # Recreate the view in the destination database (remove the `outlets_db.` prefix if it exists)
+            con.execute(create_view_sql.replace("outlets_db.", ""))
+    finally:
+        # DETACH takes the alias as an identifier, not a quoted string.
+        con.execute("DETACH outlets_db")
 def create_outlets_tables(con: duckdb.DuckDBPyConnection):
     """Create tables in the outlets schema to define outlet-station-reach relationships."""
     con.execute(sql_loader.get_outlets_schema_sql())

mpcahydro-2.2.8/tests/integration/test_dataManager.py ADDED Viewed

@@ -0,0 +1,61 @@
+#%% Imports
+from mpcaHydro.data_manager import dataManager
+from pathlib import Path
+import duckdb
+THIS_DIR = Path(__file__).parent
+WISKI_STATIONS = ['E05011002']
+EQUIS_STATIONS = ['S001-235','S005-115']
+#%%
+def test_build_warehouse():
+    dm = dataManager(THIS_DIR)
+    dm._build_warehouse()
+test_build_warehouse()
+# %%
+def test_equis_data_download():
+    """Build a fresh warehouse, connect to Oracle and download the EQuIS test stations.
+
+    Credentials are read from the environment so no secret is stored in the
+    repository: EQUIS_USERNAME is optional, EQUIS_PASSWORD is required and
+    raises KeyError when unset. The password that previously appeared here
+    has been published and should be rotated.
+    """
+    import os
+    dm = dataManager(THIS_DIR,
+                     oracle_username = os.environ.get('EQUIS_USERNAME', 'MFRATKI'),
+                     oracle_password = os.environ['EQUIS_PASSWORD'],
+                     reset=True)
+    dm.connect_to_oracle()
+    dm._download_equis_data(EQUIS_STATIONS)
+test_equis_data_download()
+#%%
+def test_wiski_data_download():
+    dm = dataManager(THIS_DIR, reset=True)
+    dm._download_wiski_data(WISKI_STATIONS)
+test_wiski_data_download()
+#%%
+dm = dataManager(THIS_DIR, reset=False)
+with duckdb.connect(dm.db_path, read_only=True) as con:
+    df = con.execute('SELECT * FROM analytics.outlet_observations').fetch_df()
+    assert(df['outlet_id'].isnull().sum() == 0)
+with duckdb.connect(dm.db_path, read_only=True) as con:
+    df = con.execute('SELECT * FROM analytics.outlet_observations_with_flow').fetch_df()
+    assert(df['outlet_id'].isnull().sum() == 0)
+    assert(df['value'].isnull().sum() == 0)
+# %%
+dm = dataManager(THIS_DIR, reset=False)
+def test_wiski_download():
+    dm = dataManager(THIS_DIR, reset=False)
+    wiski_stations = WISKI_STATIONS
+    dm._download_wiski_data(wiski_stations)
+    return dm
+test_wiski_download()
+with duckdb.connect(dm.db_path, read_only=True) as con:
+    df = con.execute('SELECT * FROM analytics.outlet_observations_with_flow').fetch_df()
+    assert(df['outlet_id'].isnull().sum() == 0)
+# %%

mpcahydro-2.2.8/tests/integration/test_warehouse.duckdb ADDED Viewed

Binary file

mpcahydro-2.2.8/tests/unit/test_equis.py ADDED Viewed

@@ -0,0 +1,19 @@
+#%%
+from mpcaHydro import equis
+from mpcaHydro import outlets
+#%%
+# Download the EQuIS stations of one model and check that normalization
+# yields the columns the rest of the pipeline expects.
+model_name = 'Rum'
+equis_stations = outlets.equis_stations(model_name)
+# Credentials come from the environment so no secret is stored in the
+# repository: EQUIS_USERNAME is optional, EQUIS_PASSWORD is required. The
+# password that previously appeared here has been published and should be rotated.
+import os
+equis.connect(os.environ.get('EQUIS_USERNAME', 'MFRATKI'), password = os.environ['EQUIS_PASSWORD'])
+df = equis.download(equis_stations)
+df_normalized = equis.normalize(df.copy())
+expected_columns = ['station_id', 'constituent', 'cas_rn', 'datetime', 'value', 'unit']
+# Name the missing columns so a failure is actionable.
+missing_columns = [col for col in expected_columns if col not in df_normalized.columns]
+assert not missing_columns, f"normalized frame is missing columns: {missing_columns}"
+# %%

mpcahydro-2.2.6/tests/pixi.toml DELETED Viewed

@@ -1,25 +0,0 @@
-[workspace]
-channels   = ["https://prefix.dev/conda-forge"]
-platforms  = ["linux-64", "osx-64", "win-64"]
-[dependencies]
-requests    = "*"
-pandas       = "*"
-time = 			"*"
-pathlib = 		"*"
-spyder = "*"
-jupyter = "*"
-[package]
-name        = "mpcaHydro"
-version     = "0.1.0"
-[package.build]
-backend     = { name = "pixi-build-python", version = "0.1.*" }
-[package.run-dependencies]
-requests    = "*"
-pandas       = "*"
-time = 			"*"
-pathlib = 		"*"