PyPI - mpcaHydro - Versions diffs - 2.0.0__py3-none-any.whl - Mend

mpcaHydro 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

mpcaHydro/WISKI.py +351 -0
mpcaHydro/__init__.py +0 -0
mpcaHydro/data_manager.py +321 -0
mpcaHydro/etlCSG.py +88 -0
mpcaHydro/etlSWD.py +187 -0
mpcaHydro/etlWISKI.py +555 -0
mpcaHydro/etlWPLMN.py +104 -0
mpcahydro-2.0.0.dist-info/METADATA +15 -0
mpcahydro-2.0.0.dist-info/RECORD +10 -0
mpcahydro-2.0.0.dist-info/WHEEL +4 -0

mpcaHydro/WISKI.py ADDED Viewed

@@ -0,0 +1,351 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 10 16:18:03 2023
+@author: mfratki
+"""
+import requests
+import pandas as pd
+import time
+#TODO: Use this url to make sure web service is working https://wiskiweb01.pca.state.mn.us/
+class Service():
+    """Minimal client for the KiWIS query web service at ``base_url``.
+    Builds query URLs from argument dictionaries and converts the JSON
+    responses into pandas DataFrames.
+    """
+    base_url = 'http://wiskiweb01.pca.state.mn.us/KiWIS/KiWIS?'
+    # Arguments sent with every request; per-call arguments override them.
+    base_dict = {
+        'datasource': '0',
+        'service': 'kisters',
+        'type': 'queryServices',
+        'format': 'json'}
+    def __init__(self):
+        #TODO: store request types in a file and load them here to avoid making a request when the class is instantiated
+        self._url = None   # last URL built by url()
+        self._args = None  # last expanded argument dict sent through get()
+    def _requestTypes(self):
+        """Fetch the service's ``getrequestinfo`` document (one HTTP request per call)."""
+        url = self.url({'request': 'getrequestinfo'})
+        return requests.get(url).json()[0]
+    def getRequests(self):
+        """Return the names of the request types the service advertises."""
+        return list(self._requestTypes()['Requests'].keys())
+    def queryfields(self,request_type):
+        """Return the query field names listed for ``request_type``."""
+        return list(self._requestTypes()['Requests'][request_type]['QueryFields']['Content'].keys())
+    def returnfields(self,request_type):
+        """Return the return field names listed for ``request_type``."""
+        return list(self._requestTypes()['Requests'][request_type]['Returnfields']['Content'].keys())
+    def optionalfields(self,request_type):
+        """Return the optional field names listed for ``request_type``."""
+        return list(self._requestTypes()['Requests'][request_type]['Optionalfields']['Content'].keys())
+    def formats(self,request_type):
+        """Return the output format names listed for ``request_type``."""
+        return list(self._requestTypes()['Requests'][request_type]['Formats']['Content'].keys())
+    def info(self,request_type):
+        """Return the keys view of all advertised request types.
+        NOTE(review): ``request_type`` is accepted but not used; confirm
+        whether per-request details were intended here.
+        """
+        return self._requestTypes()['Requests'].keys()
+    def url(self,args_dict):
+        """Build the request URL from ``base_dict`` updated with ``args_dict``.
+        Arguments whose value is None are left out and list values are
+        joined with commas. Values are inserted as-is (no percent-encoding).
+        The result is also remembered in ``self._url``.
+        """
+        merged = self.base_dict | args_dict
+        pairs = []
+        for key, value in merged.items():
+            if value is None:
+                continue
+            if isinstance(value, list):
+                value = ','.join(str(item) for item in value)
+            pairs.append(f'{key}={value}')
+        url = self.base_url + '&'.join(pairs)
+        self._url = url
+        return url
+    def df(self,args_dict):
+        """Send the request described by ``args_dict`` and return the result as a DataFrame.
+        For ``getTimeseriesValues`` every returned series becomes one frame
+        (its remaining metadata keys are added as constant columns) and the
+        frames are concatenated. For every other request the first JSON row
+        is used as the header. An empty payload yields an empty DataFrame.
+        Raises ``requests.HTTPError`` when the response is not a plain 200.
+        """
+        response = requests.get(self.url(args_dict))
+        response.raise_for_status()  # raises exception when not a 2xx response
+        if response.status_code != 200:
+            # Previously this printed a message and returned the int 1, which
+            # callers then treated as a DataFrame; fail loudly instead.
+            raise requests.HTTPError(f'Unexpected status code {response.status_code}', response=response)
+        payload = response.json()
+        if not payload:
+            return pd.DataFrame()
+        if args_dict['request'] in ['getTimeseriesValues']:
+            frames = []
+            for series in payload:
+                frame = pd.DataFrame(series['data'],columns = series['columns'].split(','))
+                # Everything except the value table itself is per-series metadata.
+                for key, value in series.items():
+                    if key not in ('data','rows','columns'):
+                        frame[key] = value
+                frames.append(frame)
+            return pd.concat(frames)
+        return pd.DataFrame(payload[1:], columns = payload[0])
+    def get(self,args):
+        """Validate ``args['request']``, fill in unspecified fields and run the request.
+        Every query field and optional field the service lists for the
+        request type is added with value None unless ``args`` supplies it, so
+        ``self._args`` records the full argument set. Returns the DataFrame
+        produced by ``df``. Raises AssertionError for an unknown request type.
+        """
+        request_type = args['request']
+        # One catalogue download instead of three identical ones.
+        catalogue = self._requestTypes()['Requests']
+        if request_type not in catalogue:
+            # Explicit raise (same exception type as the former assert) so
+            # the check is not stripped under ``python -O``.
+            raise AssertionError(f'Unknown request type: {request_type!r}')
+        spec = catalogue[request_type]
+        _args = {field: None for field in spec['QueryFields']['Content']} | {field: None for field in spec['Optionalfields']['Content']}
+        args = {**_args, **args}
+        self._args = args
+        return self.df(args)
+    def _filter(self,args):
+        '''
+        Unfinished placeholder for a filter that ensures not too many values are requested
+        and determines the proper division given the number of timeseries, timeseries
+        length, and timeseries sampling interval ('minute','hour','daily').
+        ``args`` is currently ignored; with the hard-coded sizes below the method returns None.
+        '''
+        MAX_OUTPUT = 240000 #True max output is 250,000 but giving myself a bit of a buffer
+        n_timeseries = 1
+        n_years = 1
+        #1 timeseries for 1 year
+        n_values = 60*24*365*n_timeseries*n_years
+        if n_values < MAX_OUTPUT :
+            return 0
+        #TODO: split the request into roughly n_values/MAX_OUTPUT pieces
+        return None
+'''
+Potential use cases:
+1. timeseries for a given ts_id
+2. All timeseries for a given station
+3. All timeseries for a given parameter
+4. All timeseries for a given huc_id
+5. All timeseries of a given resolution
+'''
+class pyWISK():
+    """Convenience wrapper around ``Service`` for station, timeseries-list and timeseries-value queries."""
+    def __init__(self):
+        self.service = Service()
+    def get(self,args_dict):
+        """Forward a raw argument dictionary to ``Service.get``."""
+        return self.service.get(args_dict)
+    def get_ts(self,
+               ts_ids = None,
+               huc_id = None,
+               station_nos = None,
+               parametertype_id = None,
+               parameter_no = None,
+               start_date = '1996-01-01',
+               end_date = '2050-12-31',
+               stationgroup_id = None,
+               timezone = 'UTC'):
+        """Download timeseries values as a DataFrame.
+        When ``ts_ids`` is None the ids are looked up through ``get_ts_ids``
+        using ``station_nos``, ``huc_id``, ``parametertype_id`` and
+        ``stationgroup_id``; an empty DataFrame is returned if none are found.
+        NOTE(review): ``parameter_no`` is accepted but not used.
+        """
+        if ts_ids is None:
+            print('Determing Timeseries IDs')
+            ts_list = self.get_ts_ids(station_nos,huc_id,parametertype_id,stationgroup_id = stationgroup_id)
+            # get_ts_ids returns a DataFrame; the request needs the bare ids.
+            ts_ids = ts_list['ts_id'].to_list()
+            print('Done!')
+            if not ts_ids:
+                return pd.DataFrame()
+        args = {'request':'getTimeseriesValues',
+                'ts_id' : ts_ids,
+                'from': start_date,
+                'to': end_date,
+                'returnfields': ['Timestamp', 'Value', 'Quality Code','Quality Code Name'],
+                'metadata': 'true',
+                'md_returnfields': ['ts_unitsymbol',
+                                    'ts_name',
+                                    'ts_id',
+                                    'station_no',
+                                    'station_name',
+                                    'station_latitude',
+                                    'station_longitude',
+                                    'parametertype_id',
+                                    'parametertype_name',
+                                    'stationparameter_no',
+                                    'stationparameter_name'],
+                'timezone':timezone,
+                'ca_sta_returnfields': ['stn_HUC12','stn_EQuIS_ID']}
+        return self.service.get(args)
+    def get_stations(self,
+                     huc_id = None,
+                     parametertype_id = None,
+                     stationgroup_id = None,
+                     stationparameter_no = None,
+                     station_no = None,
+                     returnfields = None):
+        """Return the station list as a DataFrame.
+        ``returnfields`` extends the always-requested 'ca_sta', 'station_no'
+        and 'station_name'. When ``huc_id`` is given, only rows whose
+        'stn_HUC12' starts with it are kept (rows without a HUC are dropped).
+        """
+        extra = [] if returnfields is None else list(returnfields)
+        # dict.fromkeys removes duplicates while keeping a stable order.
+        returnfields = list(dict.fromkeys(['ca_sta','station_no','station_name'] + extra))
+        args ={'request': 'getStationList',
+               'stationparameter_no': stationparameter_no,
+               'stationgroup_id': stationgroup_id,
+               'parametertype_id': parametertype_id,
+               'station_no': station_no,
+               'returnfields': returnfields,
+               'ca_sta_returnfields': ['stn_HUC12','stn_EQuIS_ID','stn_AUID','hydrounit_title','hydrounit_no','NearestTown']
+               }
+        df = self.service.get(args)
+        if huc_id is not None:
+            # na=False: a missing HUC would otherwise put NaN into the mask and make .loc fail.
+            df = df.loc[df['stn_HUC12'].str.startswith(huc_id, na=False)]
+        return df
+    def get_ts_ids(self,
+                   station_nos=None,
+                   huc_id = None,
+                   parametertype_id = None,
+                   stationparameter_no = None,
+                   stationgroup_id = None,
+                   ts_name = None,
+                   returnfields = None):
+        """Return the timeseries list (one row per timeseries) as a DataFrame.
+        When ``station_nos`` is None the stations are first resolved through
+        ``get_stations``. ``huc_id`` and ``stationgroup_id`` are only used in
+        that lookup and are ignored when ``station_nos`` is supplied.
+        """
+        if station_nos is None:
+            station_nos = self.get_stations(huc_id,parametertype_id,stationgroup_id,stationparameter_no)['station_no'].to_list()
+        if returnfields is None:
+            returnfields = ['ts_id','ts_name','ca_sta','station_no',
+                             'ts_unitsymbol',
+                             'parametertype_id','parametertype_name',
+                             'station_latitude','station_longitude',
+                             'stationparameter_no','stationparameter_name',
+                             'station_name',
+                             'coverage','ts_density']
+        args ={'request': 'getTimeseriesList',
+               'station_no': station_nos,
+               'parametertype_id': parametertype_id,
+               'stationparameter_no': stationparameter_no,
+               'ts_name' : ts_name,
+               'returnfields': returnfields,
+               'ca_sta_returnfields': ['stn_HUC12','stn_EQuIS_ID','stn_AUID']}
+        return self.service.get(args)
+    def get_wplmn(self,station_nos):
+        """Download the '20.Day.Mean' series of the mapped load, concentration and flow parameters.
+        Returns the concatenated values of every matching timeseries, or an
+        empty DataFrame when the stations have none.
+        """
+        PARAMETERS_MAP={'5004':'TP Load',
+                        '5005':'TP Conc',
+                        '5014':'TSS Load',
+                        '5015':'TSS Conc',
+                        '5024':'N Load',
+                        '5025':'N Conc',
+                        '5034':'OP Load',
+                        '5035':'OP Conc',
+                        '5044':'TKN Load',
+                        '5045':'TKN Conc',
+                        '262' :'Flow'}
+        # NOTE(review): stationgroup_id has no effect here because station_nos is supplied.
+        ts_ids = self.get_ts_ids(station_nos = station_nos,
+                          stationgroup_id = '1319204',
+                          stationparameter_no = list(PARAMETERS_MAP.keys()),
+                          ts_name = ['20.Day.Mean'])
+        if len(ts_ids) == 0:
+            print('No WPLMN Sites Available')
+            return pd.DataFrame()
+        dfs = []
+        for ts_id in ts_ids['ts_id']:
+            dfs.append(self.get_ts(ts_id))
+            time.sleep(1)  # pause between consecutive requests
+        return pd.concat(dfs)
+    # CONSTITUENT_NAME_NO = {'Q'  :['262'],#,'263'],
+    #                        'WT' :['450'],# , '451' , '450.42','451.42'],
+    #                        'OP' :['863'   ,'5034'  ,'5035'],
+    #                        'DO' :['865'   ,'866'   , '867'],
+    #                        'TP' :['5005'  ,'5004'],
+    #                        'TSS':['5014' ,'5015'],
+    #                        'N'  :['5024'  ,'5025'],
+    #                        'TKN':['5044' ,'5045']}
+    # TS_NAME_SELECTOR = {'Q':{'daily':['20.Day.Mean.Archive','20.Day.Mean'],
+    #       'unit': ['15.Rated','08.Provisional.Edited']},
+    #  'WT':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'TSS':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'N':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'TKN':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'TP':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'OP':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']},
+    #  'DO':{'daily':['20.Day.Mean','20.Day.Mean'],
+    #        'unit': ['09.Archive','08.Provisional.Edited']}}
+    # def extract(self,station_nos,constituent,resolution):
+    #     ts_names = self.TS_NAME_SELECTOR[constituent][resolution]
+    #     data = self.get_ts_ids(station_no = station_nos,stationparameter_no = self.CONSTITUENT_NAME_NO[constituent],ts_name =ts_names)
+    #     # Filter by MPCA distinction between internal and external sites and how time series are named
+    #     ts_ids = pd.concat([data.loc[(data['station_no'].str.startswith('E')) & (data['ts_name'] == ts_names[1])],
+    #                         data.loc[(~data['station_no'].str.startswith('E')) & (data['ts_name'] == ts_names[0])]])
+    #     dfs = [self.get_ts(ts_ids = ts_id) for ts_id in ts_ids['ts_id']]
+    #     data = pd.concat(dfs)
+    #     return data
+# nutrient
+#     -NO3NO2
+#     -OP
+#     -NH3
+#     -TP
+#     -DO
+#     -CHla
+# temperature
+# flow
+# test = pyWISK()
+# df = test.get_ts(ts_ids = 424663010)
+# df = test.get_ts(station_nos = 'W25060001')
+# df = test.get_wplmn(huc8_id = '07020005')
+# df = test.get_ts(huc_id = '07010205',stationgroup_id = '1319204',parametertype_id = 11500)

mpcaHydro/__init__.py ADDED Viewed

File without changes

mpcaHydro/data_manager.py ADDED Viewed

@@ -0,0 +1,321 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  3 10:01:14 2022
+@author: mfratki
+"""
+import pandas as pd
+#from abc import abstractmethod
+from pathlib import Path
+from mpcaHydro import etlWISKI, etlSWD#, etlEQUIS
+#
+'''
+Q
+WT
+TSS
+N
+TKN
+OP
+TP
+CHLA
+DO
+class Station
+- id
+- name
+- source
+- data
+'''
+# Station cross-reference table, read at import time from WISKI_EQUIS_XREF.csv
+# located in the directory above this package.
+# NOTE(review): the wheel's file list shows no such csv; confirm the file is
+# shipped, otherwise importing this module fails when the file is missing.
+WISKI_EQUIS_XREF = pd.read_csv(Path(__file__).parent.parent/'WISKI_EQUIS_XREF.csv')
+#WISKI_EQUIS_XREF = pd.read_csv('C:/Users/mfratki/Documents/GitHub/hspf_tools/WISKI_EQUIS_XREF.csv')
+# Aggregation function used when resampling a series, keyed by unit.
+# 'ft' is included so the 'WL' default unit in UNIT_DEFAULTS has an entry
+# (its absence made water-level queries fail with KeyError).
+AGG_DEFAULTS = {'cfs':'mean',
+                'mg/l':'mean',
+                'degF': 'mean',
+                'ft': 'mean',
+                'lb':'sum'}
+# Default unit per constituent abbreviation.
+UNIT_DEFAULTS = {
+    'Q': 'cfs',
+    **dict.fromkeys(('TSS', 'TP', 'OP', 'TKN', 'N'), 'mg/l'),
+    'WT': 'degF',
+    'WL': 'ft',
+}
+# VALID_UNITS = {'Q': 'cfs',
+#                  'TSS': 'mg/l','lb',
+#                  'TP' : 'mg/l',
+#                  'OP' : 'mg/l',
+#                  'TKN': 'mg/l',
+#                  'N'  : 'mg/l',
+#                  'WT' : 'degF',
+#                  'WL' : 'ft'}
+def are_lists_identical(nested_list):
+    """Return True when every sublist holds the same items, ignoring order.
+    An empty ``nested_list`` counts as identical.
+    """
+    canonical = list(map(sorted, nested_list))
+    if not canonical:
+        return True
+    reference = canonical[0]
+    for candidate in canonical:
+        if candidate != reference:
+            return False
+    return True
+class dataManager():
+    """Download per-station observation data to csv files in one folder and query them.
+    Loaded tables are kept in ``self.data`` keyed by csv file stem
+    (``<station_id>`` or ``<station_id>_wplmn``).
+    """
+    _SOURCES = ('wiski','equis','swd','wplmn')
+    _CONSTITUENTS = ('Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA')
+    def __init__(self,folderpath):
+        self.data = {}  # in-memory cache: csv file stem -> DataFrame
+        self.folderpath = Path(folderpath)
+    @classmethod
+    def _check_constituent(cls,constituent):
+        # Explicit raise (same type as the former assert) so validation survives ``python -O``.
+        if constituent not in cls._CONSTITUENTS:
+            raise AssertionError(f'Unknown constituent: {constituent!r}')
+    @classmethod
+    def _check_source(cls,source):
+        # Without this check an unknown source would silently be handled as another source.
+        if source not in cls._SOURCES:
+            raise AssertionError(f'Unknown source: {source!r}')
+    def get_wiski_stations(self):
+        """Return the unique WISKI station numbers of the cross-reference table."""
+        return list(WISKI_EQUIS_XREF['WISKI_STATION_NO'].unique())
+    def get_equis_stations(self):
+        """Return the unique EQuIS station ids of the cross-reference table."""
+        return list(WISKI_EQUIS_XREF['EQUIS_STATION_ID'].unique())
+    def wiski_equis_alias(self,wiski_station_id):
+        """Return the single 'WISKI_EQUIS_ID' recorded for a WISKI station.
+        Returns an empty list when none is recorded and raises RuntimeError
+        when more than one distinct id is found.
+        """
+        matches = WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'WISKI_EQUIS_ID']
+        equis_ids = [equis_id for equis_id in set(matches.to_list()) if not pd.isna(equis_id)]
+        if len(equis_ids) == 0:
+            return []
+        if len(equis_ids) > 1:
+            # The former bare ``raise`` had no active exception and surfaced as an
+            # uninformative RuntimeError; same type, now with the actual reason.
+            raise RuntimeError(f'Too Many Equis Stations for {wiski_station_id}')
+        return equis_ids[0]
+    def wiski_equis_associations(self,wiski_station_id):
+        """Return the non-missing EQuIS station ids associated with a WISKI station."""
+        equis_ids = WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['WISKI_STATION_NO'] == wiski_station_id,'EQUIS_STATION_ID'].unique()
+        return [equis_id for equis_id in equis_ids if not pd.isna(equis_id)]
+    def equis_wiski_associations(self,equis_station_id):
+        """Return the non-missing WISKI station numbers associated with an EQuIS station."""
+        wiski_ids = WISKI_EQUIS_XREF.loc[WISKI_EQUIS_XREF['EQUIS_STATION_ID'] == equis_station_id,'WISKI_STATION_NO'].unique()
+        return [wiski_id for wiski_id in wiski_ids if not pd.isna(wiski_id)]
+    def _equis_wiski_associations(self,equis_station_ids):
+        """Return the WISKI stations shared by all given EQuIS stations, or [] when they differ."""
+        wiski_stations = [self.equis_wiski_associations(equis_station_id) for equis_station_id in equis_station_ids]
+        if not wiski_stations:
+            return []  # no input stations: nothing to index below
+        if are_lists_identical(wiski_stations):
+            return wiski_stations[0]
+        return []
+    def _download_station_data(self,station_id,station_origin,overwrite=False):
+        """Download a station together with its cross-referenced counterpart(s).
+        For 'wiski' and 'wplmn' origins the station itself is downloaded and
+        then its EQuIS alias (if any) from 'swd'. For the other origins the
+        station is downloaded as 'equis' followed by every associated WISKI station.
+        """
+        self._check_source(station_origin)
+        if station_origin in ('wiski','wplmn'):
+            self.download_station_data(station_id,station_origin,overwrite = overwrite)
+            equis_alias = self.wiski_equis_alias(station_id)
+            # wiski_equis_alias returns [] when there is no alias; skip instead of
+            # requesting a station literally named '[]'.
+            if not isinstance(equis_alias, list):
+                self.download_station_data(equis_alias,'swd',overwrite = overwrite)
+        else:
+            self.download_station_data(station_id,'equis',overwrite = overwrite)
+            # The association lookup returns a list; download each station on its own.
+            for wiski_station in self.equis_wiski_associations(station_id):
+                self.download_station_data(wiski_station,'wiski',overwrite = overwrite)
+    def download_station_data(self,station_id,source,folderpath=None,overwrite = False):
+        """Download one station from ``source`` and save it as ``<save_name>.csv``.
+        ``save_name`` is the station id, with '_wplmn' appended for the 'wplmn'
+        source. Nothing is downloaded when the file exists and ``overwrite`` is
+        False. Non-empty results are written to ``folderpath`` (default: the
+        manager's folder) and cached under ``save_name``.
+        """
+        self._check_source(source)
+        station_id = str(station_id)
+        save_name = station_id + '_wplmn' if source == 'wplmn' else station_id
+        folderpath = self.folderpath if folderpath is None else Path(folderpath)
+        target = folderpath.joinpath(save_name + '.csv')
+        if target.exists() and not overwrite:
+            print (f'{station_id} data already downloaded')
+            return
+        if source == 'wiski':
+            data = etlWISKI.download(station_id)
+        elif source in ('swd','equis'):
+            # NOTE(review): 'equis' is served by the SWD downloader as well.
+            data = etlSWD.download(station_id)
+        else:
+            data = etlWISKI.download(station_id,wplmn=True)
+        if len(data) > 0:
+            data.to_csv(target)
+            # Keyed by file stem so WPLMN data cannot overwrite the WISKI entry
+            # of the same station (and matches the key used by _load).
+            self.data[save_name] = data
+        else:
+            print(f'No {source} calibration cata available at Station {station_id}')
+    def _load(self,station_id):
+        """Read ``<station_id>.csv`` from the manager's folder, cache it and return it."""
+        df =  pd.read_csv(self.folderpath.joinpath(station_id + '.csv'),
+                          index_col='datetime',
+                          parse_dates=['datetime'],
+                          dtype={'station_id': str, 'value': float, 'variable': str,'constituent':str,'unit':str})
+        self.data[station_id] = df
+        return df
+    def load(self,station_id):
+        """Return the cached table for ``station_id``, reading the csv on a cache miss."""
+        try:
+            return self.data[station_id]
+        except KeyError:
+            return self._load(station_id)
+    def info(self,constituent):
+        """Return the number of values per (station_id, constituent) over every csv in the folder.
+        NOTE(review): ``constituent`` is accepted but not used.
+        """
+        return pd.concat([self._load(file.stem) for file in self.folderpath.iterdir() if file.suffix == '.csv'])[['station_id','constituent','value']].groupby(by = ['station_id','constituent']).count()
+    def get_wplmn_data(self,station_id,constituent,unit = 'mg/l', agg_period = 'YE', samples_only = True):
+        """Return the mean WPLMN value per ``agg_period`` for one constituent and unit.
+        Reads ``<station_id>_wplmn.csv``. With ``samples_only`` only rows whose
+        'quality_id' equals 3 are used. The result is a one-column ('value')
+        DataFrame without empty periods.
+        """
+        self._check_constituent(constituent)
+        dfsub = self._load(station_id + '_wplmn')
+        if samples_only:
+            dfsub = dfsub.loc[dfsub['quality_id'] == 3]
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                              (dfsub['unit'] == unit),
+                              ['value','data_format','source']]
+        df = dfsub[['value']].resample(agg_period).agg('mean')
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+        return df['value'].to_frame().dropna()
+    def get_data(self,station_id,constituent,agg_period = 'D'):
+        """Single-station convenience wrapper around ``_get_data``."""
+        return self._get_data([station_id],constituent,agg_period)
+    def _get_data(self,station_ids,constituent,agg_period = 'D'):
+        '''
+        Returns the observations of one constituent, pooled over the given stations
+        and aggregated per period.
+        Parameters
+        ----------
+        station_ids : list of STR
+            Station IDs (csv file stems) whose tables are concatenated.
+        constituent : STR
+            Constituent abbreviation. Accepted values:
+                'Q', 'TSS', 'TP', 'OP', 'TKN', 'N', 'WT', 'DO', 'WL', 'CHLA'
+            The unit is taken from UNIT_DEFAULTS, which has no entry for 'DO'
+            and 'CHLA' (KeyError).
+        agg_period : STR, optional
+            Resampling period passed to ``DataFrame.resample``. The default is 'D'.
+        Returns
+        -------
+        df : pandas.DataFrame
+            One column ('value') without empty periods. No metadata columns are returned.
+        '''
+        self._check_constituent(constituent)
+        unit = UNIT_DEFAULTS[constituent]
+        # Units without an explicit entry are averaged.
+        agg_func = AGG_DEFAULTS.get(unit, 'mean')
+        dfsub = pd.concat([self.load(station_id) for station_id in station_ids]) # Check cache
+        dfsub = dfsub.loc[(dfsub['constituent'] == constituent) &
+                              (dfsub['unit'] == unit),
+                              ['value','data_format','source']]
+        df = dfsub[['value']].resample(agg_period).agg(agg_func)
+        df.attrs['unit'] = unit
+        df.attrs['constituent'] = constituent
+        if df.empty:
+            return df
+        return df['value'].to_frame().dropna()
+def validate_constituent(constituent):
+    """Raise AssertionError unless ``constituent`` is a supported abbreviation."""
+    # Explicit raise (same type as the former assert) so the check is not
+    # stripped when Python runs with -O.
+    if constituent not in ['Q','TSS','TP','OP','TKN','N','WT','DO','WL','CHLA']:
+        raise AssertionError(f'Unknown constituent: {constituent!r}')
+def validate_unit(unit):
+    """Raise AssertionError unless ``unit`` is a supported unit string."""
+    # 'ft' is accepted because UNIT_DEFAULTS uses it for 'WL'. Explicit raise
+    # (same type as the former assert) so the check survives Python's -O mode.
+    if unit not in ['mg/l','lb','cfs','degF','ft']:
+        raise AssertionError(f'Unknown unit: {unit!r}')
+# class database():
+#     def __init__(self,db_path):
+#         self.dbm = MonitoringDatabase(db_path)
+#     def get_timeseries(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[constituent]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_timeseries(station_id,constituent)
+#     def get_samples(self,station_ds, constituent,agg_period):
+#         validate_constituent(constituent)
+#         unit = UNIT_DEFAULTS[constituent]
+#         agg_func = AGG_DEFAULTS[unit]
+#         return odm.get_sample(station_id,constituent)
+#     def get_samples_and_timeseries(self,station_ds, constituent,agg_period)