ecopipeline 0.7.7__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ecopipeline-0.7.7/src/ecopipeline.egg-info → ecopipeline-0.8.1}/PKG-INFO +1 -1
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/setup.cfg +1 -1
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/__init__.py +2 -1
- ecopipeline-0.8.1/src/ecopipeline/event_tracking/__init__.py +2 -0
- ecopipeline-0.8.1/src/ecopipeline/event_tracking/event_tracking.py +253 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/load/load.py +15 -6
- {ecopipeline-0.7.7 → ecopipeline-0.8.1/src/ecopipeline.egg-info}/PKG-INFO +1 -1
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline.egg-info/SOURCES.txt +2 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/LICENSE +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/README.md +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/pyproject.toml +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/setup.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/extract/__init__.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/extract/extract.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/load/__init__.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/transform/__init__.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/transform/bayview.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/transform/lbnl.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/transform/transform.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/utils/ConfigManager.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/utils/__init__.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/utils/unit_convert.py +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline.egg-info/dependency_links.txt +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline.egg-info/requires.txt +0 -0
- {ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline.egg-info/top_level.txt +0 -0
{ecopipeline-0.7.7 → ecopipeline-0.8.1}/setup.cfg

@@ -1,6 +1,6 @@
 [metadata]
 name = ecopipeline
-version = 0.7.7
+version = 0.8.1
 authors = ["Carlos Bello, <bellocarlos@seattleu.edu>, Emil Fahrig <fahrigemil@seattleu.edu>, Casey Mang <cmang@seattleu.edu>, Julian Harris <harrisjulian@seattleu.edu>, Roger Tram <rtram@seattleu.edu>, Nolan Price <nolan@ecotope.com>"]
 description = Contains functions for use in Ecotope Datapipelines
 long_description = file: README.md
{ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/__init__.py

@@ -1,5 +1,6 @@
 from .utils.ConfigManager import ConfigManager
 from . import extract
 from . import transform
+from . import event_tracking
 from . import load
-__all__ = ['extract', 'transform', 'load', 'ConfigManager']
+__all__ = ['extract', 'transform', 'event_tracking', 'load', 'ConfigManager']
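With `event_tracking` imported in the package root and added to `__all__`, the new module rides along with the existing subpackages. A minimal sketch of the 0.8.1 import surface; the dotted-module import on the second line targets the new file directly, since the contents of the two-line `event_tracking/__init__.py` are not shown in this diff:

```python
import ecopipeline
from ecopipeline.event_tracking.event_tracking import flag_boundary_alarms

# The package root now exposes event_tracking alongside extract, transform and load.
print(ecopipeline.__all__)
# ['extract', 'transform', 'event_tracking', 'load', 'ConfigManager']
```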
ecopipeline-0.8.1/src/ecopipeline/event_tracking/event_tracking.py (new file)

@@ -0,0 +1,253 @@
+import pandas as pd
+import numpy as np
+import datetime as dt
+from ecopipeline import ConfigManager
+
+def flag_boundary_alarms(df: pd.DataFrame, daily_df : pd.DataFrame, config : ConfigManager, default_fault_time : int = 15, site: str = "") -> pd.DataFrame:
+    """
+    Function will take a pandas dataframe and location of alarm information in a csv,
+    and create an dataframe with applicable alarm events
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        post-transformed dataframe for minute data
+    daily_df: pd.DataFrame
+        post-transformed dataframe for daily data
+    config : ecopipeline.ConfigManager
+        The ConfigManager object that holds configuration data for the pipeline. Among other things, this object will point to a file
+        called Varriable_Names.csv in the input folder of the pipeline (e.g. "full/path/to/pipeline/input/Variable_Names.csv").
+        The file must have at least three columns which must be titled "variable_name", "low_alarm", and "high_alarm" which should contain the
+        name of each variable in the dataframe that requires the alarming, the lower bound for acceptable data, and the upper bound for
+        acceptable data respectively
+    default_fault_time : int
+        Number of consecutive minutes that a sensor must be out of bounds for to trigger an alarm. Can be customized for each variable with
+        the fault_time column in Varriable_Names.csv
+    site: str
+        string of site name if processing a particular site in a Variable_Names.csv file with multiple sites. Leave as an empty string if not aplicable.
+
+    Returns
+    -------
+    pd.DataFrame:
+        Pandas dataframe with alarm events
+    """
+    variable_names_path = config.get_var_names_path()
+    try:
+        bounds_df = pd.read_csv(variable_names_path)
+    except FileNotFoundError:
+        print("File Not Found: ", variable_names_path)
+        return pd.DataFrame()
+
+    if (site != ""):
+        if not 'site' in bounds_df.columns:
+            raise Exception("site parameter is non null, however, site is not present in Variable_Names.csv")
+        bounds_df = bounds_df.loc[bounds_df['site'] == site]
+
+    required_columns = ["variable_name", "high_alarm", "low_alarm"]
+    for required_column in required_columns:
+        if not required_column in bounds_df.columns:
+            raise Exception(f"{required_column} is not present in Variable_Names.csv")
+    if not 'pretty_name' in bounds_df.columns:
+        bounds_df['pretty_name'] = bounds_df['variable_name']
+    if not 'fault_time' in bounds_df.columns:
+        bounds_df['fault_time'] = default_fault_time
+    bounds_df = bounds_df.loc[:, ["variable_name", "high_alarm", "low_alarm", "fault_time", "pretty_name"]]
+    bounds_df.dropna(axis=0, thresh=2, inplace=True)
+    bounds_df.set_index(['variable_name'], inplace=True)
+    # ensure that lower and upper bounds are numbers
+    bounds_df['high_alarm'] = pd.to_numeric(bounds_df['high_alarm'], errors='coerce').astype(float)
+    bounds_df['low_alarm'] = pd.to_numeric(bounds_df['low_alarm'], errors='coerce').astype(float)
+    bounds_df['fault_time'] = pd.to_numeric(bounds_df['fault_time'], errors='coerce').astype('Int64')
+    bounds_df = bounds_df[bounds_df.index.notnull()]
+    alarms = {}
+    for bound_var, bounds in bounds_df.iterrows():
+        if bound_var in df.columns:
+            lower_mask = df[bound_var] < bounds["low_alarm"]
+            upper_mask = df[bound_var] > bounds["high_alarm"]
+            if pd.isna(bounds['fault_time']):
+                bounds['fault_time'] = default_fault_time
+            for day in daily_df.index:
+                next_day = day + pd.Timedelta(days=1)
+                # low alert
+                low_filtered_df = lower_mask.loc[(lower_mask.index >= day) & (lower_mask.index < next_day)]
+                low_consecutive_condition = low_filtered_df.rolling(window=bounds["fault_time"]).min() == 1
+                if low_consecutive_condition.any():
+                    first_true_index = low_consecutive_condition.idxmax()
+                    adjusted_time = first_true_index - pd.Timedelta(minutes=bounds["fault_time"]-1)
+                    alarm_string = f"Lower bound alarm for {bounds['pretty_name']} (first one at {adjusted_time.strftime('%H:%M')})."
+                    if day in alarms:
+                        alarms[day].append([bound_var, alarm_string])
+                    else:
+                        alarms[day] = [[bound_var, alarm_string]]
+                # high alert
+                up_filtered_df = upper_mask.loc[(upper_mask.index >= day) & (upper_mask.index < next_day)]
+                up_consecutive_condition = up_filtered_df.rolling(window=bounds["fault_time"]).min() == 1
+                if up_consecutive_condition.any():
+                    first_true_index = up_consecutive_condition.idxmax()
+                    adjusted_time = first_true_index - pd.Timedelta(minutes=bounds["fault_time"]-1)
+                    alarm_string = f"Upper bound alarm for {bounds['pretty_name']} (first one at {adjusted_time.strftime('%H:%M')})."
+                    if day in alarms:
+                        alarms[day].append([bound_var, alarm_string])
+                    else:
+                        alarms[day] = [[bound_var, alarm_string]]
+    events = {
+        'start_time_pt' : [],
+        'end_time_pt' : [],
+        'event_type' : [],
+        'event_detail' : [],
+        'variable_name' : []
+    }
+    for key, value_list in alarms.items():
+        for value in value_list:
+            events['start_time_pt'].append(key)
+            events['end_time_pt'].append(key)
+            events['event_type'].append('SILENT_ALARM')
+            events['event_detail'].append(value[1])
+            events['variable_name'].append(value[0])
+
+    event_df = pd.DataFrame(events)
+    event_df.set_index('start_time_pt', inplace=True)
+    return event_df
+
+# def flag_dhw_outage(df: pd.DataFrame, daily_df : pd.DataFrame, dhw_outlet_column : str, supply_temp : int = 110, consecutive_minutes : int = 15) -> pd.DataFrame:
+#     """
+#     Parameters
+#     ----------
+#     df : pd.DataFrame
+#         Single pandas dataframe of sensor data on minute intervals.
+#     daily_df : pd.DataFrame
+#         Single pandas dataframe of sensor data on daily intervals.
+#     dhw_outlet_column : str
+#         Name of the column in df and daily_df that contains temperature of DHW supplied to building occupants
+#     supply_temp : int
+#         the minimum DHW temperature acceptable to supply to building occupants
+#     consecutive_minutes : int
+#         the number of minutes in a row that DHW is not delivered to tenants to qualify as a DHW Outage
+
+#     Returns
+#     -------
+#     event_df : pd.DataFrame
+#         Dataframe with 'ALARM' events on the days in which there was a DHW Outage.
+#     """
+#     # TODO edge case for outage that spans over a day
+#     events = {
+#         'start_time_pt' : [],
+#         'end_time_pt' : [],
+#         'event_type' : [],
+#         'event_detail' : [],
+#     }
+#     mask = df[dhw_outlet_column] < supply_temp
+#     for day in daily_df.index:
+#         next_day = day + pd.Timedelta(days=1)
+#         filtered_df = mask.loc[(mask.index >= day) & (mask.index < next_day)]
+
+#         consecutive_condition = filtered_df.rolling(window=consecutive_minutes).min() == 1
+#         if consecutive_condition.any():
+#             # first_true_index = consecutive_condition['supply_temp'].idxmax()
+#             first_true_index = consecutive_condition.idxmax()
+#             adjusted_time = first_true_index - pd.Timedelta(minutes=consecutive_minutes-1)
+#             events['start_time_pt'].append(day)
+#             events['end_time_pt'].append(next_day - pd.Timedelta(minutes=1))
+#             events['event_type'].append("ALARM")
+#             events['event_detail'].append(f"Hot Water Outage Occured (first one starting at {adjusted_time.strftime('%H:%M')})")
+#     event_df = pd.DataFrame(events)
+#     event_df.set_index('start_time_pt', inplace=True)
+#     return event_df
+
+# def generate_event_log_df(config : ConfigManager):
+#     """
+#     Creates an event log df based on user submitted events in an event log csv
+#     Parameters
+#     ----------
+#     config : ecopipeline.ConfigManager
+#         The ConfigManager object that holds configuration data for the pipeline.
+
+#     Returns
+#     -------
+#     event_df : pd.DataFrame
+#         Dataframe formatted from events in Event_log.csv for pipeline.
+#     """
+#     event_filename = config.get_event_log_path()
+#     try:
+#         event_df = pd.read_csv(event_filename)
+#         event_df['start_time_pt'] = pd.to_datetime(event_df['start_time_pt'])
+#         event_df['end_time_pt'] = pd.to_datetime(event_df['end_time_pt'])
+#         event_df.set_index('start_time_pt', inplace=True)
+#         return event_df
+#     except Exception as e:
+#         print(f"Error processing file {event_filename}: {e}")
+#         return pd.DataFrame({
+#             'start_time_pt' : [],
+#             'end_time_pt' : [],
+#             'event_type' : [],
+#             'event_detail' : [],
+#         })
+
+
+
+# def create_data_statistics_df(df: pd.DataFrame) -> pd.DataFrame:
+#     """
+#     Function must be called on the raw minute data df after the rename_varriables() and before the ffill_missing() function has been called.
+#     The function returns a dataframe indexed by day. Each column will expanded to 3 columns, appended with '_missing_mins', '_avg_gap', and
+#     '_max_gap' respectively. the columns will carry the following statisctics:
+#     _missing_mins -> the number of minutes in the day that have no reported data value for the column
+#     _avg_gap -> the average gap (in minutes) between collected data values that day
+#     _max_gap -> the maximum gap (in minutes) between collected data values that day
+
+#     Parameters
+#     ----------
+#     df : pd.DataFrame
+#         minute data df after the rename_varriables() and before the ffill_missing() function has been called
+
+#     Returns
+#     -------
+#     daily_data_stats : pd.DataFrame
+#         new dataframe with the columns descriped in the function's description
+#     """
+#     min_time = df.index.min()
+#     start_day = min_time.floor('D')
+
+#     # If min_time is not exactly at the start of the day, move to the next day
+#     if min_time != start_day:
+#         start_day = start_day + pd.tseries.offsets.Day(1)
+
+#     # Build a complete minutely timestamp index over the full date range
+#     full_index = pd.date_range(start=start_day,
+#                                end=df.index.max().floor('D') - pd.Timedelta(minutes=1),
+#                                freq='T')
+
+#     # Reindex to include any completely missing minutes
+#     df_full = df.reindex(full_index)
+
+#     # Resample daily to count missing values per column
+#     total_missing = df_full.isna().resample('D').sum().astype(int)
+
+#     # Function to calculate max consecutive missing values
+#     def max_consecutive_nans(x):
+#         is_na = x.isna()
+#         groups = (is_na != is_na.shift()).cumsum()
+#         return is_na.groupby(groups).sum().max() or 0
+
+#     # Function to calculate average consecutive missing values
+#     def avg_consecutive_nans(x):
+#         is_na = x.isna()
+#         groups = (is_na != is_na.shift()).cumsum()
+#         gap_lengths = is_na.groupby(groups).sum()
+#         gap_lengths = gap_lengths[gap_lengths > 0]
+#         if len(gap_lengths) == 0:
+#             return 0
+#         return gap_lengths.mean()
+
+#     # Apply daily, per column
+#     max_consec_missing = df_full.resample('D').apply(lambda day: day.apply(max_consecutive_nans))
+#     avg_consec_missing = df_full.resample('D').apply(lambda day: day.apply(avg_consecutive_nans))
+
+#     # Rename columns to include a suffix
+#     total_missing = total_missing.add_suffix('_missing_mins')
+#     max_consec_missing = max_consec_missing.add_suffix('_max_gap')
+#     avg_consec_missing = avg_consec_missing.add_suffix('_avg_gap')
+
+#     # Concatenate along columns (axis=1)
+#     combined_df = pd.concat([total_missing, max_consec_missing, avg_consec_missing], axis=1)
+
+#     return combined_df
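For context, here is a hedged sketch of how the new function might be wired into a pipeline run. Only the `flag_boundary_alarms` signature and the Variable_Names.csv column requirements come from the diff; the ConfigManager construction, the sensor name `Temp_CityWater`, and the sample data below are illustrative assumptions.

```python
import pandas as pd
from ecopipeline import ConfigManager
from ecopipeline.event_tracking.event_tracking import flag_boundary_alarms

# Hypothetical setup: the ConfigManager constructor arguments are not shown in this diff,
# so treat this as a placeholder for however the pipeline already builds its config object.
config = ConfigManager("full/path/to/pipeline/config.ini")  # assumed argument

# Minute-interval sensor data indexed by timestamp (illustrative values).
idx = pd.date_range("2024-01-01", periods=2 * 24 * 60, freq="min")
df = pd.DataFrame({"Temp_CityWater": 45.0}, index=idx)
# 31 consecutive minutes below a plausible low_alarm bound of 33:
df.loc["2024-01-01 08:00":"2024-01-01 08:30", "Temp_CityWater"] = 20.0

# The daily dataframe only needs a daily index for the function's per-day loop.
daily_df = pd.DataFrame(index=pd.date_range("2024-01-01", periods=2, freq="D"))

# Variable_Names.csv (pointed to by config.get_var_names_path()) would need at least:
# variable_name,low_alarm,high_alarm
# Temp_CityWater,33,90

events = flag_boundary_alarms(df, daily_df, config, default_fault_time=15)
# Expected: one SILENT_ALARM row per day and variable that stayed out of bounds
# for at least fault_time consecutive minutes, indexed by start_time_pt.
print(events[["event_type", "event_detail", "variable_name"]])
```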
{ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline/load/load.py

@@ -319,8 +319,8 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
     site_name = config.get_site_name()
     column_names = f"start_time_pt,site_name"
     column_types = ["datetime","varchar(25)","datetime",
-                    "ENUM('MISC_EVENT','DATA_LOSS','DATA_LOSS_COP','SITE_VISIT','SYSTEM_MAINTENANCE','EQUIPMENT_MALFUNCTION','PARTIAL_OCCUPANCY','INSTALLATION_ERROR','ALARM','MV_COMMISSIONED','PLANT_COMMISSIONED','INSTALLATION_ERROR_COP','SOO_PERIOD','SOO_PERIOD_COP','SYSTEM_TESTING')",
-                    "varchar(
+                    "ENUM('MISC_EVENT','DATA_LOSS','DATA_LOSS_COP','SITE_VISIT','SYSTEM_MAINTENANCE','EQUIPMENT_MALFUNCTION','PARTIAL_OCCUPANCY','INSTALLATION_ERROR','ALARM','SILENT_ALARM','MV_COMMISSIONED','PLANT_COMMISSIONED','INSTALLATION_ERROR_COP','SOO_PERIOD','SOO_PERIOD_COP','SYSTEM_TESTING')",
+                    "varchar(800)"]
     column_list = ['end_time_pt','event_type', 'event_detail']
     if not set(column_list).issubset(event_df.columns):
         raise Exception(f"event_df should contain a dataframe with columns start_time_pt, end_time_pt, event_type, and event_detail. Instead, found dataframe with columns {event_df.columns}")
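This schema tweak is what lets the new events land in the database: the event_type ENUM gains a 'SILENT_ALARM' member and the event_detail column grows to varchar(800). A small sketch of an event row the widened schema would now accept (the sensor name and detail text are illustrative; the variable_name column is introduced by the load.py hunks that follow):

```python
import pandas as pd

# One event row shaped the way load_event_table expects it: indexed by start_time_pt,
# using the ENUM member added in 0.8.1 and a detail string of the kind event_detail
# now stores as varchar(800).
row = pd.DataFrame(
    {
        "end_time_pt": [pd.Timestamp("2024-01-01")],
        "event_type": ["SILENT_ALARM"],  # newly accepted by the ENUM
        "event_detail": ["Lower bound alarm for City Water Temp (first one at 08:00)."],
        "variable_name": ["Temp_CityWater"],  # optional column handled by the hunks below
    },
    index=pd.DatetimeIndex([pd.Timestamp("2024-01-01")], name="start_time_pt"),
)
```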
@@ -329,21 +329,26 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
         column_names += "," + column
 
     # create SQL statement
-    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,'"+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"
+    insert_str = "INSERT INTO " + table_name + " (" + column_names + ", variable_name, last_modified_date, last_modified_by) VALUES (%s,%s,%s,%s,%s,%s,'"+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"','automatic_upload')"
 
+    if not 'variable_name' in event_df.columns:
+        event_df['variable_name'] = None
     # add aditional columns for db creation
     full_column_names = column_names.split(",")[1:]
     full_column_names.append('last_modified_date')
     full_column_names.append('last_modified_by')
+    full_column_names.append('variable_name')
     full_column_types = column_types[1:]
     full_column_types.append('datetime')
     full_column_types.append('varchar(60)')
+    full_column_types.append('varchar(70)')
 
 
     existing_rows = pd.DataFrame({
         'start_time_pt' : [],
         'end_time_pt' : [],
         'event_type' : [],
+        'variable_name' : [],
         'last_modified_by' : []
     })
 
@@ -358,9 +363,9 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
     try:
         # find existing times in database for upsert statement
         cursor.execute(
-            f"SELECT id, start_time_pt, end_time_pt, event_type, last_modified_by FROM {table_name} WHERE start_time_pt >= '{event_df.index.min()}' AND site_name = '{site_name}'")
+            f"SELECT id, start_time_pt, end_time_pt, event_detail, event_type, variable_name, last_modified_by FROM {table_name} WHERE start_time_pt >= '{event_df.index.min()}' AND site_name = '{site_name}'")
         # Fetch the results into a DataFrame
-        existing_rows = pd.DataFrame(cursor.fetchall(), columns=['id','start_time_pt', 'end_time_pt', 'event_type', 'last_modified_by'])
+        existing_rows = pd.DataFrame(cursor.fetchall(), columns=['id','start_time_pt', 'end_time_pt', 'event_detail', 'event_type', 'variable_name', 'last_modified_by'])
         existing_rows['start_time_pt'] = pd.to_datetime(existing_rows['start_time_pt'])
         existing_rows['end_time_pt'] = pd.to_datetime(existing_rows['end_time_pt'])
 
@@ -371,7 +376,7 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
     ignoredRows = 0
     try:
         for index, row in event_df.iterrows():
-            time_data = [index,site_name,row['end_time_pt'],row['event_type'],row['event_detail']]
+            time_data = [index,site_name,row['end_time_pt'],row['event_type'],row['event_detail'],row['variable_name']]
             #remove nans and infinites
             time_data = [None if (x is None or pd.isna(x)) else x for x in time_data]
             time_data = [None if (x == float('inf') or x == float('-inf')) else x for x in time_data]
@@ -379,6 +384,10 @@ def load_event_table(config : ConfigManager, event_df: pd.DataFrame, site_name :
                 (existing_rows['start_time_pt'] == index) &
                 (existing_rows['event_type'] == row['event_type'])
             ]
+            if not time_data[-1] is None and not filtered_existing_rows.empty:
+                filtered_existing_rows = filtered_existing_rows[(filtered_existing_rows['variable_name'] == row['variable_name']) &
+                                                                (filtered_existing_rows['event_detail'] == row['event_detail'])]
+
             if not filtered_existing_rows.empty:
                 first_matching_row = filtered_existing_rows.iloc[0] # Retrieves the first row
                 statement, values = _generate_mysql_update_event_table(row, first_matching_row['id'])
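Taken together, the load.py changes thread the new variable_name column through the INSERT statement, the table definition, and the duplicate check, so alarm events produced by `flag_boundary_alarms` can be upserted per variable. A hedged sketch of the hand-off, reusing `df`, `daily_df`, and `config` from the earlier sketch; whether `load_event_table`'s `site_name` parameter has a default is not visible in this diff, though the function reassigns it from config at line 319 regardless:

```python
from ecopipeline.event_tracking.event_tracking import flag_boundary_alarms
from ecopipeline.load.load import load_event_table

# Build the SILENT_ALARM event dataframe: indexed by start_time_pt with
# end_time_pt, event_type, event_detail and the new variable_name columns.
event_df = flag_boundary_alarms(df, daily_df, config)

if not event_df.empty:
    # Upsert into the events table. Per the hunks above: a missing variable_name
    # column would be filled with None, and rows that do carry a variable_name are
    # matched against existing rows on variable_name + event_detail before the
    # function decides between an UPDATE and an INSERT.
    load_event_table(config, event_df)
```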
{ecopipeline-0.7.7 → ecopipeline-0.8.1}/src/ecopipeline.egg-info/SOURCES.txt

@@ -9,6 +9,8 @@ src/ecopipeline.egg-info/SOURCES.txt
 src/ecopipeline.egg-info/dependency_links.txt
 src/ecopipeline.egg-info/requires.txt
 src/ecopipeline.egg-info/top_level.txt
+src/ecopipeline/event_tracking/__init__.py
+src/ecopipeline/event_tracking/event_tracking.py
 src/ecopipeline/extract/__init__.py
 src/ecopipeline/extract/extract.py
 src/ecopipeline/load/__init__.py
The remaining files listed above with +0 -0 are unchanged between 0.7.7 and 0.8.1.