PyPI - anomaly-pipeline - Versions diffs - 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl - Mend

anomaly-pipeline 0.1.27 (py3-none-any.whl) → 0.1.61 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

anomaly_pipeline/__init__.py +73 -1
anomaly_pipeline/helpers/DB_scan.py +144 -10
anomaly_pipeline/helpers/MAD.py +45 -0
anomaly_pipeline/helpers/Preprocessing.py +274 -73
anomaly_pipeline/helpers/STD.py +64 -0
anomaly_pipeline/helpers/__init__.py +13 -1
anomaly_pipeline/helpers/evaluation_info.py +25 -17
anomaly_pipeline/helpers/evaluation_plots.py +636 -30
anomaly_pipeline/helpers/ewma.py +105 -7
anomaly_pipeline/helpers/fb_prophet.py +150 -2
anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
anomaly_pipeline/helpers/iso_forest_general.py +5 -3
anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
anomaly_pipeline/helpers/percentile.py +46 -3
anomaly_pipeline/main.py +158 -39
anomaly_pipeline/pipeline.py +106 -34
anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
{anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
{anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
{anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0

anomaly_pipeline/helpers/evaluation_plots.py CHANGED Viewed

@@ -6,7 +6,7 @@ import plotly.express as px
 def initialize_fig(group, group_columns, variable, date_column, anomaly_detection_model):
-    plot_title = "  --  ".join(list(group[group_columns].values[0])).upper() + "  --  " + anomaly_detection_model
+    plot_title = " - ".join(list(group[group_columns].values[0])).upper() + "  --  " + anomaly_detection_model
     fig = go.Figure()
@@ -19,6 +19,21 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
         name=variable if variable == variable.upper() else variable.title(),
     ))
+    # --- Calculate X-Axis Padding (One Period) ---
+    dates = group[date_column].sort_values()
+    min_date = dates.min()
+    max_date = dates.max()
+    if len(dates) > 1:
+        # Calculate the most common time difference to determine the period
+        period = dates.diff().mode().iloc[0]
+    else:
+        period = pd.Timedelta(days=1)
+    # Apply padding
+    range_min = min_date - period
+    range_max = max_date + period
     fig.update_layout(
         title=dict(
                 text=plot_title,
@@ -34,7 +49,7 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
         plot_bgcolor='snow',
         paper_bgcolor='whitesmoke',
         xaxis=dict(
-            range=[group[date_column].min(), group[date_column].max()],
+            range=[range_min, range_max],
             showline=True,
             linewidth=0.5,
             linecolor='orange',
@@ -52,7 +67,7 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
             mirror=True
             ),
         yaxis_title=dict(
-            text=variable if variable == variable.upper() else variable.title(),
+            text=variable.replace('_', ' ') if variable == variable.upper() else variable.title().replace('_', ' '),
             font=dict(size=16, weight='bold', color='black')
             ),
         legend=dict(
@@ -139,16 +154,45 @@ def add_eval_period_highlight(fig, group, date_column, variable, eval_period):
             color='rgba(0, 255, 0, 0.25)', # 'lime' with 0.25 alpha
             width=10
         ),
-        name='Evalution Period',
+        name='Evaluation Period',
         hoverinfo='skip',
     ))
     return fig
-def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=12, show_anomaly_scores_on_main_plot=False):
+def anomaly_overview_plot(group, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False):
     # IS ANOMALY Plot
     # This is the main plot
+    """
+    Generates an ensemble anomaly evaluation plot using Plotly.
+    This function aggregates multiple anomaly detection models (columns starting with 'is_'
+    and ending with '_anomaly') to create a consensus 'Anomaly Score'. It visualizes
+    actual values, mean, median, and highlights points where the ensemble of models
+    agrees there is an anomaly.
+    Args:
+        group (pd.DataFrame): The processed dataframe containing original data and
+            boolean anomaly flags from various models (e.g., 'is_FB_anomaly').
+        group_columns (list): List of column names used to identify the group
+            (e.g., ['Region', 'Product']).
+        variable (str): The name of the numeric column being analyzed.
+        date_column (str): The name of the datetime column.
+        eval_period (int): The number of recent periods evaluated. Required; has no default.
+        show_anomaly_scores_on_main_plot (bool, optional): If True, adds a secondary
+            Y-axis bar chart showing the normalized ensemble score (-100 to 100).
+            Defaults to False.
+    Logic:
+        - Voting: Counts all columns matching 'is_*_anomaly'.
+        - is_Anomaly: True if >= 50% of the active models flag the point.
+        - Anomaly Score: A normalized metric where 100 represents total consensus
+          among all models and negative values represent low-risk points.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     try:
         group = group.copy()
@@ -157,24 +201,20 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
             if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_anomaly':
                 anomaly_cols.append(col)
         group['Anomaly Vote Models'] = group.apply(
-            lambda row: sorted([col.removeprefix('is_').removesuffix('_anomaly')
+            lambda row: ([
+                'IF' if 'IsolationForest' in col else
+                'PCNTL' if 'Percentile' in col else
+                col.removeprefix('is_').removesuffix('_anomaly')
                 for col in anomaly_cols
-                if pd.notna(row[col]) and row[col] == True]),
+                if pd.notna(row[col]) and row[col] == True
+            ]),
             axis=1)
         group['Anomaly Vote Models'] = group['Anomaly Vote Models'].apply(lambda x: ', '.join(x))
-        group['Anomaly_Votes'] = group[anomaly_cols].sum(axis=1).astype(int)
-        group['Vote_Cnt'] = group[anomaly_cols].replace(False, True).sum(axis=1).astype(int)
-        group['Anomaly_Votes_Display'] = group['Anomaly_Votes'].astype(str) + ' out of ' + group['Vote_Cnt'].astype(str)
-        group['is_Anomaly'] = np.where(group['Anomaly_Votes']/group['Vote_Cnt'] >= 0.5, True, False)
-        group['Anomaly_Score'] = 2 * (group['Anomaly_Votes']/group['Vote_Cnt'] - 0.5).astype(float)
-        group['Anomaly_Score_Display'] = np.where(group['Anomaly_Score'] < 0, np.floor(100*group['Anomaly_Score']),
-                                                  np.where(group['Anomaly_Score'] > 0, np.ceil(100*group['Anomaly_Score']),
-                                                  1)).astype(float)
         group['Mean'] = group[variable].mean()
         group['Median'] = group[variable].median()
-        fig = initialize_fig(group, group_columns, variable, date_column, "Anomalies")
+        fig = initialize_fig(group, group_columns, variable, date_column, "Anomalies Overview Plot")
         # Mean
         fig.add_trace(go.Scatter(
             x=group[date_column],
@@ -202,7 +242,11 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
             x=group[group['is_Anomaly'] == True][date_column],
             y=group[group['is_Anomaly'] == True][variable],
             mode='markers',
-            marker=dict(color='red', symbol='circle', line=dict(width=1), size=5*(group[group['is_Anomaly'] == True]['Anomaly_Score'] + 2)),
+            marker=dict(color='crimson',
+                        symbol='circle',
+                        line=dict(width=1),
+                        size=10*(group[group['is_Anomaly'] == True]['Anomaly_Votes']) ** (1/4)
+                       ),
             name='Anomalies',
             customdata=group[group['is_Anomaly'] == True][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
             hovertemplate=(
@@ -222,7 +266,8 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
             marker=dict(color='orange',
                         symbol='circle',
                         line=dict(width=1),
-                        size=5*(group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)]['Anomaly_Score'] + 2)),
+                        size=8*(group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)]['Anomaly_Votes']) ** (1/4)
+                       ),
             name='Not Quite Anomalies',
             customdata=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
             hovertemplate=(
@@ -234,6 +279,22 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
                 )
             ))
+        # Not Anomalies
+        fig.add_trace(go.Scatter(
+            x=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][date_column],
+            y=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][variable],
+            mode='markers',
+            marker=dict(color='lightgray',
+                        symbol='circle',
+                        line=dict(width=0),
+                        size=6),
+            name='Normal',
+            customdata=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
+            hovertemplate=(
+                f'Date: %{{x|%Y-%m-%d}}<br>' +
+                f'{variable if variable == variable.upper() else variable.title()}: %{{y:,d}}<br>'
+                )))
         # Add Anomaly Scores to Secondary Axis
         if show_anomaly_scores_on_main_plot:
             fig.add_trace(go.Bar(
@@ -274,8 +335,38 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
         print(f"Anomaly Plot Failed: {e}")
-def anomaly_percentile_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_percentile_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
     # Percentile Model Plot
+    """
+    Visualizes anomaly detection based on Percentile-derived thresholds.
+    This function plots the time-series data alongside shaded regions representing
+    the upper and lower percentile boundaries. It highlights specific 'Percentile'
+    model anomalies and can optionally overlay the final consensus anomalies.
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            calculated percentile columns ('Percentile_low', 'Percentile_high',
+            and 'is_Percentile_anomaly').
+        group_columns (list): Column names used for grouping/title identification.
+        variable (str): The numeric column name being plotted on the Y-axis.
+        date_column (str): The datetime column name for the X-axis.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) on top of the model-specific markers.
+            Defaults to True.
+        eval_period (int): The look-back period used for the evaluation
+            context. Required; has no default.
+    Logic:
+        - Shading: Uses `add_anomaly_region` to fill the area beyond 'Percentile_low'
+          and 'Percentile_high'.
+        - Model Markers: Highlights points where 'is_Percentile_anomaly' is True.
+        - Integration: Uses helper functions `initialize_fig`, `add_anomaly_region`,
+          and `add_model_anomalies` to maintain a consistent UI/UX.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     try:
         group = group.copy()
         fig = initialize_fig(group, group_columns, variable, date_column, "Percentile Anomaly Detection")
@@ -294,8 +385,41 @@ def anomaly_percentile_plot(group, group_columns, variable, date_column, final_a
         print(f"Percentile Anomaly Plot Failed: {e}")
-def anomaly_sd_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_sd_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
     # SD Model Plot
+    """
+    Visualizes anomaly detection based on Standard Deviation (SD) thresholds.
+    This function plots the time-series data and overlays shaded regions representing
+    statistical boundaries (typically 2 or 3 standard deviations from the mean).
+    It identifies 'SD' model-specific anomalies and can optionally display the
+    final ensemble consensus markers.
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            calculated SD boundary columns ('SD2_low', 'SD2_high', and
+            'is_SD_anomaly').
+        group_columns (list): Column names used for grouping/title identification.
+        variable (str): The numeric column name being plotted on the Y-axis.
+        date_column (str): The datetime column name for the X-axis.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) on top of the SD model markers.
+            Defaults to True.
+        eval_period (int): The look-back period used for the evaluation
+            context. Required; has no default.
+    Logic:
+        - Shading: Utilizes `add_anomaly_region` to fill the areas outside the
+          'SD2_low' and 'SD2_high' thresholds, visually representing the
+          statistical "outlier zones."
+        - Model Markers: Highlights points where the SD model specifically
+          triggered an anomaly flag.
+        - Visualization Helpers: Relies on `initialize_fig`, `add_anomaly_region`,
+          and `add_model_anomalies` for UI consistency across the pipeline.
+    Returns:
+        None: Displays an interactive Plotly figure and prints a newline.
+    """
     try:
         group = group.copy()
         fig = initialize_fig(group, group_columns, variable, date_column, "SD Anomaly Detection")
@@ -314,8 +438,41 @@ def anomaly_sd_plot(group, group_columns, variable, date_column, final_anomalies
         print(f"SD Anomaly Plot Failed: {e}")
-def anomaly_mad_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_mad_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
     # MAD Model Plot
+    """
+    Visualizes anomaly detection based on Median Absolute Deviation (MAD).
+    MAD is a robust measure of statistical dispersion. This plot displays the
+    time-series data with shaded thresholds derived from the median and
+    the MAD scale factor. It is particularly effective for datasets where
+    mean and standard deviation are heavily skewed by extreme outliers.
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            calculated MAD boundary columns ('MAD_low', 'MAD_high', and
+            'is_MAD_anomaly').
+        group_columns (list): Column names used for grouping/title identification.
+        variable (str): The numeric column name being plotted on the Y-axis.
+        date_column (str): The datetime column name for the X-axis.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) on top of the MAD model markers.
+            Defaults to True.
+        eval_period (int): The look-back period used for the evaluation
+            context. Required; has no default.
+    Logic:
+        - Shading: Highlights the areas outside the 'MAD_low' and 'MAD_high'
+          thresholds. Because MAD uses the median as a baseline, these bands
+          are often tighter and more resistant to outlier-driven "threshold bloat."
+        - Model Markers: Specifically plots points flagged by the 'is_MAD_anomaly'
+          logic.
+        - Helper Integration: Uses `initialize_fig` for layout and `add_anomalies`
+          for consensus overlay.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     try:
         group = group.copy()
         fig = initialize_fig(group, group_columns, variable, date_column, "MAD Anomaly Detection")
@@ -334,7 +491,38 @@ def anomaly_mad_plot(group, group_columns, variable, date_column, final_anomalie
         print(f"MAD Anomaly Plot Failed: {e}")
-def anomaly_iqr_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_iqr_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
+    """
+    Visualizes anomaly detection based on the Interquartile Range (IQR).
+    This function utilizes the Tukey's Fences method to identify outliers. It
+    calculates the spread between the 25th (Q1) and 75th (Q3) percentiles to
+    establish 'Normal' bounds. It is highly effective for skewed data as it
+    does not assume a normal distribution.
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            calculated IQR boundary columns ('IQR_low', 'IQR_high', and
+            'is_IQR_anomaly').
+        group_columns (list): Column names used for grouping/title identification.
+        variable (str): The numeric column name being plotted on the Y-axis.
+        date_column (str): The datetime column name for the X-axis.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) on top of the IQR-specific markers.
+            Defaults to True.
+        eval_period (int): The look-back period used for the evaluation
+            context. Required; has no default.
+    Logic:
+        - Shading: Fills the region below Q1 - 1.5*IQR and above Q3 + 1.5*IQR.
+        - Robustness: Because it uses quartiles rather than mean/SD, it is
+          resistant to being "fooled" by the outliers it is trying to detect.
+        - Consistency: Uses the standard suite of helpers (`initialize_fig`,
+          `add_anomaly_region`) to match the rest of the pipeline's visual style.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     # IQR Model Plot
     try:
         group = group.copy()
@@ -354,7 +542,38 @@ def anomaly_iqr_plot(group, group_columns, variable, date_column, final_anomalie
         print(f"IQR Anomaly Plot Failed: {e}")
-def anomaly_ewma_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_ewma_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
+    """
+    Visualizes anomaly detection based on Exponentially Weighted Moving Average (EWMA).
+    This plot highlights anomalies using a moving baseline that gives more weight to
+    recent observations. It visualizes the EWMA forecast line, the calculated upper
+    and lower control limits (bands), and model-specific outliers. It is ideal for
+    detecting shifts in mean or variance in non-stationary time series.
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            EWMA-specific columns ('EWMA_forecast', 'EWMA_low', 'EWMA_high',
+            and 'is_EWMA_anomaly').
+        group_columns (list): Column names used for grouping and plot titles.
+        variable (str): The name of the target numeric column.
+        date_column (str): The name of the datetime column.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) on top of the EWMA markers.
+            Defaults to True.
+        eval_period (int): The number of recent periods evaluated.
+            Used for context in title or scaling. Required; has no default.
+    Logic:
+        - Forecast Line: Displays the weighted moving average ('slateblue').
+        - Dynamic Thresholds: Visualizes 'EWMA_low' and 'EWMA_high' as 'orangered'
+          dashdot lines with light red shading in the outlier zones.
+        - Model Markers: Highlights points where the EWMA logic specifically
+          triggered an anomaly flag.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     # EWMA Model Plot
     try:
         group = group.copy()
@@ -425,7 +644,39 @@ def anomaly_ewma_plot(group, group_columns, variable, date_column, final_anomali
         print(f"EWMA Anomaly Plot Failed: {e}")
-def anomaly_fb_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_fb_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
+    """
+    Visualizes anomaly detection using the Facebook Prophet (FB) model.
+    This function displays the Prophet model's additive trend and seasonality
+    forecasts along with its uncertainty intervals (yhat_upper and yhat_lower).
+    It is particularly useful for identifying anomalies in data with strong
+    seasonality (weekly/yearly) that simpler statistical models might miss.
+    Args:
+        group (pd.DataFrame): Dataframe containing Prophet output columns
+            ('FB_forecast', 'FB_low', 'FB_high', and 'is_FB_anomaly').
+        group_columns (list): Column names used to identify and title the group.
+        variable (str): The name of the target numeric column analyzed.
+        date_column (str): The name of the datetime column.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) over the Prophet markers.
+            Defaults to True.
+        eval_period (int): The number of recent periods analyzed.
+            Required; has no default.
+    Logic:
+        - Recursive Visibility: Since FB Prophet is run in a walk-forward manner,
+          the shaded regions represent the prediction interval at the time
+          of forecast.
+        - Outlier Zones: Shaded red areas represent values that fall outside
+          the model's expected confidence interval (based on `prophet_CI`).
+        - Model Markers: Highlights points where Prophet specifically flagged
+          an anomaly based on its trend and seasonal expectations.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     # FB Prophet Model Plot
     try:
         group = group.copy()
@@ -493,10 +744,42 @@ def anomaly_fb_plot(group, group_columns, variable, date_column, final_anomalies
         fig.show()
         print("\n")
     except Exception as e:
-        print(f"EWMA Anomaly Plot Failed: {e}")
+        print(f"FB Anomaly Plot Failed: {e}")
+def anomaly_dbscan_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
+    """
+    Visualizes anomaly detection using the DBSCAN clustering algorithm.
+    DBSCAN identifies anomalies as 'noise' points that reside in low-density
+    regions of the feature space. Unlike threshold-based methods, DBSCAN
+    looks for multi-dimensional patterns. This plot highlights points
+    flagged as noise by the algorithm, contextually placed within the
+    time-series trend.
-def anomaly_dbscan_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+    Args:
+        group (pd.DataFrame): Dataframe containing the time-series data and
+            DBSCAN results (specifically the 'is_DBSCAN_anomaly' column).
+        group_columns (list): Column names used to identify and title the group.
+        variable (str): The name of the target numeric column analyzed.
+        date_column (str): The name of the datetime column.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) over the DBSCAN markers.
+            Defaults to True.
+        eval_period (int): The number of recent periods to highlight
+            as the evaluation window. Required; has no default.
+    Logic:
+        - Density Clustering: Points are flagged as anomalies if they are
+          isolated from the main "clusters" of data points in the feature space.
+        - Eval Period Highlight: Uses `add_eval_period_highlight` to visually
+          distinguish the recent testing window from the historical training data.
+        - Model Markers: Highlights specific DBSCAN outliers using 'mediumorchid'
+          circles.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     # DBSCAN Model Plot
     try:
         group = group.copy()
@@ -521,7 +804,38 @@ def anomaly_dbscan_plot(group, group_columns, variable, date_column, final_anoma
         print(f"DBSCAN Anomaly Plot Failed: {e}")
-def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
+def anomaly_isolation_forest_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
+    """
+    Visualizes anomaly detection using the Isolation Forest algorithm.
+    Isolation Forest is an unsupervised learning algorithm that isolates anomalies
+    by randomly selecting a feature and a split value. Since anomalies are few
+    and different, they are easier to isolate (shorter path length in the tree).
+    This plot shows points identified as anomalies based on this branching logic.
+    Args:
+        group (pd.DataFrame): Dataframe containing time-series data and
+            Isolation Forest results (specifically 'is_IsolationForest_anomaly').
+        group_columns (list): Column names used to identify and title the group.
+        variable (str): The name of the target numeric column analyzed.
+        date_column (str): The name of the datetime column.
+        final_anomalies (bool, optional): If True, overlays the final ensemble
+            consensus markers (red circles) over the Isolation Forest markers.
+            Defaults to True.
+        eval_period (int): The number of recent periods to highlight
+            as the evaluation window. Required; has no default.
+    Logic:
+        - Tree-Based Isolation: Anomalies are identified by having shorter average
+          path lengths across a forest of random trees.
+        - Temporal Context: Uses `add_eval_period_highlight` to shade the recursive
+          testing window, helping users see if anomalies are recent.
+        - Model Markers: Highlights specific Isolation Forest outliers using
+          'mediumorchid' markers.
+    Returns:
+        None: Displays an interactive Plotly figure.
+    """
     # Isolation Forest Model Plot
     try:
         group = group.copy()
@@ -531,8 +845,8 @@ def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, dat
             fig = add_eval_period_highlight(fig, group, date_column, variable, eval_period)
         # Isolation Forest Anomalies
         fig.add_trace(go.Scatter(
-            x=group[group['is_IsolationForest_anomaly_timeseries'] == True][date_column],
-            y=group[group['is_IsolationForest_anomaly_timeseries'] == True][variable],
+            x=group[group['is_IsolationForest_anomaly'] == True][date_column],
+            y=group[group['is_IsolationForest_anomaly'] == True][variable],
             mode='markers',
             marker=dict(color='mediumorchid', symbol='circle', line=dict(width=1), size=7),
             name='Isolation Forest Anomalies',
@@ -544,3 +858,295 @@ def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, dat
         print("\n")
     except Exception as e:
         print(f"Isolation Forest Time Series Anomaly Plot Failed: {e}")
+def anomaly_stacked_bar_plot(df, group_columns, variable, date_column, anomaly_col='is_Anomaly', secondary_line=None):
+    """
+    Plots per-date counts of normal vs. anomalous records as stacked bars, with a
+    per-date average drawn as a line on a secondary y-axis.
+    Args:
+        df (pd.DataFrame): Data holding `date_column`, `anomaly_col`, `group_columns`
+            and the column averaged for the secondary line. It is not modified.
+        group_columns (list): Columns identifying a group; only used to count the
+            distinct groups reported in the title.
+        variable (str): Numeric column averaged per date when the secondary line
+            shows the variable.
+        date_column (str): Column the records are grouped by (x-axis).
+        anomaly_col (str, optional): Flag column; rows equal to True are counted as
+            anomalies, every other row as normal. Defaults to 'is_Anomaly'.
+        secondary_line (str, optional): If None or equal to `variable`, the line is
+            the per-date mean of `variable`; any other value selects the per-date
+            mean of 'Anomaly_Score'. Defaults to None.
+    Returns:
+        None: Displays the interactive Plotly figure. Any failure is caught and
+        printed instead of being raised.
+    """
+    try:
+        # Choose the secondary-axis series first so that only the column which is
+        # actually plotted has to exist in df.
+        if secondary_line is None or secondary_line == variable:
+            line_var = 'variable_mean'
+            line_source = variable
+            var_title = f"Avg {variable if variable == variable.upper() else variable.replace('_', ' ').title()}"
+        else:
+            line_var = 'score_mean'
+            line_source = 'Anomaly_Score'
+            var_title = 'Avg Anomaly Score'
+        # 1. Aggregation
+        # assign() returns a new frame, so the helper columns never leak into the
+        # caller's dataframe.
+        counted = df.assign(
+            normal_val=np.where(df[anomaly_col] != True, 1, 0),
+            anomaly_val=np.where(df[anomaly_col] == True, 1, 0),
+        )
+        # Group by date to get counts across all groups for that specific timestamp
+        agg_df = counted.groupby(date_column).agg(
+            normal_sum=('normal_val', 'sum'),
+            anomaly_sum=('anomaly_val', 'sum'),
+            **{line_var: (line_source, 'mean')},
+        ).reset_index()
+        agg_df['total_sum'] = agg_df['normal_sum'] + agg_df['anomaly_sum']
+        # Calculate percentage (handle division by zero just in case)
+        agg_df['anomaly_pct'] = np.where(agg_df['total_sum'] > 0, (agg_df['anomaly_sum'] / agg_df['total_sum']) * 100, 0)
+        dates = agg_df[date_column].sort_values()
+        min_date = dates.min()
+        max_date = dates.max()
+        if len(dates) > 1:
+            # Calculate the most common time difference to determine the period
+            period = dates.diff().mode().iloc[0]
+        else:
+            period = pd.Timedelta(days=1)
+        # Pad the x-axis by one period on each side so the edge bars are not cut off
+        range_min = min_date - period
+        range_max = max_date + period
+        # 2. Initialize Figure
+        fig = go.Figure()
+        # 3. Add Traces
+        # Bottom Bar: Non-Anomalous (Grey)
+        fig.add_trace(go.Bar(
+            x=agg_df[date_column],
+            y=agg_df['normal_sum'],
+            name='Normal',
+            marker_color='lightgray',
+            customdata=agg_df[['total_sum']],
+            hovertemplate=(
+                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
+                '<b>Normal Records:</b> %{y:,}<br>'
+                '<b>Total Volume:</b> %{customdata[0]:,}<extra></extra>'
+            )
+        ))
+        # Top Bar: Anomalous (Red)
+        fig.add_trace(go.Bar(
+            x=agg_df[date_column],
+            y=agg_df['anomaly_sum'],
+            name='Anomaly',
+            marker_color='crimson',  # Red for anomalies
+            customdata=agg_df[['total_sum', 'anomaly_pct']],
+            hovertemplate=(
+                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
+                '<b>Anomalies:</b> %{y:,}<br>'
+                '<b>Anomaly Rate:</b> %{customdata[1]:.0f}%<extra></extra>'
+            )
+        ))
+        # Line on secondary axis
+        fig.add_trace(go.Scatter(
+            x=agg_df[date_column],
+            y=agg_df[line_var],
+            name=var_title,
+            yaxis='y2',
+            mode='lines',
+            line=dict(width=3, color='darkslategray'),
+            # Label the hover value with the series that is really plotted (a mean
+            # of the variable or of the anomaly score), not a "Total".
+            hovertemplate=(
+                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
+                f'<b>{var_title}:</b> %{{y:,.2f}}<extra></extra>'
+            )
+        ))
+        # 4. Apply Visual Design (same palette and frame as the other plots in this module)
+        fig.update_layout(
+            title=dict(
+                text=f'Anomalies and {var_title} per Group Over Time for {len(df[group_columns].drop_duplicates())} Groups',
+                y=0.96,
+                x=0.5,
+                xanchor='center',
+                yanchor='top',
+                font=dict(size=18, color='black', weight='bold'),
+            ),
+            barmode='stack',
+            height=350,
+            width=1200,
+            margin=dict(l=50, r=100, t=60, b=30),
+            plot_bgcolor='snow',
+            paper_bgcolor='whitesmoke',
+            xaxis=dict(
+                range=[range_min, range_max],
+                showline=True,
+                linewidth=0.5,
+                linecolor='orange',
+                zeroline=False,
+                gridcolor='rgba(255, 165, 0, 0.1)',
+                mirror=True,
+            ),
+            yaxis=dict(
+                # Bars are stacked, so the axis ends exactly at the largest per-date total
+                range=[0, agg_df['total_sum'].max()],
+                showline=True,
+                linewidth=0.5,
+                linecolor='orange',
+                zeroline=False,
+                gridcolor='rgba(255, 165, 0, 0.1)',
+                mirror=True,
+                title=dict(text="Group Count", font=dict(size=16, weight='bold', color='black')),
+                ),
+            yaxis2=dict(
+                title=dict(text=var_title, font=dict(size=14, weight='bold', color='darkslategray')),
+                tickfont=dict(color='darkslategray'),
+                anchor="x",
+                overlaying="y",
+                side="right",
+                showgrid=False, # Usually better to hide grid for 2nd axis to avoid clutter
+                zeroline=False,
+                range=[0, agg_df[line_var].max() * 1.1] # Give it some headroom
+                ),
+            legend=dict(
+                orientation="v",
+                yanchor="top",
+                y=1,
+                xanchor="left",
+                x=1.08,
+            )
+        )
+        fig.show()
+        print("\n")
+    except Exception as e:
+        print(f"Stacked Bar Plot Failed: {e}")
+def summary_pie_plot(summary_df, title="Anomaly Detection Summary"):
+    """
+    Draws a pie chart of record counts per category from a summary table,
+    using the same background and title styling as the other plots here.
+    Args:
+        summary_df (pd.DataFrame): One row per slice, with the slice label in
+            'Records' and the slice size in 'count'. Slices keep the row order
+            (sorting is disabled) and are colored silver, crimson, gold.
+        title (str, optional): Chart title. Defaults to "Anomaly Detection Summary".
+    Returns:
+        None: Displays the interactive Plotly figure. Any failure is caught and
+        printed instead of being raised.
+    """
+    try:
+        slice_colors = ['silver', 'crimson', 'gold']
+        slice_marker = dict(colors=slice_colors, line=dict(color='white', width=2))
+        # Figure first, then the single pie trace
+        fig = go.Figure()
+        pie_trace = go.Pie(
+            labels=summary_df['Records'],
+            values=summary_df['count'],
+            marker=slice_marker,
+            textposition='auto',
+            texttemplate='%{label}<br>%{percent:.0%}',
+            hoverinfo='label+value+percent',
+            sort=False
+        )
+        fig.add_trace(pie_trace)
+        # Layout pieces, named so the update_layout call stays flat
+        title_cfg = dict(
+            text=title,
+            y=0.96,
+            x=0.5,
+            xanchor='center',
+            yanchor='top',
+            font=dict(size=18, color='black', weight='bold'),
+        )
+        legend_cfg = dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.02)
+        margin_cfg = dict(l=50, r=50, t=80, b=30)
+        fig.update_layout(
+            title=title_cfg,
+            height=400,
+            width=600,
+            margin=margin_cfg,
+            plot_bgcolor='snow',
+            paper_bgcolor='whitesmoke',
+            legend=legend_cfg,
+        )
+        fig.show()
+        print("\n")
+    except Exception as e:
+        print(f"Summary Pie Plot Failed: {e}")
+def avg_anomaly_score_plot(df, group_columns, date_column):
+    """
+    Plots the per-date mean of 'Anomaly_Score' across all rows as a single line.
+    Args:
+        df (pd.DataFrame): Data holding `date_column`, `group_columns` and an
+            'Anomaly_Score' column.
+        group_columns (list): Columns identifying a group; only used to count the
+            distinct groups reported in the title.
+        date_column (str): Column the scores are averaged by (x-axis).
+    Returns:
+        None: Displays the interactive Plotly figure. Any failure is caught and
+        printed instead of being raised.
+    """
+    try:
+        plot_title = f"Average Anomaly Scores Over Time for {len(df[group_columns].drop_duplicates())} Groups"
+        fig = go.Figure()
+        agg_df = df.groupby(date_column)['Anomaly_Score'].mean().reset_index()
+        score_min = agg_df['Anomaly_Score'].min()
+        score_max = agg_df['Anomaly_Score'].max()
+        # Pad the y-axis away from the data. A fixed 0.9 / 1.06 factor only widens
+        # the range for non-negative bounds; for a negative bound it would move the
+        # limit inward and clip the line, so the factor is mirrored in that case.
+        y_low = score_min * 0.9 if score_min >= 0 else score_min * 1.1
+        y_high = score_max * 1.06 if score_max >= 0 else score_max * 0.94
+        # Average Anomaly Scores
+        fig.add_trace(go.Scatter(
+            x=agg_df[date_column],
+            y=agg_df['Anomaly_Score'],
+            mode='lines',
+            line=dict(color='seagreen', width=1.5),
+            name='Average Anomaly Score',
+        ))
+        fig.update_layout(
+            title=dict(
+                    text=plot_title,
+                    y=0.96,
+                    x=0.5,
+                    xanchor='center',
+                    yanchor='top',
+                    font=dict(size=18, color='black', weight='bold'),
+                ),
+            height=350,
+            width=1200,
+            margin=dict(l=50, r=50, t=40, b=30),
+            plot_bgcolor='snow',
+            paper_bgcolor='whitesmoke',
+            xaxis=dict(
+                range=[agg_df[date_column].min(), agg_df[date_column].max()],
+                showline=True,
+                linewidth=0.5,
+                linecolor='orange',
+                zeroline=False,
+                gridcolor='rgba(255, 165, 0, 0.1)',
+                mirror=True
+                ),
+            yaxis=dict(
+                range=[y_low, y_high],
+                showline=True,
+                linewidth=0.5,
+                linecolor='orange',
+                zeroline=False,
+                gridcolor='rgba(255, 165, 0, 0.1)',
+                mirror=True
+                ),
+            yaxis_title=dict(
+                text='Average Anomaly Score',
+                font=dict(size=16, weight='bold', color='black')
+                ),
+            legend=dict(
+                orientation="v",
+                yanchor="top",
+                y=1,
+                xanchor="left",
+                x=1.02,
+                )
+            )
+        fig.show()
+        print("\n")
+    except Exception as e:
+        print(f"Anomaly Score Plot Failed: {e}")

anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl

anomaly-pipeline 0.1.27py3-none-any.whl → 0.1.61py3-none-any.whl