anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/percentile.py
CHANGED

@@ -7,7 +7,52 @@ from .Preprocessing import classify
 # Anomaly category columns (optional, keep if you still want string labels)
 
 
-def detect_outliers_percentile(group, variable,date_column,eval_period):
+def detect_outliers_percentile(group, variable, date_column, eval_period):
+
+    """# 📈 PERCENTILE MODEL
+    ---
+
+    The `detect_outliers_percentile` function is a robust anomaly detection tool designed to identify **statistical outliers** in
+    time series or grouped data using a dynamic, **expanding window percentile approach**.
+
+    ## 📋 Functional Overview
+    The function operates by partitioning the data into an initial training set and a subsequent evaluation period. It establishes
+    **"normal" behavior** based on the 5th and 95th percentiles of the available historical data, flagging any value that falls
+    outside these bounds as an anomaly.
+
+    ## 🧠 Core Logic Stages
+
+    ### 1. Data Preparation and Validation
+    > **Minimum Threshold:** The function requires at least **10 data points** to run; otherwise, it returns an empty DataFrame to
+    prevent statistically insignificant results.
+    >
+    > **Copying:** It creates a copy of the input group to ensure the original data remains unaltered during the calculation process.
+
+    ### 2. Initial Training Block
+    * **Static Baseline:** For the first part of the data (everything before the `eval_period`), the function calculates a single
+    static baseline using the 5th and 95th percentiles of the entire training block.
+    * **Classification:** It applies these fixed bounds to the training rows, labeling them using a helper `classify` function and
+    assigning a boolean `is_Percentile_anomaly` flag.
+
+    ### 3. Expanding Window Evaluation
+    * **Sequential Testing:** For each data point in the evaluation period (the last *n* points specified by `eval_period`), the
+    function recalculates the percentiles using **all previously seen data points**.
+    * **Dynamic Adaptation:** As the loop progresses, the "training set" grows. This allows the model to adapt to gradual shifts in
+    the data distribution, as the thresholds for the current point are informed by every point that came before it.
+    * **Real-time Simulation:** By calculating the bounds for point $i$ based only on points $0$ to $i-1$, the function simulates how
+    the model would perform in a live environment.
+
+    ## 📤 Key Output Columns
+    The function appends the following columns to the returned DataFrame:
+    * **`Percentile_low` / `Percentile_high`**: The specific thresholds used to evaluate that row.
+    * **`Percentile_anomaly`**: A categorical label (likely "High," "Low," or "Normal") generated by the external `classify` function.
+    * **`is_Percentile_anomaly`**: A boolean flag indicating whether the value was outside the 5%–95% range.
+
+    ## 💡 Usage Context
+    This function is particularly useful for detecting spikes or drops in metrics where the underlying distribution might **drift
+    slowly over time**. By using percentiles rather than standard deviations, it is more resilient to extreme historical outliers
+    that might otherwise skew a mean-based threshold."""
+
     n = len(group)
     if n < 10:
         # Optional: log specific keys if they exist in your scope
@@ -61,5 +106,3 @@ def detect_outliers_percentile(group, variable,date_column,eval_period):
     group[date_column] = pd.to_datetime(group[date_column])
 
     return group
-
-
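The new docstring describes an expanding-window percentile scheme: fixed 5th/95th bounds for the training block, then per-point bounds recomputed from all prior observations during the evaluation period. A minimal sketch of that logic on a plain numeric series (illustrative only; the shipped function also handles grouping and delegates labeling to the package's `classify` helper, both omitted here):

```python
import numpy as np
import pandas as pd

def expanding_percentile_flags(values: pd.Series, eval_period: int) -> pd.DataFrame:
    """Flag points outside the 5th-95th percentile band, expanding-window style."""
    n = len(values)
    records = []
    # Static baseline: one pair of bounds for the whole training block
    train = values.iloc[: n - eval_period]
    low, high = np.percentile(train, [5, 95])
    for v in train:
        records.append((v, low, high, not (low <= v <= high)))
    # Expanding window: bounds for point i come from points 0..i-1 only
    for i in range(n - eval_period, n):
        low, high = np.percentile(values.iloc[:i], [5, 95])
        v = values.iloc[i]
        records.append((v, low, high, not (low <= v <= high)))
    return pd.DataFrame(records, columns=["value", "Percentile_low",
                                          "Percentile_high", "is_Percentile_anomaly"])

flags = expanding_percentile_flags(pd.Series(range(20)), eval_period=2)
print(flags.tail(3))  # the last two rows were judged against expanding bounds
```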
anomaly_pipeline/main.py
CHANGED
@@ -1,63 +1,182 @@
 from .pipeline import run_pipeline
+import pandas as pd
 
-def timeseries_anomaly_detection(
-
-
-
-
+def timeseries_anomaly_detection(
+    master_data=None,
+    group_columns = None,
+    variable= None,
+    date_column = None,
+    freq="W-MON",
+    min_records=None,
+    max_records =None,
+    contamination=0.03,
+    random_state=42,
+    alpha=0.3,
+    sigma=1.5,
+    eval_period=1,
+    prophet_CI=0.90,
+    mad_threshold=2,
+    mad_scale_factor=0.6745
+):
 
     """
     Performs anomaly detection on grouped time-series data.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Timeseries_anomaly_detection is designed to identify anomalous values on a single column that is time-ordered. The data should have a time component. Currently, we support daily, weekly, and monthly data. Data for missing time units is interpolated. Maximum interpolation is 25% of the series. Combines 8 models (Statistical + ML) to provide a robust Anomaly_Score and a final is_Anomaly consensus. The pipeline utilizes an ensemble of the following methodologies:
+
+    - Statistical: Percentile (5th/95th), Standard Deviation (SD), Median Absolute Deviation (MAD), and Interquartile Range (IQR).
+
+    - Time-Series Specific: EWMA (Exponentially Weighted Moving Average) and FB Prophet (Walk-forward validation).
+
+    - Machine Learning: Isolation Forest (General & Time-series optimized) and DBSCAN.
+
+    For more information, ask for help on each specific model, such as below:
+
+    ```python
+
+    from anomaly_pipeline import help_anomaly
+    help_anomaly('fb')
+
+    ```
+
+    # Mandatory Columns:
+    - master_data: Input DataFrame containing variables, dates, and group identifiers.
+    - group_columns: List of columns used to segment the data (e.g., ['Region', 'Product']).
+    - variable (numeric): The numerical target column to analyze for outliers.
+    - date_column: The datetime column representing the time axis.
+
+    # Default arguments:
+    - freq (str): Frequency of the time series (Pandas offset alias). Defaults to 'W-MON'.
+    - min_records: Minimum history required per group. Default is None; If None, extracts based on freq (1 Year + eval_period). Ex: if freq is weekly and eval_period is 1: min_records = 52+1.
+    - max_records: Maximum history to retain per group. Default is None; if provided, filters for the most recent N records.
+    - contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
+    - random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
+    - alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
+    - sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
+    - eval_period: The number of trailing records in each group to evaluate for anomalies.
+    - prophet_CI (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
 
     Returns:
-        pd.DataFrame
+        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+            - final_results: The main dataframe containing original data, interpolated values,
+              forecasts, residuals, and anomaly flags (e.g., is_FB_anomaly, is_IQR_anomaly).
+            - success_report: A summary table for successful groups showing 'initial_record_count',
+              'interpolated_record_count', and 'interpolation_pct'.
+            - exclusion_report: A diagnostic table listing groups dropped from the analysis
+              and the specific reason (e.g., "Insufficient records" or "High Interpolation").
+
     """
+    # making robust with input parameters
+    if isinstance(group_columns, str):
+        group_columns = [group_columns]
+
+
+
+    # --- 1. MANDATORY PARAMETER VALIDATION ---
+    required_params = {
+        "master_data": master_data,
+        "group_columns": group_columns,
+        "variable": variable,
+        "date_column": date_column
+    }
+
+    missing_params = [name for name, val in required_params.items() if val is None]
 
-
+    if missing_params:
+        print("\n" + "!"*60)
+        print("❌ ERROR: MISSING REQUIRED PARAMETERS")
+        print("The following parameters are required to run the detection:")
+        for param in missing_params:
+            print(f"  - {param}")
+
+        print("\n💡 HINT: Use help(timeseries_anomaly_detection) to see detailed")
+        print("descriptions and expected formats for each parameter.")
+        print("!"*60 + "\n")
+        return  # Exit early
+
+
+    # --- 2. MANDATORY COLUMN VALIDATION ---
+    mandatory_cols = group_columns + [variable, date_column]
+    missing_cols = [col for col in mandatory_cols if col not in master_data.columns]
+
+    if missing_cols:
+        raise ValueError(
+            f"CRITICAL ERROR: Mandatory columns missing from input DataFrame: {missing_cols}. "
+            f"Please ensure group_columns, variable, and date_column are correctly spelled."
+        )
+        return  # Exit early
+
+    # Check if the variable is numeric
+    if not pd.api.types.is_numeric_dtype(master_data[variable]):
+        raise TypeError(f"CRITICAL: The variable '{variable}' must be numeric, but found {master_data[variable].dtype}.")
+
+    # --- 3. EXECUTE PIPELINE ---
+    # Store results in a local variable first
+    final_df, success_report, exclusion_report = run_pipeline(
         master_data=master_data,
         group_columns=group_columns,
         variable=variable,
         date_column=date_column,
         freq=freq,
-        max_records=max_records,
         min_records=min_records,
+        max_records=max_records,
         contamination=contamination,
        random_state=random_state,
        alpha=alpha,
        sigma=sigma,
        eval_period=eval_period,
-
-        mad_threshold
-        mad_scale_factor
-
+        prophet_CI=prophet_CI,
+        mad_threshold=mad_threshold,
+        mad_scale_factor=mad_scale_factor
     )
-
-    print("Anomaly pipeline successfully invoked via python -m!")
 
-
-    #
-    #
-
-
-
+    import inspect
+    # Inside your timeseries_anomaly_detection function:
+    # 1. Get the line of code that called this function
+    frame = inspect.currentframe().f_back
+    call_line = ""
+    if frame and inspect.getframeinfo(frame).code_context:
+        call_line = inspect.getframeinfo(frame).code_context[0].strip()
+
+    # 2. Check if the user assigned the result to variables
+    # We split by the function name and check the part before it (index 0)
+    is_assigned = False
+    if "timeseries_anomaly_detection" in call_line:
+        prefix = call_line.split("timeseries_anomaly_detection")[0]
+        # If there is exactly one '=', it's an assignment
+        if prefix.count("=") == 1:
+            is_assigned = True
+
+    # 3. If NOT assigned, trigger the "Auto-Save" to the global namespace
+    if not is_assigned:
+        from IPython import get_ipython
+        shell = get_ipython()
+        if shell:
+            shell.user_ns['final_results'] = final_df
+            shell.user_ns['success_report'] = success_report
+            shell.user_ns['exclusion_report'] = exclusion_report
 
+            print("\n" + "*"*60)
+            print("🚀 AUTO-SAVE: Variables were not assigned.")
+            print("The outputs have been saved globally for you as:")
+            print("  - final_results, success_report, exclusion_report")
+            print("*"*60 + "\n")
+
+    # 4. Final return logic
+    if is_assigned:
+        # Determine if the user assigned to a single variable or multiple
+        prefix = call_line.split("=")[0].strip()
+
+        # If there's no comma in the assignment prefix, they used a single variable
+        if "," not in prefix:
+            print(f"\n💡 INFO: You assigned the output to a single variable: '{prefix}'")
+            print(f"   This variable is a tuple containing 3 DataFrames. Access them via:")
+            print(f"   1. Results Data: {prefix}[0]")
+            print(f"   2. Success Report: {prefix}[1]")
+            print(f"   3. Exclusion List: {prefix}[2]")
+            print(f"   Or unpack them: final_df, success, exclusion = {prefix}\n")
+
+        return final_df, success_report, exclusion_report
+    else:
+        # Return None so Jupyter doesn't print the "wall of text"
+        return None
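The new signature and docstring above define the package's public entry point and its three-DataFrame return. A hypothetical call against weekly data, with made-up file and column names, might look like this (a sketch, not taken from the package's docs):

```python
import pandas as pd
from anomaly_pipeline import timeseries_anomaly_detection

# Hypothetical input: one row per region/product/week with a numeric measure
df = pd.read_csv("weekly_sales.csv")

final_results, success_report, exclusion_report = timeseries_anomaly_detection(
    master_data=df,
    group_columns=["Region", "Product"],  # example grouping from the docstring
    variable="units_sold",                # hypothetical numeric target column
    date_column="week_start",             # hypothetical datetime column
    freq="W-MON",
    eval_period=4,
)
print(exclusion_report)  # groups that were dropped, and why
```

Note that the assignment matters: per the `inspect`-based logic above, calling the function in an IPython session without assigning the result stores `final_results`, `success_report`, and `exclusion_report` directly into the user namespace and returns `None`.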
anomaly_pipeline/pipeline.py
CHANGED
@@ -11,10 +11,21 @@ from .helpers.ewma import ewma_with_anomalies_rolling_group
 from .helpers.fb_prophet import detect_time_series_anomalies_fb_walkforward
 from .helpers.iso_forest_timeseries import detect_time_series_anomalies_isoforest
 from .helpers.DB_scan import detect_time_series_anomalies_dbscan
-from .helpers.Preprocessing import create_full_calendar_and_interpolate,
+from .helpers.Preprocessing import (create_full_calendar_and_interpolate,
+                                    print_anomaly_stats,
+                                    calculate_ensemble_scores,
+                                    min_records_extraction)
+
+from .helpers.evaluation_plots import (summary_pie_plot,
+                                       anomaly_stacked_bar_plot,
+                                       avg_anomaly_score_plot,
+                                       anomaly_overview_plot)
+
+from IPython.display import display, Markdown
+
 
 def process_group(model, name, group, group_columns, variable,
-                  date_column, alpha, sigma, eval_period,
+                  date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state):
 
     if model == "ISF_general":
         return detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
@@ -26,7 +37,7 @@ def process_group(model, name, group, group_columns, variable,
 
     if model == "FB":
         return detect_time_series_anomalies_fb_walkforward(
-            group, variable, date_column, eval_period,
+            group, variable, date_column, eval_period, prophet_CI
         )
 
     if model == 'ISF_timeseries':
@@ -41,19 +52,28 @@ def process_group(model, name, group, group_columns, variable,
 
 
 def run_pipeline(master_data, group_columns, variable,
-                 date_column, freq,
-                 max_records, min_records,
+                 date_column, freq, min_records,max_records,
                  contamination, random_state,
                  alpha, sigma, eval_period,
-
+                 prophet_CI, mad_threshold, mad_scale_factor):
+
+    if min_records is None:
+        min_records = min_records_extraction(freq,eval_period)
+        print(f"Min records needed to run an anomaly pipeline for a group is {min_records}")
+
+    if max_records is not None:
+        max_records = max_records + eval_period
+        print(f"Max records used to run an anomaly pipeline for a group is {max_records}")
 
     # preprocess calendar
-    final_data = create_full_calendar_and_interpolate(
+    final_data, success_report, exclusion_report = create_full_calendar_and_interpolate(
        master_data,
        group_columns,
        variable,
        date_column,
-       freq
+       freq,
+       min_records,
+       max_records
    )
 
    groups = list(final_data.groupby(group_columns))
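`min_records_extraction` lives in `Preprocessing` and its body is not part of this diff; going by the `main.py` docstring ("1 Year + eval_period", e.g. `min_records = 52+1` for weekly data), its behavior is presumably along these lines. This is a guess for orientation, not the packaged code:

```python
def min_records_extraction(freq: str, eval_period: int) -> int:
    # Hypothetical reconstruction: roughly one year of records for the
    # given pandas offset alias, plus the evaluation period.
    periods_per_year = {"D": 365, "W": 52, "M": 12}  # daily, weekly, monthly
    for prefix, per_year in periods_per_year.items():
        if freq.upper().startswith(prefix):
            return per_year + eval_period
    return 52 + eval_period  # default to the weekly assumption

print(min_records_extraction("W-MON", 1))  # 53, matching the docstring example
```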
@@ -113,7 +133,7 @@ def run_pipeline(master_data, group_columns, variable,
 
 
     ## ISF_general
-    results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period,
+    results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
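This hunk, and the matching EWMA, FB Prophet, Isolation Forest, and DBSCAN hunks below, all thread `prophet_CI`, `contamination`, and `random_state` through the same joblib fan-out. The pattern in isolation, with a stand-in worker in place of the real `process_group`:

```python
from joblib import Parallel, delayed

def process_group(model, name, group):
    # Stand-in worker: the real dispatcher runs one detector per group
    return (model, name, sum(group))

groups = [("east", [1, 2, 3]), ("west", [4, 5])]

# One task per group; n_jobs=-1 spreads them across all available cores
results = Parallel(n_jobs=-1, verbose=0)(
    delayed(process_group)("EWMA", name, group) for name, group in groups
)
print(results)  # [('EWMA', 'east', 6), ('EWMA', 'west', 9)]
```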
@@ -127,7 +147,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## EWMA
     results_EWMA = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('EWMA', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -138,15 +158,14 @@ def run_pipeline(master_data, group_columns, variable,
     #print("anomaly_key_channel_EWMA data frame created")
     #print(anomaly_key_channel_EWMA.head())
     EWMA_cols = group_columns+[date_column]+['alpha', 'sigma', 'EWMA_forecast',
-                                             'STD', 'EWMA_high', 'EWMA_low','is_EWMA_anomaly']
+                                             'STD', 'EWMA_high', 'EWMA_low',"EWMA_residual", "EWMA_anomaly",'is_EWMA_anomaly']
 
     anomaly_key_channel_EWMA_final = anomaly_key_channel_EWMA[EWMA_cols]
 
 
-
-
+    ## FB
     results_fb = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('FB', name, group,group_columns, variable,date_column,
-                                                alpha, sigma, eval_period,
+                                                alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -166,7 +185,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## Isolation Forest timeseries
     results_ISF_timeseries = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('ISF_timeseries', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -175,7 +194,7 @@ def run_pipeline(master_data, group_columns, variable,
         .sort_values(by=group_columns+[date_column])
     )
     #print(anomaly_key_channel_ISF_timeseries.head())
-    ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
     anomaly_key_channel_ISF_timeseries_final = anomaly_key_channel_ISF_timeseries[ISF_cols]
 
     #print("anomaly_key_channel_ISF_timeseries data frame created")
@@ -184,7 +203,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## DB Scan
     results_DB = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('DBSCAN', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
 
     # Combine results back
     anomaly_key_channel_DB= (
@@ -196,7 +215,7 @@ def run_pipeline(master_data, group_columns, variable,
     #print("anomaly_key_channel_DB data frame created")
     #print(anomaly_key_channel_DB.head())
 
-    DB_cols = group_columns+[date_column]+["dbscan_score", "is_DBSCAN_anomaly"]
+    DB_cols = group_columns+[date_column]+["dbscan_score", "dbscan_score_high", "is_DBSCAN_anomaly"]
     anomaly_key_channel_DB_final = anomaly_key_channel_DB[DB_cols]
 
     # combine ISF general and timeseries data frames
@@ -209,12 +228,16 @@ def run_pipeline(master_data, group_columns, variable,
                                                             anomaly_key_channel_ISF['IsolationForest_score_general'],
                                                             anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
 
+    anomaly_key_channel_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                                    anomaly_key_channel_ISF['IsolationForest_score_low_general'],
+                                                                    anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
+
     # Column 2 Logic: If 'type' is train, take from 'IsolationForest_general', else take from 'IsolationForest_timeseries'
     anomaly_key_channel_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
 
-    ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'is_IsolationForest_anomaly']
+    ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'IsolationForest_score_low', 'is_IsolationForest_anomaly']
     anomaly_key_channel_ISF_final = anomaly_key_channel_ISF[ISF_cols]
 
 
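The stitching added here picks the general-model score for TRAIN rows and the time-series score for evaluation rows. On toy data the `np.where` pattern reduces to:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "set": ["TRAIN", "TRAIN", "EVAL"],
    "score_general": [0.10, 0.20, 0.30],
    "score_timeseries": [0.70, 0.80, 0.90],
})
# Row-wise selection: general score on TRAIN rows, time-series score elsewhere
df["IsolationForest_score"] = np.where(df["set"] == "TRAIN",
                                       df["score_general"],
                                       df["score_timeseries"])
print(df["IsolationForest_score"].tolist())  # [0.1, 0.2, 0.9]
```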
@@ -231,23 +254,72 @@ def run_pipeline(master_data, group_columns, variable,
     anomaly = anomaly.merge(anomaly_key_channel_fb_final, on= group_columns+[date_column], how= 'inner')
     anomaly = anomaly.merge(anomaly_key_channel_ISF_final, on= group_columns+[date_column], how= 'inner')
     anomaly = anomaly.merge(anomaly_key_channel_DB_final, on= group_columns+[date_column], how= 'inner')
+    anomaly_final = calculate_ensemble_scores(anomaly, variable)
+    globals()['anomaly_df'] = anomaly_final
+    #print(anomaly_final.head())
+    #print(f"Successfully processed {len(success_report)} groups.")
+    #print(f"Excluded {len(exclusion_report)} groups due to low quality.")
+
+    print_anomaly_stats(anomaly_final, success_report, exclusion_report,group_columns,interpolation_method="linear")
 
-
-
-
-
-
+    # Plot summary charts
+    # ------------------------------
+
+    # Get data for pie chart
+    pie_chart_df = anomaly_final['is_Anomaly'].value_counts().reset_index()
+    pie_chart_df['is_Anomaly'] = np.where(pie_chart_df['is_Anomaly'] == True, 'Anomalous Records', 'Evaluated Records')
+    pie_chart_df = pie_chart_df.rename(columns={'is_Anomaly': 'Records'})
+    if len(exclusion_report) > 0:
+        pie_chart_df = pd.concat([pie_chart_df,
+                                  pd.DataFrame({'Records': ['Dropped Records'], 'count': [exclusion_report['dropped_records'].sum()]})])
+        exclusion_report = exclusion_report.drop(columns='dropped_records')
+    print("")
+    summary_pie_plot(pie_chart_df, title=f"Anomaly Detection Summary for {len(master_data[group_columns].drop_duplicates())} Groups")
+    anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line=variable)
+    anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line='Anomaly_Score')
+    avg_anomaly_score_plot(anomaly_final, group_columns, date_column)
+
+    top_5_anomaly_groups = anomaly_final.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum', 'count']).reset_index()\
+                                        .sort_values('mean', ascending=False).reset_index(drop=True).head(5)
 
-
-
-
-
-
+    eval_plots_msg = f"""
+---
+### Overall Evaluation Plots of the {len(top_5_anomaly_groups)} Groups with the Highest Anomaly Rates
+
+Here is how to view detailed plots of individual anomaly detection models per group.\n
+Start with the main (first) DataFrame returned from the timeseries_anomaly_detection function.\n
+Suppose you called that DataFrame anomaly_df, that the group_columns are 'taxonomy' and 'channel', and that you want to see all the plots for the group where 'taxonomy' = 'tools' and 'channel' = 'mobile'.
+Then you could run this code block:\n
+
+```python
+from anomaly_pipeline import evaluation_info
+from anomaly_pipeline import help_anomaly
+
+group_values = ['tools', 'mobile']
+mask = anomaly_df[group_columns].eq(group_values).all(axis=1)
+group_df = anomaly_df[mask]
+
+evaluation_info(
+    group_df,
+    group_columns,
+    variable,
+    date_column,
+    eval_period)
+```
+---
+"""
+
+    display(Markdown(eval_plots_msg))
 
+    group_nbr = 1
+    for group_key, group in top_5_anomaly_groups.groupby(group_columns, sort=False):
+        anomaly_rate = group['mean'].values[0]
+        group_df = anomaly_final.merge(group[group_columns], on=group_columns, how='inner')
+        group_id = group_df[group_columns].drop_duplicates().astype(str).apply(lambda x: ' -- '.join(x), axis=1).values[0]
+        group_msg = f"""#### #{group_nbr}, Anomaly Rate: {anomaly_rate:.1%}, Group: {group_id}"""
+        display(Markdown(group_msg))
+        anomaly_overview_plot(group_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)
+        group_nbr += 1
 
-
+    return anomaly_final, success_report, exclusion_report
 
-    return anomaly