anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/{help_info.py → help_anomaly.py}

@@ -14,21 +14,24 @@ from .ewma import ewma_with_anomalies_rolling_group
 from .fb_prophet import detect_time_series_anomalies_fb_walkforward
 from .iso_forest_timeseries import detect_time_series_anomalies_isoforest
 from .DB_scan import detect_time_series_anomalies_dbscan
-from .Preprocessing import create_full_calendar_and_interpolate,
-
+from .Preprocessing import (create_full_calendar_and_interpolate,
+                            print_anomaly_stats,
+                            calculate_ensemble_scores)
+from .evaluation_plots import (anomaly_overview_plot,
+                               anomaly_percentile_plot,
+                               anomaly_sd_plot,
+                               anomaly_mad_plot,
+                               anomaly_iqr_plot,
+                               anomaly_ewma_plot,
+                               anomaly_fb_plot,
+                               anomaly_dbscan_plot,
+                               anomaly_isolation_forest_plot)
+
+

-group_columns=["key", "channel"]
-variable="views"
-eval_period = 12
-date_column = "week_start"
-mad_threshold = 2
-mad_scale_factor = 0.6745
-alpha=.3
-sigma=1.5
-interval_width = .95
-freq = 'W-MON'

-
+
+def help_anomaly(topic=None):

     #example_df = get_example_df()

@@ -50,6 +53,23 @@ def help_info(topic=None):
         help_sd()
     elif topic.lower()[:3] == 'mad':
         help_mad()
+
+group_columns=["key", "channel"]
+variable="views"
+eval_period = 1
+date_column = "week_start"
+min_records = 52
+max_records = 156
+mad_threshold = 2
+mad_scale_factor = 0.6745
+alpha=.3
+sigma=1.5
+prophet_CI = .95
+freq = 'W-MON'
+contamination = 0.03
+random_state =42
+
+"""


 def get_example_df():
@@ -78,8 +98,8 @@ def get_example_df():
         'views': views})


-    example_df = create_full_calendar_and_interpolate(example_df,group_columns, variable, date_column, freq)
-
+    example_df = create_full_calendar_and_interpolate(example_df, group_columns, variable, date_column, freq, min_records, max_records)[0]
+
     logging.getLogger('fbprophet').setLevel(logging.ERROR)
     logging.getLogger('cmdstanpy').disabled = True

@@ -95,8 +115,27 @@ def get_example_df():
     df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
     df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
     df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
-    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,
-
+    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,prophet_CI)
+    df_isofor = detect_time_series_anomalies_isoforest(example_df,variable, date_column, eval_period)
+    ISF_timeseries_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    df_isofor_final= df_isofor[ISF_timeseries_cols]
+    df_isogen = detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
+    # combine ISF general and timeseries data frames
+    df_ISF= df_isogen.merge(df_isofor_final, on= group_columns+[date_column], how= 'inner')
+
+
+    # Column 1 Logic: If 'type' is train, take from 'col_A', else take from 'col_B'
+    df_ISF['IsolationForest_score'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                               anomaly_key_channel_ISF['IsolationForest_score_general'],
+                                               anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
+
+    df_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN', anomaly_key_channel_ISF['IsolationForest_score_low_general'],anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
+
+    # Column 2 Logic: If 'type' is train, take from 'IsolationForest_general', else take from 'IsolationForest_timeseries'
+    df_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                    anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
+                                                    anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
+
     df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)

     orig_columns = example_df.columns.to_list()
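The hunk above merges the batch ("general") and walk-forward ("timeseries") Isolation Forest outputs and then picks one score per row based on the `set` column. A minimal sketch of that `np.where` selection pattern, on a made-up two-row frame (column names follow the hunk; the numbers are illustrative):

```python
import numpy as np
import pandas as pd

# Illustrative frame: one TRAIN row and one TEST row with made-up scores.
df_ISF = pd.DataFrame({
    "set": ["TRAIN", "TEST"],
    "IsolationForest_score_general": [0.12, 0.05],
    "IsolationForest_score_timeseries": [0.08, -0.02],
})

# TRAIN rows keep the batch score, TEST rows keep the walk-forward score.
df_ISF["IsolationForest_score"] = np.where(
    df_ISF["set"] == "TRAIN",
    df_ISF["IsolationForest_score_general"],
    df_ISF["IsolationForest_score_timeseries"],
)
print(df_ISF)
```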
@@ -108,120 +147,186 @@ def get_example_df():
         df_std.drop(columns=orig_columns, errors='ignore'),
         df_ewma.drop(columns=orig_columns, errors='ignore'),
         df_fb.drop(columns=orig_columns, errors='ignore'),
-
+        df_ISF.drop(columns=orig_columns, errors='ignore'),
         df_dbscan.drop(columns=orig_columns, errors='ignore')
     ], axis=1)

-
-    # example_df['Percentile_score_scaled'] = np.where(example_df['is_Percentile_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['Percentile_high'] + example_df['Percentile_low'])/2)/\
-    # ((example_df['Percentile_high'] - example_df['Percentile_low'])/2) - 1, np.nan)
-
-    # example_df['SD_score_scaled'] = np.where(example_df['is_SD_anomaly'].isna()==False,
-    # abs(example_df[variable] - (example_df['SD2_high'] + example_df['SD2_low'])/2)/\
-    # ((example_df['SD2_high'] - example_df['SD2_low'])/2) - 1, np.nan)
-
-    # example_df['MAD_score_scaled'] = np.where(example_df['is_MAD_anomaly'].isna()==False,
-    # abs(example_df[variable] - (example_df['MAD_high'] + example_df['MAD_low'])/2)/\
-    # ((example_df['MAD_high'] - example_df['MAD_low'])/2) - 1, np.nan)
-
-    # example_df['IQR_score_scaled'] = np.where(example_df['is_IQR_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['IQR_high'] + example_df['IQR_low'])/2)/\
-    # ((example_df['IQR_high'] - example_df['IQR_low'])/2) - 1, np.nan)
-
-    # example_df['EWMA_score_scaled'] = np.where(example_df['is_EWMA_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['EWMA_high'] + example_df['EWMA_low'])/2)/\
-    # ((example_df['EWMA_high'] - example_df['EWMA_low'])/2) - 1, np.nan)
-
-    # example_df['FB_score_scaled'] = np.where(example_df['is_FB_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['FB_high'] + example_df['FB_low'])/2)/\
-    # ((example_df['FB_high'] - example_df['FB_low'])/2) - 1, np.nan)
-
-    # score_scaled_cols = []
-    # for col in example_df.columns.to_list():
-    #     if col.endswith('_scaled'):
-    #         score_scaled_cols.append(col)
-
-    # example_df['Anomaly_Score'] = example_df[score_scaled_cols].mean(axis=1)
-
-    # example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
-    #     np.where(example_df['Anomaly_Score'].between(0, 1), np.ceil(100*example_df['Anomaly_Score']),
-    #     np.where(example_df['Anomaly_Score'] > 1, 100, 0)))
-
-    is_anom_cols = []
-    for col in example_df.columns.to_list():
-        if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_Anomaly':
-            is_anom_cols.append(col)
-
-    example_df['Anomaly_Votes'] = example_df[is_anom_cols].sum(axis=1).astype(float)
-    example_df['is_Anomaly'] = np.where(example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(False, True).sum(axis=1) >= 0.5, True, False)
-    example_df['Anomaly_Score'] = 2 * (example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(True, 1).replace(False, 1).sum(axis=1) - 0.5).astype(float)
-    example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
-        np.where(example_df['Anomaly_Score'] > 0, np.ceil(100*example_df['Anomaly_Score']), 1)).astype(float)
+    example_df = calculate_ensemble_scores(example_df, 'views')

+    globals()['anomaly_example_df'] = example_df
     return example_df
+"""
+
+def get_example_df():
+    """
+    Generates a sample dataset and runs all 8 models to demonstrate
+    the anomaly-pipeline functionality.
+    """
+
+    # 1. Create dummy time-series data
+    views = [
+        223006, 145101, 136508, 119284, 151332, 169419, 158795, 163725, 161911, 153131,
+        178292, 188910, 192736, 165486, 157370, 151250, 151699, 144465, 167651, 185210,
+        172594, 176735, 158885, 140992, 184203, 235889, 203074, 203714, 162486, 227249,
+        243952, 241711, 213386, 183171, 176070, 185944, 191282, 180852, 219299, 271454,
+        216265, 150586, 123755, 126039, 117597, 103758, 133977, 144088, 143186, 247731,
+        267901, 289105, 378025, 221419, 119153, 117262, 135635, 157462, 158551, 162637,
+        157246, 144626, 129089, 153280, 145880, 130291, 114119, 112931, 110593, 120172,
+        185307, 213343, 164825, 153140, 127525, 128465, 180317, 232471, 229766, 129962,
+        98732, 181722, 198247, 222167, 175792, 131070, 154662, 158707, 152083, 151097,
+        194114, 230775, 195828, 150668, 119488, 118110, 165357, 150681, 151303, 137414,
+        126470, 223347, 222285, 244610, 277318
+    ]
+
+    example_df = pd.DataFrame({
+        'key': ['PLP>appliances>refrigerators'] * len(views),
+        'channel': ['raw_desktop_views'] * len(views),
+        'week_start': pd.date_range(start='2023-11-27', periods=len(views), freq=freq),
+        'views': views
+    })
+
+    # 2. Preprocessing
+    # Assuming create_full_calendar_and_interpolate returns a tuple (df, success, exclusion)
+    example_df = create_full_calendar_and_interpolate(
+        example_df, group_columns, variable, date_column, freq, min_records, max_records
+    )[0]
+
+    # Silence Prophet/CmdStanPy noise
+    logging.getLogger('fbprophet').setLevel(logging.ERROR)
+    logging.getLogger('cmdstanpy').disabled = True
+
+    # 3. Individual Model Detections
+    df_percentile = detect_outliers_percentile(example_df, variable, date_column, eval_period)
+    df_iqr = detect_outliers_iqr(example_df, variable, date_column, eval_period)
+    df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
+    df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
+    df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
+    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period, prophet_CI)
+    df_isofor = detect_time_series_anomalies_isoforest(example_df, variable, date_column, eval_period)
+
+    # 4. Handle Isolation Forest Logic (Consolidating General + Time-series)
+    df_isogen = detect_outliers_isf_general(example_df, variable, contamination, random_state, eval_period)
+
+    ISF_ts_cols = group_columns + [date_column] + ["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    df_isofor_sub = df_isofor[ISF_ts_cols]
+
+    df_ISF = df_isogen.merge(df_isofor_sub, on=group_columns + [date_column], how='inner')
+
+    # Logical mapping for ISF Ensemble
+    df_ISF['IsolationForest_score'] = np.where(df_ISF['set'] == 'TRAIN',
+                                               df_ISF['IsolationForest_score_general'],
+                                               df_ISF['IsolationForest_score_timeseries'])

+    df_ISF['IsolationForest_score_low'] = np.where(df_ISF['set'] == 'TRAIN',
+                                                   df_ISF['IsolationForest_score_low_general'],
+                                                   df_ISF['IsolationForest_score_low_timeseries'])
+
+    df_ISF['is_IsolationForest_anomaly'] = np.where(df_ISF['set'] == 'TRAIN',
+                                                    df_ISF['is_IsolationForest_anomaly_general'],
+                                                    df_ISF['is_IsolationForest_anomaly_timeseries'])
+
+    # 5. Final Model (DBSCAN)
+    df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)
+
+    # 6. Concatenate Results
+    # Identify non-original columns to avoid duplicates during join
+    orig_cols = example_df.columns.to_list()
+
+    combined_df = pd.concat([
+        example_df,
+        df_percentile.drop(columns=orig_cols, errors='ignore'),
+        df_iqr.drop(columns=orig_cols, errors='ignore'),
+        df_mad.drop(columns=orig_cols, errors='ignore'),
+        df_std.drop(columns=orig_cols, errors='ignore'),
+        df_ewma.drop(columns=orig_cols, errors='ignore'),
+        df_fb.drop(columns=orig_cols, errors='ignore'),
+        df_ISF.drop(columns=orig_cols, errors='ignore'),
+        df_dbscan.drop(columns=orig_cols, errors='ignore')
+    ], axis=1)
+
+    # 7. Calculate Final Ensemble Scores
+    final_example_df = calculate_ensemble_scores(combined_df, variable)
+
+    # Optional: assign to a global variable for notebook access
+    globals()['anomaly_example_df'] = final_example_df
+
+    return final_example_df

 def help_overview():
     display(Markdown(overview_msg))
     example_df = get_example_df()
     display(example_df[['key', 'channel', 'week_start', 'views']].tail(12))
     display(Markdown(overview_msg2))
-
+    anomaly_overview_plot(example_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)


 def help_percentile():
     display(Markdown(percentile_msg))
     example_df = get_example_df()
-    anomaly_percentile_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_percentile_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_sd():
+    display(Markdown(sd_msg))
+    example_df = get_example_df()
+    anomaly_sd_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_mad():
+    display(Markdown(mad_msg))
+    example_df = get_example_df()
+    anomaly_mad_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_iqr():
     display(Markdown(iqr_msg))
     example_df = get_example_df()
-    anomaly_iqr_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_iqr_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


-def help_mad():
-    display(Markdown(mad_msg))
-    example_df = get_example_df()
-    anomaly_mad_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-
-
-def help_sd():
-    display(Markdown(sd_msg))
-    example_df = get_example_df()
-    anomaly_sd_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)

-
 def help_ewma():
     display(Markdown(ewma_msg))
     example_df = get_example_df()
-    anomaly_ewma_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_ewma_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_fb():
     display(Markdown(fb_msg))
     example_df = get_example_df()
-    anomaly_fb_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_fb_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_isofor():
+    display(Markdown(isofor_msg))
+    example_df = get_example_df()
+    anomaly_isolation_forest_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_dbscan():
     display(Markdown(dbscan_msg))
     example_df = get_example_df()
-    anomaly_dbscan_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_dbscan_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


-def help_isofor():
-    display(Markdown(isofor_msg))
-    example_df = get_example_df()
-    anomaly_isolation_forest_timeseries_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-

 overview_msg = """
 # 🏗️ The Anomaly Detection Function
 ---

+FYI, you can see information about specific models used in the anomaly pipeline with any of the following commands:
+
+
+```python
+help_anomaly('percentile')
+help_anomaly('iqr')
+help_anomaly('mad')
+help_anomaly('std')
+help_anomaly('ewma')
+help_anomaly('prophet')
+help_anomaly('dbscan')
+help_anomaly('iso')    # For information on isolation forest
+```
+
+---
+
 The `run_pipeline` function handles end-to-end processing — from data cleaning and interpolation to executing multiple machine learning models in parallel and aggregating their results into a final "Consensus" anomaly flag.

 ## 📋 Functional Overview
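The removed block above produced the consensus flag by majority vote; the new code delegates that to `calculate_ensemble_scores`, which is not itself shown in this diff. A standalone sketch of the voting arithmetic visible in the removed lines (illustrative flags, not the package's function):

```python
import pandas as pd

# Illustrative vote matrix: each column is one detector's boolean flag per row.
flags = pd.DataFrame({
    "is_MAD_anomaly":  [True,  False, True],
    "is_SD_anomaly":   [True,  False, False],
    "is_EWMA_anomaly": [False, False, True],
    "is_FB_anomaly":   [True,  False, True],
})

votes = flags.sum(axis=1)          # number of detectors that flagged the row
n_models = flags.shape[1]          # detectors that voted
fraction = votes / n_models

is_anomaly = fraction >= 0.5       # simple majority
score = 2 * (fraction - 0.5)       # rescale [0, 1] -> [-1, 1]
print(pd.DataFrame({"votes": votes, "is_anomaly": is_anomaly, "score": score}))
```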
@@ -229,7 +334,7 @@ The pipeline takes raw master data, partitions it into groups by unique ID, appl

 The master data DataFrame that you pass into the anomaly detection pipeline needs to have at least 3 columns - unique ID, date, and a target variable. The unique ID can be defined by multiple columns.

-Here is an example of a DataFrame that has two columns that comprise the unique ID
+Here is an example of a DataFrame that has two columns that comprise the unique ID `['key', 'channel']`, `week_start` is the date column, and `views` is the target variable:"""


 overview_msg2 = """
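For reference, a minimal frame with the shape the overview describes; the values are borrowed from the example data earlier in this diff, and `run_pipeline` itself is not reproduced here:

```python
import pandas as pd

# Hypothetical master data: ['key', 'channel'] form the unique ID,
# 'week_start' is the date column, 'views' is the target variable.
master_df = pd.DataFrame({
    "key":        ["PLP>appliances>refrigerators"] * 3,
    "channel":    ["raw_desktop_views"] * 3,
    "week_start": pd.date_range("2023-11-27", periods=3, freq="W-MON"),
    "views":      [223006, 145101, 136508],
})
print(master_df.dtypes)
```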
@@ -272,7 +377,7 @@ Use `run_pipeline` when you need a **highly reliable, automated output**. By com
 | :--- | :--- | :--- |
 | **`eval_period`** | `12` | The number of recent weeks to evaluate for anomalies. |
 | **`alpha` / `sigma`** | `0.3` / `1.5` | Sensitivity settings for the EWMA model. |
-| **`
+| **`prophet_CI`** | `0.90` | The confidence interval for the Prophet (FB) model. |
 | **`n_jobs`** | `-1` | Utilizes all available processor cores for parallelization. |


@@ -400,7 +505,7 @@ Unlike standard batch forecasting, this function operates by simulating a real-w
 * **Robust Error Handling:** If the Prophet fit fails, the function falls back to a **baseline persistence model** (last observed value) to prevent pipeline failure.

 ### 3. Anomaly Classification
-* **Uncertainty Bounds:** Anomalies are defined by the `
+* **Uncertainty Bounds:** Anomalies are defined by the `prophet_CI` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
 * **Residual Calculation:** The function computes the **FB_residual** (Actual - Forecast) to quantify the magnitude of deviations.

 ## 📤 Key Output Columns
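A small sketch of the classification rule described above: flag any observation outside the Prophet interval and keep the residual for magnitude. The bound column names follow the help text (`FB_low`, `FB_high`, `FB_residual`); `FB_fcst` and the numbers are made up for illustration:

```python
import pandas as pd

# Illustrative frame with observed values and hypothetical Prophet forecast bounds.
df = pd.DataFrame({
    "views":   [150_000, 240_000, 120_000],
    "FB_fcst": [155_000, 160_000, 125_000],
    "FB_low":  [130_000, 135_000, 100_000],
    "FB_high": [180_000, 185_000, 150_000],
})

df["FB_residual"] = df["views"] - df["FB_fcst"]                       # Actual - Forecast
df["is_FB_anomaly"] = (df["views"] < df["FB_low"]) | (df["views"] > df["FB_high"])
print(df)
```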
anomaly_pipeline/helpers/iso_forest_general.py

@@ -1,7 +1,7 @@
 import pandas as pd
 from sklearn.ensemble import IsolationForest

-def detect_outliers_isf_general(group, variable, contamination
+def detect_outliers_isf_general(group, variable, contamination, random_state, eval_period):
     n = len(group)
     if n < 10:
         return pd.DataFrame(columns=group.columns)
@@ -12,6 +12,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
     # Initialize columns
     group['set'] = ""
     group['IsolationForest_score_general'] = 0.0
+    group['IsolationForest_score_low_general'] = 0.0
     group['is_IsolationForest_anomaly_general'] = False

     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
@@ -19,11 +20,11 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
     initial_train = group[[variable]].iloc[:train_size]

     iso = IsolationForest(contamination=contamination, random_state=random_state)
+    iso.fit(initial_train)

-    # Fit and predict the initial block
-    group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.fit_predict(initial_train) # Note: this is actually the cluster label
     # We use decision_function for the raw anomaly score
     group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.decision_function(initial_train)
+    group.loc[group.index[:train_size], 'IsolationForest_score_low_general'] = iso.offset_
     group.loc[group.index[:train_size], 'is_IsolationForest_anomaly_general'] = iso.predict(initial_train) == -1
     group.loc[group.index[:train_size], 'set'] = "TRAIN"

@@ -41,6 +42,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
         current_point = group[[variable]].iloc[[i]]

         group.iloc[i, group.columns.get_loc('IsolationForest_score_general')] = iso_expanding.decision_function(current_point)[0]
+        group.iloc[i, group.columns.get_loc('IsolationForest_score_low_general')] = iso_expanding.offset_
         group.iloc[i, group.columns.get_loc('is_IsolationForest_anomaly_general')] = iso_expanding.predict(current_point)[0] == -1
         group.iloc[i, group.columns.get_loc('set')] = "TEST"

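Both hunks above record `decision_function` output as the general score and `offset_` as its companion lower bound. A small standalone sketch of how those scikit-learn attributes relate on toy data (not the package's code):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.normal(loc=0.0, scale=1.0, size=(200, 1))    # toy 1-D training data

iso = IsolationForest(contamination=0.03, random_state=42)
iso.fit(X)

scores = iso.decision_function(X)    # equals score_samples(X) - iso.offset_
flags = iso.predict(X)               # -1 = anomaly, 1 = normal

# In scikit-learn these are consistent: predict == -1 exactly where the
# shifted score is negative.
assert np.array_equal(flags == -1, scores < 0)
print("offset_:", iso.offset_, "anomalies:", int((flags == -1).sum()))
```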
anomaly_pipeline/helpers/iso_forest_timeseries.py

@@ -35,16 +35,147 @@ def detect_time_series_anomalies_isoforest(
     eval_period,
 ):

+    """
+    # 🌲 Isolation Forest Time-Series Anomaly Detection
+    ---
+
+    The `detect_time_series_anomalies_isoforest` function implements an **unsupervised machine learning** approach to outlier detection.
+    Unlike traditional statistical models that define "normal" regions, this model explicitly identifies anomalies by **isolating** them in a high-dimensional feature space.
+
+    ## 📋 Functional Overview
+    This function utilizes a **walk-forward validation** strategy. For the initial training period, all points are evaluated using
+    Isolation Forest fitted on the same training data. For every evaluation point in the test period, it dynamically engineers a unique feature set,
+    fits a forest of decision trees, and determines if the current observation is an outlier based on how easily it can be isolated from historical data.
+
+    ## 🧠 Core Logic & Helper Utilities
+
+    ### 1. Dynamic Feature Engineering (`get_dynamic_lags`)
+    To capture the temporal structure of the data, the model doesn't just look at the raw value; it looks at the **context**.
+    * **Autocorrelation (ACF):** The function calculates the **10 most significant lags** based on the data's historical patterns.
+    * **Momentum:** It always includes lags 1, 2, and 3 to ensure immediate short-term trends are captured.
+    * **Rolling Statistics:** It automatically calculates **rolling means** and **standard deviations** at multiple scales (quarter-lag, half-lag, and full-lag intervals).
+
+    ### 2. Isolation Forest Model Configuration
+    The model builds **200 trees** (`n_estimators`) to ensure a stable anomaly score.
+    * **Contamination:** A baseline assumption that **1%** of the data is inherently noisy.
+    * **Decision Function:** The model calculates an anomaly score where lower, more negative values indicate a higher likelihood of being an outlier.
+
+    ### 3. Dual-Threshold Validation
+    To reduce "false positives," the function uses two layers of verification:
+    1. **Contamination Anomaly:** The standard output from the sklearn model based on the 1% threshold.
+    2. **Statistical Threshold:** A custom "safety" bound calculated as:
+    > $$Mean(Positive Scores) - 3 \\times Std(Positive Scores)$$
+    **Result:** A point is only flagged as `True` if **both** the ML model and the statistical threshold agree it is an anomaly.
+
+    ## 📤 Key Output Columns
+    * **`IsolationForest_timeseries_score`**: The decision score (anomaly score).
+    * **`is_IsolationForest_timeseries_anomaly`**: The final boolean flag for anomalies.
+    * **Engineered Features**: All `lagX`, `roll_meanX`, and `roll_stdX` columns created during the process.
+
+    ## 💡 Usage Context
+    Isolation Forest is exceptionally powerful for **multi-dimensional anomalies**.
+    Because it considers lags, rolling stats, and trend simultaneously, it can detect "subtle" anomalies where the value might look normal,
+    but the **relationship** between the value and its recent history is broken.
+
+    ---
+    ### ⚙️ Implementation Strategy
+    For the initial training period, the function fits the model on all training data and scores all training points.
+    For the test points, they are handled one-by-one in a loop. After each prediction, the training set expands to include the latest observed value,
+    ensuring the forest is always aware of the most recent data trends before predicting the next point."""
+
+
     group[date_column] = pd.to_datetime(group[date_column])
     group = group.copy().sort_values(date_column).reset_index(drop=True)
+    group['set'] = np.where(np.arange(len(group)) >= len(group) - eval_period, 'TEST', 'TRAIN')

-    '''
-    Iterate over each of the evaluation periods, fitting the model to all the data before the evaluation period
-    and then getting the predicted anomaly score for the given evaluation period
-    '''
     try:
-
+        all_results = []
+
+        # ===================================================================
+        # STEP 1: Evaluate all points in the initial TRAIN period
+        # ===================================================================
+
+        # Get the cutoff date for initial train period
+        initial_cutoff_date = group[group['set'] == 'TRAIN'][date_column].max()
+
+        # Prepare the full group with features
+        model_group_initial = group.copy()
+
+        # Get train set to determine lags
+        train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+        lags = get_dynamic_lags(train_initial[variable])

+        # Create lag features on the entire model_group DF
+        for lag in lags:
+            model_group_initial[f'lag{lag}'] = model_group_initial[variable].shift(lag)
+
+        # Get rolling stats features for the entire model_group DF
+        rolling_stats_features = []
+        for w in [int(np.ceil(max(lags)/4)), int(np.ceil(max(lags)/2)), int(max(lags))]:
+            if w >= 3:
+                rolling_stats_features.append('roll_mean' + str(w))
+                rolling_stats_features.append('roll_std' + str(w))
+                model_group_initial['roll_mean' + str(w)] = model_group_initial[variable].shift(1).rolling(w).mean()
+                model_group_initial['roll_std' + str(w)] = model_group_initial[variable].shift(1).rolling(w).std()
+
+        # Get trend feature
+        model_group_initial['trend'] = model_group_initial.index
+
+        # Drop records with NAs
+        model_group_initial = model_group_initial.copy().dropna()
+
+        # Get just the initial train set
+        train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+
+        # Identify all model features (lags, rolling stats, trend, and the variable itself)
+        features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+        # Create and fit the model on initial training data
+        iso_forest_model = IsolationForest(
+            n_estimators=200,
+            contamination=0.01,
+            random_state=42
+        )
+        iso_forest_model.fit(train_initial[features])
+
+        # Score all training points
+        train_initial['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train_initial[features])
+
+        # Calculate anomaly threshold
+        positive_scores = train_initial[train_initial['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+        if len(positive_scores) > 0:
+            anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+        else:
+            anomaly_threshold = 0
+
+        # Predict anomalies for training points
+        train_initial['contamination_anomaly'] = iso_forest_model.predict(train_initial[features]) # -1 = anomaly, 1 = normal
+        train_initial['IsolationForest_score_low_timeseries'] = anomaly_threshold
+        train_initial['threshold_anomaly'] = np.where(
+            train_initial['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1
+        )
+
+        # Dual threshold: both contamination and statistical threshold must agree
+        train_initial['is_IsolationForest_anomaly_timeseries'] = np.where(
+            (train_initial['contamination_anomaly'] == -1) & (train_initial['threshold_anomaly'] == -1),
+            True,
+            False
+        )
+
+        # Select relevant columns
+        train_initial_result = train_initial[[
+            variable,
+            date_column,
+            'IsolationForest_score_timeseries',
+            'IsolationForest_score_low_timeseries',
+            'is_IsolationForest_anomaly_timeseries'
+        ]]
+        all_results.append(train_initial_result)
+
+        # ===================================================================
+        # STEP 2: Walk-forward evaluation for TEST period (one-step-ahead)
+        # ===================================================================
+
         for t in list(range(eval_period - 1, -1, -1)):

             try:
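A self-contained sketch of the dual-threshold rule the docstring describes: the statistical bound is the mean of the positive scores minus three of their standard deviations (capped at 0), and a point counts as an anomaly only when that check and the contamination flag agree. The scores and the mocked contamination flag below are illustrative, not drawn from the package:

```python
import numpy as np
import pandas as pd

# Illustrative decision-function scores for a training window.
scores = pd.Series([0.10, 0.08, 0.12, 0.09, 0.11, -0.05, 0.07])

# Statistical "safety" bound: mean(positive scores) - 3 * std(positive scores), capped at 0.
positive = scores[scores > 0]
threshold = min(0, positive.mean() - 3 * positive.std()) if len(positive) else 0

# Contamination flag would come from IsolationForest.predict (-1 = anomaly);
# it is mocked here so the example stays self-contained.
contamination_anomaly = np.array([1, 1, 1, 1, 1, -1, 1])
threshold_anomaly = np.where(scores < threshold, -1, 1)

# A point is flagged only if both checks agree.
is_anomaly = (contamination_anomaly == -1) & (threshold_anomaly == -1)
print(threshold, is_anomaly.tolist())
```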
@@ -92,32 +223,73 @@ def detect_time_series_anomalies_isoforest(
                 iso_forest_model.fit(train[features])

                 train['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train[features])
-
-
+
+                # Calculate anomaly threshold
+                positive_scores = train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+                if len(positive_scores) > 0:
+                    anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+                else:
+                    anomaly_threshold = 0
+
                 test['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(test[features])
                 test['contamination_anomaly'] = iso_forest_model.predict(test[features]) # -1 = anomaly, 1 = normal
-                test['
+                test['IsolationForest_score_low_timeseries'] = anomaly_threshold
                 test['threshold_anomaly'] = np.where(test['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1)
-
-                test['is_IsolationForest_anomaly_timeseries'] = np.where(
-
-
-
+
+                test['is_IsolationForest_anomaly_timeseries'] = np.where(
+                    (test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1),
+                    True,
+                    False
+                )
+                test = test[[
+                    variable,
+                    date_column,
+                    'IsolationForest_score_timeseries',
+                    'IsolationForest_score_low_timeseries',
+                    'is_IsolationForest_anomaly_timeseries'
+                ]]
+                all_results.append(test)
+
+            except Exception as e:
+                print(f"Error in iteration {t}: {e}")
                 pass
+
+        # ===================================================================
+        # STEP 3: Combine all results and merge back to original group
+        # ===================================================================
+
         try:
-
-
-
-
+            all_results_df = pd.concat(all_results, ignore_index=True)
+
+            # Merge back to original group
+            group = group.merge(
+                all_results_df[[
+                    variable,
+                    date_column,
+                    'IsolationForest_score_timeseries',
+                    'IsolationForest_score_low_timeseries',
+                    'is_IsolationForest_anomaly_timeseries'
+                ]],
+                on=[variable, date_column],
+                how='left'
+            )
+
+        except Exception as e:
+            print(f"Error in concatenating results: {e}")
             group["IsolationForest_score_timeseries"] = np.nan
+            group["IsolationForest_score_low_timeseries"] = np.nan
             group["is_IsolationForest_anomaly_timeseries"] = np.nan

-    except:
+    except Exception as e:
+        # Fallback error handling
+        try:
+            group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+            group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+        except:
+            group_id = "Unknown Group ID"
+        print(f'Isolation Forest Anomaly Detection failed for {group_id}. Error: {e}')
         group["IsolationForest_score_timeseries"] = np.nan
+        group["IsolationForest_score_low_timeseries"] = np.nan
         group["is_IsolationForest_anomaly_timeseries"] = np.nan
-        # Get string or object dtype columns from group that would identify the group
-        group_id = key_series.select_dtypes(include=['object', 'string']).columns.tolist()
-        group_id = " ".join(key_series[group_id].reset_index(drop=True).iloc[0].to_list())
-        print(f'Isolation Forest Anomaly Detection failed for {group_id}')

-    return group
+    return group
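Finally, a stripped-down sketch of the expanding-window walk-forward pattern used for the TEST period above, with the lag/rolling feature engineering omitted and toy data substituted, so it is not the package's exact loop:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Toy series: the last `eval_period` points are scored one step ahead,
# refitting on an expanding window each time.
rng = np.random.RandomState(0)
values = pd.Series(rng.normal(100, 5, size=60))
eval_period = 6

scores = {}
for t in range(eval_period - 1, -1, -1):        # oldest test point first
    split = len(values) - t - 1                 # everything before the test point
    train, test_point = values.iloc[:split], values.iloc[[split]]
    iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=42)
    iso.fit(train.to_frame())
    scores[split] = iso.decision_function(test_point.to_frame())[0]

print(scores)
```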