anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,21 +14,24 @@ from .ewma import ewma_with_anomalies_rolling_group
  from .fb_prophet import detect_time_series_anomalies_fb_walkforward
  from .iso_forest_timeseries import detect_time_series_anomalies_isoforest
  from .DB_scan import detect_time_series_anomalies_dbscan
- from .Preprocessing import create_full_calendar_and_interpolate, print_anomaly_stats
- from .evaluation_plots import anomaly_eval_plot, anomaly_percentile_plot,anomaly_sd_plot, anomaly_mad_plot, anomaly_iqr_plot, anomaly_ewma_plot, anomaly_fb_plot, anomaly_dbscan_plot, anomaly_isolation_forest_timeseries_plot
+ from .Preprocessing import (create_full_calendar_and_interpolate,
+ print_anomaly_stats,
+ calculate_ensemble_scores)
+ from .evaluation_plots import (anomaly_overview_plot,
+ anomaly_percentile_plot,
+ anomaly_sd_plot,
+ anomaly_mad_plot,
+ anomaly_iqr_plot,
+ anomaly_ewma_plot,
+ anomaly_fb_plot,
+ anomaly_dbscan_plot,
+ anomaly_isolation_forest_plot)
+
+

- group_columns=["key", "channel"]
- variable="views"
- eval_period = 12
- date_column = "week_start"
- mad_threshold = 2
- mad_scale_factor = 0.6745
- alpha=.3
- sigma=1.5
- interval_width = .95
- freq = 'W-MON'

- def help_info(topic=None):
+
+ def help_anomaly(topic=None):

  #example_df = get_example_df()

@@ -50,6 +53,23 @@ def help_info(topic=None):
  help_sd()
  elif topic.lower()[:3] == 'mad':
  help_mad()
+
+ group_columns=["key", "channel"]
+ variable="views"
+ eval_period = 1
+ date_column = "week_start"
+ min_records = 52
+ max_records = 156
+ mad_threshold = 2
+ mad_scale_factor = 0.6745
+ alpha=.3
+ sigma=1.5
+ prophet_CI = .95
+ freq = 'W-MON'
+ contamination = 0.03
+ random_state =42
+
+ """


  def get_example_df():
@@ -78,8 +98,8 @@ def get_example_df():
  'views': views})


- example_df = create_full_calendar_and_interpolate(example_df,group_columns, variable, date_column, freq)
-
+ example_df = create_full_calendar_and_interpolate(example_df, group_columns, variable, date_column, freq, min_records, max_records)[0]
+
  logging.getLogger('fbprophet').setLevel(logging.ERROR)
  logging.getLogger('cmdstanpy').disabled = True

@@ -95,8 +115,27 @@ def get_example_df():
  df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
  df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
  df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
- df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,interval_width)
- df_iosfor = detect_time_series_anomalies_isoforest(example_df,variable, date_column, eval_period)
+ df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,prophet_CI)
+ df_isofor = detect_time_series_anomalies_isoforest(example_df,variable, date_column, eval_period)
+ ISF_timeseries_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+ df_isofor_final= df_isofor[ISF_timeseries_cols]
+ df_isogen = detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
+ # combine ISF general and timeseries data frames
+ df_ISF= df_isogen.merge(df_isofor_final, on= group_columns+[date_column], how= 'inner')
+
+
+ # Column 1 Logic: If 'type' is train, take from 'col_A', else take from 'col_B'
+ df_ISF['IsolationForest_score'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+ anomaly_key_channel_ISF['IsolationForest_score_general'],
+ anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
+
+ df_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN', anomaly_key_channel_ISF['IsolationForest_score_low_general'],anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
+
+ # Column 2 Logic: If 'type' is train, take from 'IsolationForest_general', else take from 'IsolationForest_timeseries'
+ df_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+ anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
+ anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
+
  df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)

  orig_columns = example_df.columns.to_list()
@@ -108,120 +147,186 @@ def get_example_df():
  df_std.drop(columns=orig_columns, errors='ignore'),
  df_ewma.drop(columns=orig_columns, errors='ignore'),
  df_fb.drop(columns=orig_columns, errors='ignore'),
- df_iosfor.drop(columns=orig_columns, errors='ignore'),
+ df_ISF.drop(columns=orig_columns, errors='ignore'),
  df_dbscan.drop(columns=orig_columns, errors='ignore')
  ], axis=1)

- # Scaled Scores
- # example_df['Percentile_score_scaled'] = np.where(example_df['is_Percentile_anomaly'].isna()==False,
- # abs(example_df['views'] - (example_df['Percentile_high'] + example_df['Percentile_low'])/2)/\
- # ((example_df['Percentile_high'] - example_df['Percentile_low'])/2) - 1, np.nan)
-
- # example_df['SD_score_scaled'] = np.where(example_df['is_SD_anomaly'].isna()==False,
- # abs(example_df[variable] - (example_df['SD2_high'] + example_df['SD2_low'])/2)/\
- # ((example_df['SD2_high'] - example_df['SD2_low'])/2) - 1, np.nan)
-
- # example_df['MAD_score_scaled'] = np.where(example_df['is_MAD_anomaly'].isna()==False,
- # abs(example_df[variable] - (example_df['MAD_high'] + example_df['MAD_low'])/2)/\
- # ((example_df['MAD_high'] - example_df['MAD_low'])/2) - 1, np.nan)
-
- # example_df['IQR_score_scaled'] = np.where(example_df['is_IQR_anomaly'].isna()==False,
- # abs(example_df['views'] - (example_df['IQR_high'] + example_df['IQR_low'])/2)/\
- # ((example_df['IQR_high'] - example_df['IQR_low'])/2) - 1, np.nan)
-
- # example_df['EWMA_score_scaled'] = np.where(example_df['is_EWMA_anomaly'].isna()==False,
- # abs(example_df['views'] - (example_df['EWMA_high'] + example_df['EWMA_low'])/2)/\
- # ((example_df['EWMA_high'] - example_df['EWMA_low'])/2) - 1, np.nan)
-
- # example_df['FB_score_scaled'] = np.where(example_df['is_FB_anomaly'].isna()==False,
- # abs(example_df['views'] - (example_df['FB_high'] + example_df['FB_low'])/2)/\
- # ((example_df['FB_high'] - example_df['FB_low'])/2) - 1, np.nan)
-
- # score_scaled_cols = []
- # for col in example_df.columns.to_list():
- # if col.endswith('_scaled'):
- # score_scaled_cols.append(col)
-
- # example_df['Anomaly_Score'] = example_df[score_scaled_cols].mean(axis=1)
-
- # example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
- # np.where(example_df['Anomaly_Score'].between(0, 1), np.ceil(100*example_df['Anomaly_Score']),
- # np.where(example_df['Anomaly_Score'] > 1, 100, 0)))
-
- is_anom_cols = []
- for col in example_df.columns.to_list():
- if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_Anomaly':
- is_anom_cols.append(col)
-
- example_df['Anomaly_Votes'] = example_df[is_anom_cols].sum(axis=1).astype(float)
- example_df['is_Anomaly'] = np.where(example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(False, True).sum(axis=1) >= 0.5, True, False)
- example_df['Anomaly_Score'] = 2 * (example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(True, 1).replace(False, 1).sum(axis=1) - 0.5).astype(float)
- example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
- np.where(example_df['Anomaly_Score'] > 0, np.ceil(100*example_df['Anomaly_Score']), 1)).astype(float)
+ example_df = calculate_ensemble_scores(example_df, 'views')

+ globals()['anomaly_example_df'] = example_df
  return example_df
+ """
+
+ def get_example_df():
+ """
+ Generates a sample dataset and runs all 8 models to demonstrate
+ the anomaly-pipeline functionality.
+ """
+
+ # 1. Create dummy time-series data
+ views = [
+ 223006, 145101, 136508, 119284, 151332, 169419, 158795, 163725, 161911, 153131,
+ 178292, 188910, 192736, 165486, 157370, 151250, 151699, 144465, 167651, 185210,
+ 172594, 176735, 158885, 140992, 184203, 235889, 203074, 203714, 162486, 227249,
+ 243952, 241711, 213386, 183171, 176070, 185944, 191282, 180852, 219299, 271454,
+ 216265, 150586, 123755, 126039, 117597, 103758, 133977, 144088, 143186, 247731,
+ 267901, 289105, 378025, 221419, 119153, 117262, 135635, 157462, 158551, 162637,
+ 157246, 144626, 129089, 153280, 145880, 130291, 114119, 112931, 110593, 120172,
+ 185307, 213343, 164825, 153140, 127525, 128465, 180317, 232471, 229766, 129962,
+ 98732, 181722, 198247, 222167, 175792, 131070, 154662, 158707, 152083, 151097,
+ 194114, 230775, 195828, 150668, 119488, 118110, 165357, 150681, 151303, 137414,
+ 126470, 223347, 222285, 244610, 277318
+ ]
+
+ example_df = pd.DataFrame({
+ 'key': ['PLP>appliances>refrigerators'] * len(views),
+ 'channel': ['raw_desktop_views'] * len(views),
+ 'week_start': pd.date_range(start='2023-11-27', periods=len(views), freq=freq),
+ 'views': views
+ })
+
+ # 2. Preprocessing
+ # Assuming create_full_calendar_and_interpolate returns a tuple (df, success, exclusion)
+ example_df = create_full_calendar_and_interpolate(
+ example_df, group_columns, variable, date_column, freq, min_records, max_records
+ )[0]
+
+ # Silence Prophet/CmdStanPy noise
+ logging.getLogger('fbprophet').setLevel(logging.ERROR)
+ logging.getLogger('cmdstanpy').disabled = True
+
+ # 3. Individual Model Detections
+ df_percentile = detect_outliers_percentile(example_df, variable, date_column, eval_period)
+ df_iqr = detect_outliers_iqr(example_df, variable, date_column, eval_period)
+ df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
+ df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
+ df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
+ df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period, prophet_CI)
+ df_isofor = detect_time_series_anomalies_isoforest(example_df, variable, date_column, eval_period)
+
+ # 4. Handle Isolation Forest Logic (Consolidating General + Time-series)
+ df_isogen = detect_outliers_isf_general(example_df, variable, contamination, random_state, eval_period)
+
+ ISF_ts_cols = group_columns + [date_column] + ["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+ df_isofor_sub = df_isofor[ISF_ts_cols]
+
+ df_ISF = df_isogen.merge(df_isofor_sub, on=group_columns + [date_column], how='inner')
+
+ # Logical mapping for ISF Ensemble
+ df_ISF['IsolationForest_score'] = np.where(df_ISF['set'] == 'TRAIN',
+ df_ISF['IsolationForest_score_general'],
+ df_ISF['IsolationForest_score_timeseries'])

+ df_ISF['IsolationForest_score_low'] = np.where(df_ISF['set'] == 'TRAIN',
+ df_ISF['IsolationForest_score_low_general'],
+ df_ISF['IsolationForest_score_low_timeseries'])
+
+ df_ISF['is_IsolationForest_anomaly'] = np.where(df_ISF['set'] == 'TRAIN',
+ df_ISF['is_IsolationForest_anomaly_general'],
+ df_ISF['is_IsolationForest_anomaly_timeseries'])
+
+ # 5. Final Model (DBSCAN)
+ df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)
+
+ # 6. Concatenate Results
+ # Identify non-original columns to avoid duplicates during join
+ orig_cols = example_df.columns.to_list()
+
+ combined_df = pd.concat([
+ example_df,
+ df_percentile.drop(columns=orig_cols, errors='ignore'),
+ df_iqr.drop(columns=orig_cols, errors='ignore'),
+ df_mad.drop(columns=orig_cols, errors='ignore'),
+ df_std.drop(columns=orig_cols, errors='ignore'),
+ df_ewma.drop(columns=orig_cols, errors='ignore'),
+ df_fb.drop(columns=orig_cols, errors='ignore'),
+ df_ISF.drop(columns=orig_cols, errors='ignore'),
+ df_dbscan.drop(columns=orig_cols, errors='ignore')
+ ], axis=1)
+
+ # 7. Calculate Final Ensemble Scores
+ final_example_df = calculate_ensemble_scores(combined_df, variable)
+
+ # Optional: assign to a global variable for notebook access
+ globals()['anomaly_example_df'] = final_example_df
+
+ return final_example_df
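Note: `calculate_ensemble_scores` is imported from `.Preprocessing`, but its body is not shown in this diff. A minimal sketch of the majority-vote scoring it appears to replace, reconstructed from the inline `Anomaly_Votes` block removed earlier in this hunk; the column names and the vote rule come from that removed code, everything else is an assumption rather than the helper's actual implementation:

```python
import numpy as np
import pandas as pd

def calculate_ensemble_scores_sketch(df: pd.DataFrame, variable: str) -> pd.DataFrame:
    # Every per-model flag is named like is_<Model>_anomaly.
    is_anom_cols = [c for c in df.columns
                    if c.startswith('is_') and c.endswith('_anomaly') and c != 'is_Anomaly']
    flags = df[is_anom_cols]
    models_voting = flags.notna().sum(axis=1)                 # how many models scored each row
    votes = flags.fillna(False).astype(float).sum(axis=1)     # True counts as 1
    df['Anomaly_Votes'] = votes
    df['is_Anomaly'] = votes / models_voting >= 0.5           # simple majority of the models
    df['Anomaly_Score'] = 2 * (votes / models_voting - 0.5)   # rescaled vote share in [-1, 1]
    df['Anomaly_Score_Display'] = np.where(df['Anomaly_Score'] < 0,
                                           np.floor(100 * df['Anomaly_Score']),
                                           np.ceil(100 * df['Anomaly_Score']))
    # (the real helper also receives the target column name; it is unused in this sketch)
    return df
```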

  def help_overview():
  display(Markdown(overview_msg))
  example_df = get_example_df()
  display(example_df[['key', 'channel', 'week_start', 'views']].tail(12))
  display(Markdown(overview_msg2))
- anomaly_eval_plot(example_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)
+ anomaly_overview_plot(example_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)


  def help_percentile():
  display(Markdown(percentile_msg))
  example_df = get_example_df()
- anomaly_percentile_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
+ anomaly_percentile_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+ def help_sd():
+ display(Markdown(sd_msg))
+ example_df = get_example_df()
+ anomaly_sd_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+ def help_mad():
+ display(Markdown(mad_msg))
+ example_df = get_example_df()
+ anomaly_mad_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


  def help_iqr():
  display(Markdown(iqr_msg))
  example_df = get_example_df()
- anomaly_iqr_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
+ anomaly_iqr_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


- def help_mad():
- display(Markdown(mad_msg))
- example_df = get_example_df()
- anomaly_mad_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-
-
- def help_sd():
- display(Markdown(sd_msg))
- example_df = get_example_df()
- anomaly_sd_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)

-
  def help_ewma():
  display(Markdown(ewma_msg))
  example_df = get_example_df()
- anomaly_ewma_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
+ anomaly_ewma_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


  def help_fb():
  display(Markdown(fb_msg))
  example_df = get_example_df()
- anomaly_fb_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
+ anomaly_fb_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+ def help_isofor():
+ display(Markdown(isofor_msg))
+ example_df = get_example_df()
+ anomaly_isolation_forest_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


  def help_dbscan():
  display(Markdown(dbscan_msg))
  example_df = get_example_df()
- anomaly_dbscan_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
+ anomaly_dbscan_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


- def help_isofor():
- display(Markdown(isofor_msg))
- example_df = get_example_df()
- anomaly_isolation_forest_timeseries_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-

  overview_msg = """
  # 🏗️ The Anomaly Detection Function
  ---

+ FYI, you can see information about specific models used in the anomaly pipeline with any of the following commands:
+
+
+ ```python
+ help_anomaly('percentile')
+ help_anomaly('iqr')
+ help_anomaly('mad')
+ help_anomaly('std')
+ help_anomaly('ewma')
+ help_anomaly('prophet')
+ help_anomaly('dbscan')
+ help_anomaly('iso') # For information on isolation forest
+ ```
+
+ ---
+
  The `run_pipeline` function handles end-to-end processing — from data cleaning and interpolation to executing multiple machine learning models in parallel and aggregating their results into a final "Consensus" anomaly flag.

  ## 📋 Functional Overview
@@ -229,7 +334,7 @@ The pipeline takes raw master data, partitions it into groups by unique ID, appl

  The master data DataFrame that you pass into the anomaly detection pipeline needs to have at least 3 columns - unique ID, date, and a target variable. The unique ID can be defined by multiple columns.

- Here is an example of a DataFrame that has two columns that comprise the unique ID ('key' and 'channel'), 'week_start' is the date column, and 'views' is the target variable:"""
+ Here is an example of a DataFrame that has two columns that comprise the unique ID `['key', 'channel']`, `week_start` is the date column, and `views` is the target variable:"""


  overview_msg2 = """
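Note: a minimal sketch of a conforming master DataFrame, mirroring the example shown by `help_overview()`; the column names follow that example and are not required by the API beyond "unique ID columns + date column + target variable":

```python
import pandas as pd

# Two ID columns ('key', 'channel'), one weekly date column, one numeric target.
master_df = pd.DataFrame({
    'key': ['PLP>appliances>refrigerators'] * 8,
    'channel': ['raw_desktop_views'] * 8,
    'week_start': pd.date_range(start='2023-11-27', periods=8, freq='W-MON'),
    'views': [223006, 145101, 136508, 119284, 151332, 169419, 158795, 163725],
})
```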
@@ -272,7 +377,7 @@ Use `run_pipeline` when you need a **highly reliable, automated output**. By com
  | :--- | :--- | :--- |
  | **`eval_period`** | `12` | The number of recent weeks to evaluate for anomalies. |
  | **`alpha` / `sigma`** | `0.3` / `1.5` | Sensitivity settings for the EWMA model. |
- | **`interval_width`** | `0.90` | The confidence interval for the Prophet (FB) model. |
+ | **`prophet_CI`** | `0.90` | The confidence interval for the Prophet (FB) model. |
  | **`n_jobs`** | `-1` | Utilizes all available processor cores for parallelization. |
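Note: `run_pipeline` itself is not part of this diff, so its exact signature is not shown here. A hedged sketch of a call using the parameter names from the table above and the module-level defaults added in this release; treat every keyword below as an assumption, not a verified API:

```python
# Hypothetical call: argument names mirror the documentation table, not a confirmed signature.
results = run_pipeline(
    master_df,                          # master DataFrame with ID, date, and target columns
    group_columns=['key', 'channel'],
    variable='views',
    date_column='week_start',
    eval_period=12,
    alpha=0.3,
    sigma=1.5,
    prophet_CI=0.90,
    n_jobs=-1,                          # use all available processor cores
)
```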

@@ -400,7 +505,7 @@ Unlike standard batch forecasting, this function operates by simulating a real-w
  * **Robust Error Handling:** If the Prophet fit fails, the function falls back to a **baseline persistence model** (last observed value) to prevent pipeline failure.

  ### 3. Anomaly Classification
- * **Uncertainty Bounds:** Anomalies are defined by the `interval_width` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
+ * **Uncertainty Bounds:** Anomalies are defined by the `prophet_CI` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
  * **Residual Calculation:** The function computes the **FB_residual** (Actual - Forecast) to quantify the magnitude of deviations.

  ## 📤 Key Output Columns
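Note: a self-contained sketch of the classification rule described above. `yhat`, `yhat_lower` and `yhat_upper` are Prophet's standard forecast columns (the interval width is what `prophet_CI` controls when the model is fit); the `FB_residual` / `is_FB_anomaly` names mirror this package's output columns, and the joined-frame layout is an assumption:

```python
import pandas as pd

# One row per evaluated week, with the actual value already joined to the forecast.
eval_df = pd.DataFrame({
    'views':      [150000, 210000, 95000],
    'yhat':       [152000, 160000, 150000],
    'yhat_lower': [130000, 140000, 128000],
    'yhat_upper': [175000, 182000, 172000],
})

# Residual quantifies the deviation; anything outside the interval is flagged.
eval_df['FB_residual'] = eval_df['views'] - eval_df['yhat']
eval_df['is_FB_anomaly'] = (
    (eval_df['views'] < eval_df['yhat_lower']) | (eval_df['views'] > eval_df['yhat_upper'])
)
```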
@@ -1,7 +1,7 @@
  import pandas as pd
  from sklearn.ensemble import IsolationForest

- def detect_outliers_isf_general(group, variable, contamination=0.03, random_state=42, eval_period=12):
+ def detect_outliers_isf_general(group, variable, contamination, random_state, eval_period):
  n = len(group)
  if n < 10:
  return pd.DataFrame(columns=group.columns)
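Note: with the defaults dropped from the signature, call sites now pass these values explicitly. A usage sketch in keyword form, using the module-level constants added elsewhere in this release (`contamination = 0.03`, `random_state = 42`); the frame and the concrete values are illustrative only:

```python
# 'example_df' stands for one group's rows (ID columns + date column + target).
df_isogen = detect_outliers_isf_general(
    example_df,
    variable='views',
    contamination=0.03,
    random_state=42,
    eval_period=12,
)
```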
@@ -12,6 +12,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
  # Initialize columns
  group['set'] = ""
  group['IsolationForest_score_general'] = 0.0
+ group['IsolationForest_score_low_general'] = 0.0
  group['is_IsolationForest_anomaly_general'] = False

  # --- 1. HANDLE TRAINING DATA (Initial Block) ---
@@ -19,11 +20,11 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
  initial_train = group[[variable]].iloc[:train_size]

  iso = IsolationForest(contamination=contamination, random_state=random_state)
+ iso.fit(initial_train)

- # Fit and predict the initial block
- group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.fit_predict(initial_train) # Note: this is actually the cluster label
  # We use decision_function for the raw anomaly score
  group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.decision_function(initial_train)
+ group.loc[group.index[:train_size], 'IsolationForest_score_low_general'] = iso.offset_
  group.loc[group.index[:train_size], 'is_IsolationForest_anomaly_general'] = iso.predict(initial_train) == -1
  group.loc[group.index[:train_size], 'set'] = "TRAIN"
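Note: the new `IsolationForest_score_low_general` column stores `iso.offset_`, which acts as the cut-off for the raw score. A small self-contained sketch of how scikit-learn's `decision_function`, `offset_`, and `predict` relate (standard scikit-learn behaviour, not code from this package):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.normal(size=(100, 1))

iso = IsolationForest(contamination=0.03, random_state=42).fit(X)

scores = iso.decision_function(X)   # equals score_samples(X) - iso.offset_; lower = more anomalous
labels = iso.predict(X)             # -1 for anomalies, 1 for inliers

# predict() flags exactly the points whose decision_function is negative,
# i.e. whose raw score_samples value falls below offset_.
assert np.array_equal(labels == -1, scores < 0)
```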

@@ -41,6 +42,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
  current_point = group[[variable]].iloc[[i]]

  group.iloc[i, group.columns.get_loc('IsolationForest_score_general')] = iso_expanding.decision_function(current_point)[0]
+ group.iloc[i, group.columns.get_loc('IsolationForest_score_low_general')] = iso_expanding.offset_
  group.iloc[i, group.columns.get_loc('is_IsolationForest_anomaly_general')] = iso_expanding.predict(current_point)[0] == -1
  group.iloc[i, group.columns.get_loc('set')] = "TEST"

@@ -35,16 +35,147 @@ def detect_time_series_anomalies_isoforest(
  eval_period,
  ):

+ """
+ # 🌲 Isolation Forest Time-Series Anomaly Detection
+ ---
+
+ The `detect_time_series_anomalies_isoforest` function implements an **unsupervised machine learning** approach to outlier detection.
+ Unlike traditional statistical models that define "normal" regions, this model explicitly identifies anomalies by **isolating** them in a high-dimensional feature space.
+
+ ## 📋 Functional Overview
+ This function utilizes a **walk-forward validation** strategy. For the initial training period, all points are evaluated using an
+ Isolation Forest fitted on that same training data. For every evaluation point in the test period, it dynamically engineers a unique feature set,
+ fits a forest of decision trees, and determines if the current observation is an outlier based on how easily it can be isolated from historical data.
+
+ ## 🧠 Core Logic & Helper Utilities
+
+ ### 1. Dynamic Feature Engineering (`get_dynamic_lags`)
+ To capture the temporal structure of the data, the model doesn't just look at the raw value; it looks at the **context**.
+ * **Autocorrelation (ACF):** The function calculates the **10 most significant lags** based on the data's historical patterns.
+ * **Momentum:** It always includes lags 1, 2, and 3 to ensure immediate short-term trends are captured.
+ * **Rolling Statistics:** It automatically calculates **rolling means** and **standard deviations** at multiple scales (quarter-lag, half-lag, and full-lag intervals).
+
+ ### 2. Isolation Forest Model Configuration
+ The model builds **200 trees** (`n_estimators`) to ensure a stable anomaly score.
+ * **Contamination:** A baseline assumption that **1%** of the data is inherently noisy.
+ * **Decision Function:** The model calculates an anomaly score where lower, more negative values indicate a higher likelihood of being an outlier.
+
+ ### 3. Dual-Threshold Validation
+ To reduce "false positives," the function uses two layers of verification:
+ 1. **Contamination Anomaly:** The standard output from the sklearn model based on the 1% threshold.
+ 2. **Statistical Threshold:** A custom "safety" bound calculated as:
+ > $$Mean(Positive Scores) - 3 \\times Std(Positive Scores)$$
+ **Result:** A point is only flagged as `True` if **both** the ML model and the statistical threshold agree it is an anomaly.
+
+ ## 📤 Key Output Columns
+ * **`IsolationForest_score_timeseries`**: The decision score (anomaly score).
+ * **`is_IsolationForest_anomaly_timeseries`**: The final boolean flag for anomalies.
+ * **Engineered Features**: All `lagX`, `roll_meanX`, and `roll_stdX` columns created during the process.
+
+ ## 💡 Usage Context
+ Isolation Forest is exceptionally powerful for **multi-dimensional anomalies**.
+ Because it considers lags, rolling stats, and trend simultaneously, it can detect "subtle" anomalies where the value might look normal,
+ but the **relationship** between the value and its recent history is broken.
+
+ ---
+ ### ⚙️ Implementation Strategy
+ For the initial training period, the function fits the model on all training data and scores all training points.
+ Test points are handled one-by-one in a loop. After each prediction, the training set expands to include the latest observed value,
+ ensuring the forest is always aware of the most recent data trends before predicting the next point."""
+
+
  group[date_column] = pd.to_datetime(group[date_column])
  group = group.copy().sort_values(date_column).reset_index(drop=True)
+ group['set'] = np.where(np.arange(len(group)) >= len(group) - eval_period, 'TEST', 'TRAIN')

- '''
- Iterate over each of the evaluation periods, fitting the model to all the data before the evaluation period
- and then getting the predicted anomaly score for the given evaluation period
- '''
  try:
- test_anom = []
+
+ all_results = []
+
+ # ===================================================================
+ # STEP 1: Evaluate all points in the initial TRAIN period
+ # ===================================================================
+
+ # Get the cutoff date for initial train period
+ initial_cutoff_date = group[group['set'] == 'TRAIN'][date_column].max()
+
+ # Prepare the full group with features
+ model_group_initial = group.copy()
+
+ # Get train set to determine lags
+ train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+ lags = get_dynamic_lags(train_initial[variable])

+ # Create lag features on the entire model_group DF
+ for lag in lags:
+ model_group_initial[f'lag{lag}'] = model_group_initial[variable].shift(lag)
+
+ # Get rolling stats features for the entire model_group DF
+ rolling_stats_features = []
+ for w in [int(np.ceil(max(lags)/4)), int(np.ceil(max(lags)/2)), int(max(lags))]:
+ if w >= 3:
+ rolling_stats_features.append('roll_mean' + str(w))
+ rolling_stats_features.append('roll_std' + str(w))
+ model_group_initial['roll_mean' + str(w)] = model_group_initial[variable].shift(1).rolling(w).mean()
+ model_group_initial['roll_std' + str(w)] = model_group_initial[variable].shift(1).rolling(w).std()
+
+ # Get trend feature
+ model_group_initial['trend'] = model_group_initial.index
+
+ # Drop records with NAs
+ model_group_initial = model_group_initial.copy().dropna()
+
+ # Get just the initial train set
+ train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+
+ # Identify all model features (lags, rolling stats, trend, and the variable itself)
+ features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+ # Create and fit the model on initial training data
+ iso_forest_model = IsolationForest(
+ n_estimators=200,
+ contamination=0.01,
+ random_state=42
+ )
+ iso_forest_model.fit(train_initial[features])
+
+ # Score all training points
+ train_initial['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train_initial[features])
+
+ # Calculate anomaly threshold
+ positive_scores = train_initial[train_initial['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+ if len(positive_scores) > 0:
+ anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+ else:
+ anomaly_threshold = 0
+
+ # Predict anomalies for training points
+ train_initial['contamination_anomaly'] = iso_forest_model.predict(train_initial[features]) # -1 = anomaly, 1 = normal
+ train_initial['IsolationForest_score_low_timeseries'] = anomaly_threshold
+ train_initial['threshold_anomaly'] = np.where(
+ train_initial['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1
+ )
+
+ # Dual threshold: both contamination and statistical threshold must agree
+ train_initial['is_IsolationForest_anomaly_timeseries'] = np.where(
+ (train_initial['contamination_anomaly'] == -1) & (train_initial['threshold_anomaly'] == -1),
+ True,
+ False
+ )
+
+ # Select relevant columns
+ train_initial_result = train_initial[[
+ variable,
+ date_column,
+ 'IsolationForest_score_timeseries',
+ 'IsolationForest_score_low_timeseries',
+ 'is_IsolationForest_anomaly_timeseries'
+ ]]
+ all_results.append(train_initial_result)
+
+ # ===================================================================
+ # STEP 2: Walk-forward evaluation for TEST period (one-step-ahead)
+ # ===================================================================
+
  for t in list(range(eval_period - 1, -1, -1)):

  try:
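Note: `get_dynamic_lags` is called here but its source is not part of this diff. A sketch consistent with the docstring above (ten strongest autocorrelation lags, with lags 1-3 always kept); the use of `statsmodels` and every implementation detail below are assumptions, not the package's actual helper:

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import acf

def get_dynamic_lags_sketch(series: pd.Series, n_lags: int = 10) -> list:
    # Autocorrelation for every candidate lag up to half the series length.
    max_lag = max(3, len(series) // 2)
    acf_vals = acf(series, nlags=max_lag)
    # Rank lags (ignoring lag 0) by absolute autocorrelation, keep the strongest ten.
    ranked = np.argsort(np.abs(acf_vals[1:]))[::-1] + 1
    top_lags = set(ranked[:n_lags].tolist())
    # Always keep the short-term "momentum" lags.
    top_lags |= {1, 2, 3}
    return sorted(top_lags)
```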
@@ -92,32 +223,73 @@ detect_time_series_anomalies_isoforest(
  iso_forest_model.fit(train[features])

  train['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train[features])
- anomaly_threshold = min(0,
- train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].mean() - 3 * train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].std())
+
+ # Calculate anomaly threshold
+ positive_scores = train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+ if len(positive_scores) > 0:
+ anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+ else:
+ anomaly_threshold = 0
+
  test['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(test[features])
  test['contamination_anomaly'] = iso_forest_model.predict(test[features]) # -1 = anomaly, 1 = normal
- test['anomaly_threshold'] = anomaly_threshold
+ test['IsolationForest_score_low_timeseries'] = anomaly_threshold
  test['threshold_anomaly'] = np.where(test['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1)
-
- test['is_IsolationForest_anomaly_timeseries'] = np.where((test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1), True, False)
- test = test[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']]
- test_anom.append(test)
- except:
+
+ test['is_IsolationForest_anomaly_timeseries'] = np.where(
+ (test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1),
+ True,
+ False
+ )
+ test = test[[
+ variable,
+ date_column,
+ 'IsolationForest_score_timeseries',
+ 'IsolationForest_score_low_timeseries',
+ 'is_IsolationForest_anomaly_timeseries'
+ ]]
+ all_results.append(test)
+
+ except Exception as e:
+ print(f"Error in iteration {t}: {e}")
  pass
+
+ # ===================================================================
+ # STEP 3: Combine all results and merge back to original group
+ # ===================================================================
+
  try:
- test_anom = pd.concat(test_anom)
- group = group.merge(test_anom[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']], on=[variable, date_column], how='left')
- except:
- print("Error in Isolation Forest process")
+ all_results_df = pd.concat(all_results, ignore_index=True)
+
+ # Merge back to original group
+ group = group.merge(
+ all_results_df[[
+ variable,
+ date_column,
+ 'IsolationForest_score_timeseries',
+ 'IsolationForest_score_low_timeseries',
+ 'is_IsolationForest_anomaly_timeseries'
+ ]],
+ on=[variable, date_column],
+ how='left'
+ )
+
+ except Exception as e:
+ print(f"Error in concatenating results: {e}")
  group["IsolationForest_score_timeseries"] = np.nan
+ group["IsolationForest_score_low_timeseries"] = np.nan
  group["is_IsolationForest_anomaly_timeseries"] = np.nan

- except:
+ except Exception as e:
+ # Fallback error handling
+ try:
+ group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+ group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+ except:
+ group_id = "Unknown Group ID"
+ print(f'Isolation Forest Anomaly Detection failed for {group_id}. Error: {e}')
  group["IsolationForest_score_timeseries"] = np.nan
+ group["IsolationForest_score_low_timeseries"] = np.nan
  group["is_IsolationForest_anomaly_timeseries"] = np.nan
- # Get string or object dtype columns from group that would identify the group
- group_id = key_series.select_dtypes(include=['object', 'string']).columns.tolist()
- group_id = " ".join(key_series[group_id].reset_index(drop=True).iloc[0].to_list())
- print(f'Isolation Forest Anomaly Detection failed for {group_id}')

- return group
+ return group