anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/{help_info.py → help_anomaly.py}

@@ -14,21 +14,24 @@ from .ewma import ewma_with_anomalies_rolling_group
 from .fb_prophet import detect_time_series_anomalies_fb_walkforward
 from .iso_forest_timeseries import detect_time_series_anomalies_isoforest
 from .DB_scan import detect_time_series_anomalies_dbscan
-from .Preprocessing import create_full_calendar_and_interpolate,
-
+from .Preprocessing import (create_full_calendar_and_interpolate,
+                            print_anomaly_stats,
+                            calculate_ensemble_scores)
+from .evaluation_plots import (anomaly_overview_plot,
+                               anomaly_percentile_plot,
+                               anomaly_sd_plot,
+                               anomaly_mad_plot,
+                               anomaly_iqr_plot,
+                               anomaly_ewma_plot,
+                               anomaly_fb_plot,
+                               anomaly_dbscan_plot,
+                               anomaly_isolation_forest_plot)
+
+

-group_columns=["key", "channel"]
-variable="views"
-eval_period = 12
-date_column = "week_start"
-mad_threshold = 2
-mad_scale_factor = 0.6745
-alpha=.3
-sigma=1.5
-interval_width = .95
-freq = 'W-MON'

-
+
+def help_anomaly(topic=None):

     #example_df = get_example_df()

@@ -50,6 +53,23 @@ def help_info(topic=None):
         help_sd()
     elif topic.lower()[:3] == 'mad':
         help_mad()
+
+group_columns=["key", "channel"]
+variable="views"
+eval_period = 1
+date_column = "week_start"
+min_records = 52
+max_records = 156
+mad_threshold = 2
+mad_scale_factor = 0.6745
+alpha=.3
+sigma=1.5
+prophet_CI = .95
+freq = 'W-MON'
+contamination = 0.03
+random_state =42
+
+"""


 def get_example_df():
@@ -78,8 +98,8 @@ def get_example_df():
         'views': views})


-    example_df = create_full_calendar_and_interpolate(example_df,group_columns, variable, date_column, freq)
-
+    example_df = create_full_calendar_and_interpolate(example_df, group_columns, variable, date_column, freq, min_records, max_records)[0]
+
     logging.getLogger('fbprophet').setLevel(logging.ERROR)
     logging.getLogger('cmdstanpy').disabled = True

@@ -95,8 +115,27 @@ def get_example_df():
     df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
     df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
     df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
-    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,
-
+    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period,prophet_CI)
+    df_isofor = detect_time_series_anomalies_isoforest(example_df,variable, date_column, eval_period)
+    ISF_timeseries_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    df_isofor_final= df_isofor[ISF_timeseries_cols]
+    df_isogen = detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
+    # combine ISF general and timeseries data frames
+    df_ISF= df_isogen.merge(df_isofor_final, on= group_columns+[date_column], how= 'inner')
+
+
+    # Column 1 Logic: If 'type' is train, take from 'col_A', else take from 'col_B'
+    df_ISF['IsolationForest_score'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                               anomaly_key_channel_ISF['IsolationForest_score_general'],
+                                               anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
+
+    df_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN', anomaly_key_channel_ISF['IsolationForest_score_low_general'],anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
+
+    # Column 2 Logic: If 'type' is train, take from 'IsolationForest_general', else take from 'IsolationForest_timeseries'
+    df_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                    anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
+                                                    anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
+
     df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)

     orig_columns = example_df.columns.to_list()
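The hunk above merges the batch ("general") and walk-forward ("timeseries") Isolation Forest outputs and then picks one score per row based on the `set` column. A minimal sketch of that `np.where` selection pattern, on a made-up two-row frame (column names follow the hunk; the numbers are illustrative):

```python
import numpy as np
import pandas as pd

# Illustrative frame: one TRAIN row and one TEST row with made-up scores.
df_ISF = pd.DataFrame({
    "set": ["TRAIN", "TEST"],
    "IsolationForest_score_general": [0.12, 0.05],
    "IsolationForest_score_timeseries": [0.08, -0.02],
})

# TRAIN rows keep the batch score, TEST rows keep the walk-forward score.
df_ISF["IsolationForest_score"] = np.where(
    df_ISF["set"] == "TRAIN",
    df_ISF["IsolationForest_score_general"],
    df_ISF["IsolationForest_score_timeseries"],
)
print(df_ISF)
```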
@@ -108,120 +147,186 @@ def get_example_df():
         df_std.drop(columns=orig_columns, errors='ignore'),
         df_ewma.drop(columns=orig_columns, errors='ignore'),
         df_fb.drop(columns=orig_columns, errors='ignore'),
-
+        df_ISF.drop(columns=orig_columns, errors='ignore'),
         df_dbscan.drop(columns=orig_columns, errors='ignore')
     ], axis=1)

-
-    # example_df['Percentile_score_scaled'] = np.where(example_df['is_Percentile_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['Percentile_high'] + example_df['Percentile_low'])/2)/\
-    # ((example_df['Percentile_high'] - example_df['Percentile_low'])/2) - 1, np.nan)
-
-    # example_df['SD_score_scaled'] = np.where(example_df['is_SD_anomaly'].isna()==False,
-    # abs(example_df[variable] - (example_df['SD2_high'] + example_df['SD2_low'])/2)/\
-    # ((example_df['SD2_high'] - example_df['SD2_low'])/2) - 1, np.nan)
-
-    # example_df['MAD_score_scaled'] = np.where(example_df['is_MAD_anomaly'].isna()==False,
-    # abs(example_df[variable] - (example_df['MAD_high'] + example_df['MAD_low'])/2)/\
-    # ((example_df['MAD_high'] - example_df['MAD_low'])/2) - 1, np.nan)
-
-    # example_df['IQR_score_scaled'] = np.where(example_df['is_IQR_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['IQR_high'] + example_df['IQR_low'])/2)/\
-    # ((example_df['IQR_high'] - example_df['IQR_low'])/2) - 1, np.nan)
-
-    # example_df['EWMA_score_scaled'] = np.where(example_df['is_EWMA_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['EWMA_high'] + example_df['EWMA_low'])/2)/\
-    # ((example_df['EWMA_high'] - example_df['EWMA_low'])/2) - 1, np.nan)
-
-    # example_df['FB_score_scaled'] = np.where(example_df['is_FB_anomaly'].isna()==False,
-    # abs(example_df['views'] - (example_df['FB_high'] + example_df['FB_low'])/2)/\
-    # ((example_df['FB_high'] - example_df['FB_low'])/2) - 1, np.nan)
-
-    # score_scaled_cols = []
-    # for col in example_df.columns.to_list():
-    #     if col.endswith('_scaled'):
-    #         score_scaled_cols.append(col)
-
-    # example_df['Anomaly_Score'] = example_df[score_scaled_cols].mean(axis=1)
-
-    # example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
-    #     np.where(example_df['Anomaly_Score'].between(0, 1), np.ceil(100*example_df['Anomaly_Score']),
-    #     np.where(example_df['Anomaly_Score'] > 1, 100, 0)))
-
-    is_anom_cols = []
-    for col in example_df.columns.to_list():
-        if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_Anomaly':
-            is_anom_cols.append(col)
-
-    example_df['Anomaly_Votes'] = example_df[is_anom_cols].sum(axis=1).astype(float)
-    example_df['is_Anomaly'] = np.where(example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(False, True).sum(axis=1) >= 0.5, True, False)
-    example_df['Anomaly_Score'] = 2 * (example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(True, 1).replace(False, 1).sum(axis=1) - 0.5).astype(float)
-    example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
-        np.where(example_df['Anomaly_Score'] > 0, np.ceil(100*example_df['Anomaly_Score']), 1)).astype(float)
+    example_df = calculate_ensemble_scores(example_df, 'views')

+    globals()['anomaly_example_df'] = example_df
     return example_df
+"""
+
+def get_example_df():
+    """
+    Generates a sample dataset and runs all 8 models to demonstrate
+    the anomaly-pipeline functionality.
+    """
+
+    # 1. Create dummy time-series data
+    views = [
+        223006, 145101, 136508, 119284, 151332, 169419, 158795, 163725, 161911, 153131,
+        178292, 188910, 192736, 165486, 157370, 151250, 151699, 144465, 167651, 185210,
+        172594, 176735, 158885, 140992, 184203, 235889, 203074, 203714, 162486, 227249,
+        243952, 241711, 213386, 183171, 176070, 185944, 191282, 180852, 219299, 271454,
+        216265, 150586, 123755, 126039, 117597, 103758, 133977, 144088, 143186, 247731,
+        267901, 289105, 378025, 221419, 119153, 117262, 135635, 157462, 158551, 162637,
+        157246, 144626, 129089, 153280, 145880, 130291, 114119, 112931, 110593, 120172,
+        185307, 213343, 164825, 153140, 127525, 128465, 180317, 232471, 229766, 129962,
+        98732, 181722, 198247, 222167, 175792, 131070, 154662, 158707, 152083, 151097,
+        194114, 230775, 195828, 150668, 119488, 118110, 165357, 150681, 151303, 137414,
+        126470, 223347, 222285, 244610, 277318
+    ]
+
+    example_df = pd.DataFrame({
+        'key': ['PLP>appliances>refrigerators'] * len(views),
+        'channel': ['raw_desktop_views'] * len(views),
+        'week_start': pd.date_range(start='2023-11-27', periods=len(views), freq=freq),
+        'views': views
+    })
+
+    # 2. Preprocessing
+    # Assuming create_full_calendar_and_interpolate returns a tuple (df, success, exclusion)
+    example_df = create_full_calendar_and_interpolate(
+        example_df, group_columns, variable, date_column, freq, min_records, max_records
+    )[0]
+
+    # Silence Prophet/CmdStanPy noise
+    logging.getLogger('fbprophet').setLevel(logging.ERROR)
+    logging.getLogger('cmdstanpy').disabled = True
+
+    # 3. Individual Model Detections
+    df_percentile = detect_outliers_percentile(example_df, variable, date_column, eval_period)
+    df_iqr = detect_outliers_iqr(example_df, variable, date_column, eval_period)
+    df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
+    df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
+    df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
+    df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period, prophet_CI)
+    df_isofor = detect_time_series_anomalies_isoforest(example_df, variable, date_column, eval_period)
+
+    # 4. Handle Isolation Forest Logic (Consolidating General + Time-series)
+    df_isogen = detect_outliers_isf_general(example_df, variable, contamination, random_state, eval_period)
+
+    ISF_ts_cols = group_columns + [date_column] + ["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    df_isofor_sub = df_isofor[ISF_ts_cols]
+
+    df_ISF = df_isogen.merge(df_isofor_sub, on=group_columns + [date_column], how='inner')
+
+    # Logical mapping for ISF Ensemble
+    df_ISF['IsolationForest_score'] = np.where(df_ISF['set'] == 'TRAIN',
+                                               df_ISF['IsolationForest_score_general'],
+                                               df_ISF['IsolationForest_score_timeseries'])

+    df_ISF['IsolationForest_score_low'] = np.where(df_ISF['set'] == 'TRAIN',
+                                                   df_ISF['IsolationForest_score_low_general'],
+                                                   df_ISF['IsolationForest_score_low_timeseries'])
+
+    df_ISF['is_IsolationForest_anomaly'] = np.where(df_ISF['set'] == 'TRAIN',
+                                                    df_ISF['is_IsolationForest_anomaly_general'],
+                                                    df_ISF['is_IsolationForest_anomaly_timeseries'])
+
+    # 5. Final Model (DBSCAN)
+    df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)
+
+    # 6. Concatenate Results
+    # Identify non-original columns to avoid duplicates during join
+    orig_cols = example_df.columns.to_list()
+
+    combined_df = pd.concat([
+        example_df,
+        df_percentile.drop(columns=orig_cols, errors='ignore'),
+        df_iqr.drop(columns=orig_cols, errors='ignore'),
+        df_mad.drop(columns=orig_cols, errors='ignore'),
+        df_std.drop(columns=orig_cols, errors='ignore'),
+        df_ewma.drop(columns=orig_cols, errors='ignore'),
+        df_fb.drop(columns=orig_cols, errors='ignore'),
+        df_ISF.drop(columns=orig_cols, errors='ignore'),
+        df_dbscan.drop(columns=orig_cols, errors='ignore')
+    ], axis=1)
+
+    # 7. Calculate Final Ensemble Scores
+    final_example_df = calculate_ensemble_scores(combined_df, variable)
+
+    # Optional: assign to a global variable for notebook access
+    globals()['anomaly_example_df'] = final_example_df
+
+    return final_example_df

 def help_overview():
     display(Markdown(overview_msg))
     example_df = get_example_df()
     display(example_df[['key', 'channel', 'week_start', 'views']].tail(12))
     display(Markdown(overview_msg2))
-
+    anomaly_overview_plot(example_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)


 def help_percentile():
     display(Markdown(percentile_msg))
     example_df = get_example_df()
-    anomaly_percentile_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_percentile_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_sd():
+    display(Markdown(sd_msg))
+    example_df = get_example_df()
+    anomaly_sd_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_mad():
+    display(Markdown(mad_msg))
+    example_df = get_example_df()
+    anomaly_mad_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_iqr():
     display(Markdown(iqr_msg))
     example_df = get_example_df()
-    anomaly_iqr_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_iqr_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


-def help_mad():
-    display(Markdown(mad_msg))
-    example_df = get_example_df()
-    anomaly_mad_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-
-
-def help_sd():
-    display(Markdown(sd_msg))
-    example_df = get_example_df()
-    anomaly_sd_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)

-
 def help_ewma():
     display(Markdown(ewma_msg))
     example_df = get_example_df()
-    anomaly_ewma_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_ewma_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_fb():
     display(Markdown(fb_msg))
     example_df = get_example_df()
-    anomaly_fb_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_fb_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)
+
+def help_isofor():
+    display(Markdown(isofor_msg))
+    example_df = get_example_df()
+    anomaly_isolation_forest_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


 def help_dbscan():
     display(Markdown(dbscan_msg))
     example_df = get_example_df()
-    anomaly_dbscan_plot(example_df, group_columns, variable, date_column, final_anomalies=False
+    anomaly_dbscan_plot(example_df, group_columns, variable, date_column, eval_period, final_anomalies=False)


-def help_isofor():
-    display(Markdown(isofor_msg))
-    example_df = get_example_df()
-    anomaly_isolation_forest_timeseries_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)
-

 overview_msg = """
 # 🏗️ The Anomaly Detection Function
 ---

+FYI, you can see information about specific models used in the anomaly pipeline with any of the following commands:
+
+
+```python
+help_anomaly('percentile')
+help_anomaly('iqr')
+help_anomaly('mad')
+help_anomaly('std')
+help_anomaly('ewma')
+help_anomaly('prophet')
+help_anomaly('dbscan')
+help_anomaly('iso')    # For information on isolation forest
+```
+
+---
+
 The `run_pipeline` function handles end-to-end processing — from data cleaning and interpolation to executing multiple machine learning models in parallel and aggregating their results into a final "Consensus" anomaly flag.

 ## 📋 Functional Overview
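The removed block above produced the consensus flag by majority vote; the new code delegates that to `calculate_ensemble_scores`, which is not itself shown in this diff. A standalone sketch of the voting arithmetic visible in the removed lines (illustrative flags, not the package's function):

```python
import pandas as pd

# Illustrative vote matrix: each column is one detector's boolean flag per row.
flags = pd.DataFrame({
    "is_MAD_anomaly":  [True,  False, True],
    "is_SD_anomaly":   [True,  False, False],
    "is_EWMA_anomaly": [False, False, True],
    "is_FB_anomaly":   [True,  False, True],
})

votes = flags.sum(axis=1)          # number of detectors that flagged the row
n_models = flags.shape[1]          # detectors that voted
fraction = votes / n_models

is_anomaly = fraction >= 0.5       # simple majority
score = 2 * (fraction - 0.5)       # rescale [0, 1] -> [-1, 1]
print(pd.DataFrame({"votes": votes, "is_anomaly": is_anomaly, "score": score}))
```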
@@ -229,7 +334,7 @@ The pipeline takes raw master data, partitions it into groups by unique ID, appl

 The master data DataFrame that you pass into the anomaly detection pipeline needs to have at least 3 columns - unique ID, date, and a target variable. The unique ID can be defined by multiple columns.

-Here is an example of a DataFrame that has two columns that comprise the unique ID
+Here is an example of a DataFrame that has two columns that comprise the unique ID `['key', 'channel']`, `week_start` is the date column, and `views` is the target variable:"""


 overview_msg2 = """
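For reference, a minimal frame with the shape the overview describes; the values are borrowed from the example data earlier in this diff, and `run_pipeline` itself is not reproduced here:

```python
import pandas as pd

# Hypothetical master data: ['key', 'channel'] form the unique ID,
# 'week_start' is the date column, 'views' is the target variable.
master_df = pd.DataFrame({
    "key":        ["PLP>appliances>refrigerators"] * 3,
    "channel":    ["raw_desktop_views"] * 3,
    "week_start": pd.date_range("2023-11-27", periods=3, freq="W-MON"),
    "views":      [223006, 145101, 136508],
})
print(master_df.dtypes)
```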
@@ -272,7 +377,7 @@ Use `run_pipeline` when you need a **highly reliable, automated output**. By com
 | :--- | :--- | :--- |
 | **`eval_period`** | `12` | The number of recent weeks to evaluate for anomalies. |
 | **`alpha` / `sigma`** | `0.3` / `1.5` | Sensitivity settings for the EWMA model. |
-| **`
+| **`prophet_CI`** | `0.90` | The confidence interval for the Prophet (FB) model. |
 | **`n_jobs`** | `-1` | Utilizes all available processor cores for parallelization. |


@@ -400,7 +505,7 @@ Unlike standard batch forecasting, this function operates by simulating a real-w
 * **Robust Error Handling:** If the Prophet fit fails, the function falls back to a **baseline persistence model** (last observed value) to prevent pipeline failure.

 ### 3. Anomaly Classification
-* **Uncertainty Bounds:** Anomalies are defined by the `
+* **Uncertainty Bounds:** Anomalies are defined by the `prophet_CI` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
 * **Residual Calculation:** The function computes the **FB_residual** (Actual - Forecast) to quantify the magnitude of deviations.

 ## 📤 Key Output Columns
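A small sketch of the classification rule described above: flag any observation outside the Prophet interval and keep the residual for magnitude. The bound column names follow the help text (`FB_low`, `FB_high`, `FB_residual`); `FB_fcst` and the numbers are made up for illustration:

```python
import pandas as pd

# Illustrative frame with observed values and hypothetical Prophet forecast bounds.
df = pd.DataFrame({
    "views":   [150_000, 240_000, 120_000],
    "FB_fcst": [155_000, 160_000, 125_000],
    "FB_low":  [130_000, 135_000, 100_000],
    "FB_high": [180_000, 185_000, 150_000],
})

df["FB_residual"] = df["views"] - df["FB_fcst"]                       # Actual - Forecast
df["is_FB_anomaly"] = (df["views"] < df["FB_low"]) | (df["views"] > df["FB_high"])
print(df)
```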
anomaly_pipeline/helpers/iso_forest_general.py

@@ -1,7 +1,7 @@
 import pandas as pd
 from sklearn.ensemble import IsolationForest

-def detect_outliers_isf_general(group, variable, contamination
+def detect_outliers_isf_general(group, variable, contamination, random_state, eval_period):
     n = len(group)
     if n < 10:
         return pd.DataFrame(columns=group.columns)
@@ -12,6 +12,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
     # Initialize columns
     group['set'] = ""
     group['IsolationForest_score_general'] = 0.0
+    group['IsolationForest_score_low_general'] = 0.0
     group['is_IsolationForest_anomaly_general'] = False

     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
@@ -19,11 +20,11 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
     initial_train = group[[variable]].iloc[:train_size]

     iso = IsolationForest(contamination=contamination, random_state=random_state)
+    iso.fit(initial_train)

-    # Fit and predict the initial block
-    group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.fit_predict(initial_train) # Note: this is actually the cluster label
     # We use decision_function for the raw anomaly score
     group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.decision_function(initial_train)
+    group.loc[group.index[:train_size], 'IsolationForest_score_low_general'] = iso.offset_
     group.loc[group.index[:train_size], 'is_IsolationForest_anomaly_general'] = iso.predict(initial_train) == -1
     group.loc[group.index[:train_size], 'set'] = "TRAIN"

@@ -41,6 +42,7 @@ def detect_outliers_isf_general(group, variable, contamination=0.03, random_stat
         current_point = group[[variable]].iloc[[i]]

         group.iloc[i, group.columns.get_loc('IsolationForest_score_general')] = iso_expanding.decision_function(current_point)[0]
+        group.iloc[i, group.columns.get_loc('IsolationForest_score_low_general')] = iso_expanding.offset_
         group.iloc[i, group.columns.get_loc('is_IsolationForest_anomaly_general')] = iso_expanding.predict(current_point)[0] == -1
         group.iloc[i, group.columns.get_loc('set')] = "TEST"

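Both hunks above record `decision_function` output as the general score and `offset_` as its companion lower bound. A small standalone sketch of how those scikit-learn attributes relate on toy data (not the package's code):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.normal(loc=0.0, scale=1.0, size=(200, 1))    # toy 1-D training data

iso = IsolationForest(contamination=0.03, random_state=42)
iso.fit(X)

scores = iso.decision_function(X)    # equals score_samples(X) - iso.offset_
flags = iso.predict(X)               # -1 = anomaly, 1 = normal

# In scikit-learn these are consistent: predict == -1 exactly where the
# shifted score is negative.
assert np.array_equal(flags == -1, scores < 0)
print("offset_:", iso.offset_, "anomalies:", int((flags == -1).sum()))
```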
anomaly_pipeline/helpers/iso_forest_timeseries.py

@@ -35,16 +35,147 @@ def detect_time_series_anomalies_isoforest(
     eval_period,
 ):

+    """
+    # 🌲 Isolation Forest Time-Series Anomaly Detection
+    ---
+
+    The `detect_time_series_anomalies_isoforest` function implements an **unsupervised machine learning** approach to outlier detection.
+    Unlike traditional statistical models that define "normal" regions, this model explicitly identifies anomalies by **isolating** them in a high-dimensional feature space.
+
+    ## 📋 Functional Overview
+    This function utilizes a **walk-forward validation** strategy. For the initial training period, all points are evaluated using
+    Isolation Forest fitted on the same training data. For every evaluation point in the test period, it dynamically engineers a unique feature set,
+    fits a forest of decision trees, and determines if the current observation is an outlier based on how easily it can be isolated from historical data.
+
+    ## 🧠 Core Logic & Helper Utilities
+
+    ### 1. Dynamic Feature Engineering (`get_dynamic_lags`)
+    To capture the temporal structure of the data, the model doesn't just look at the raw value; it looks at the **context**.
+    * **Autocorrelation (ACF):** The function calculates the **10 most significant lags** based on the data's historical patterns.
+    * **Momentum:** It always includes lags 1, 2, and 3 to ensure immediate short-term trends are captured.
+    * **Rolling Statistics:** It automatically calculates **rolling means** and **standard deviations** at multiple scales (quarter-lag, half-lag, and full-lag intervals).
+
+    ### 2. Isolation Forest Model Configuration
+    The model builds **200 trees** (`n_estimators`) to ensure a stable anomaly score.
+    * **Contamination:** A baseline assumption that **1%** of the data is inherently noisy.
+    * **Decision Function:** The model calculates an anomaly score where lower, more negative values indicate a higher likelihood of being an outlier.
+
+    ### 3. Dual-Threshold Validation
+    To reduce "false positives," the function uses two layers of verification:
+    1. **Contamination Anomaly:** The standard output from the sklearn model based on the 1% threshold.
+    2. **Statistical Threshold:** A custom "safety" bound calculated as:
+    > $$Mean(Positive Scores) - 3 \\times Std(Positive Scores)$$
+    **Result:** A point is only flagged as `True` if **both** the ML model and the statistical threshold agree it is an anomaly.
+
+    ## 📤 Key Output Columns
+    * **`IsolationForest_timeseries_score`**: The decision score (anomaly score).
+    * **`is_IsolationForest_timeseries_anomaly`**: The final boolean flag for anomalies.
+    * **Engineered Features**: All `lagX`, `roll_meanX`, and `roll_stdX` columns created during the process.
+
+    ## 💡 Usage Context
+    Isolation Forest is exceptionally powerful for **multi-dimensional anomalies**.
+    Because it considers lags, rolling stats, and trend simultaneously, it can detect "subtle" anomalies where the value might look normal,
+    but the **relationship** between the value and its recent history is broken.
+
+    ---
+    ### ⚙️ Implementation Strategy
+    For the initial training period, the function fits the model on all training data and scores all training points.
+    For the test points, they are handled one-by-one in a loop. After each prediction, the training set expands to include the latest observed value,
+    ensuring the forest is always aware of the most recent data trends before predicting the next point."""
+
+
     group[date_column] = pd.to_datetime(group[date_column])
     group = group.copy().sort_values(date_column).reset_index(drop=True)
+    group['set'] = np.where(np.arange(len(group)) >= len(group) - eval_period, 'TEST', 'TRAIN')

-    '''
-    Iterate over each of the evaluation periods, fitting the model to all the data before the evaluation period
-    and then getting the predicted anomaly score for the given evaluation period
-    '''
     try:
-
+        all_results = []
+
+        # ===================================================================
+        # STEP 1: Evaluate all points in the initial TRAIN period
+        # ===================================================================
+
+        # Get the cutoff date for initial train period
+        initial_cutoff_date = group[group['set'] == 'TRAIN'][date_column].max()
+
+        # Prepare the full group with features
+        model_group_initial = group.copy()
+
+        # Get train set to determine lags
+        train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+        lags = get_dynamic_lags(train_initial[variable])

+        # Create lag features on the entire model_group DF
+        for lag in lags:
+            model_group_initial[f'lag{lag}'] = model_group_initial[variable].shift(lag)
+
+        # Get rolling stats features for the entire model_group DF
+        rolling_stats_features = []
+        for w in [int(np.ceil(max(lags)/4)), int(np.ceil(max(lags)/2)), int(max(lags))]:
+            if w >= 3:
+                rolling_stats_features.append('roll_mean' + str(w))
+                rolling_stats_features.append('roll_std' + str(w))
+                model_group_initial['roll_mean' + str(w)] = model_group_initial[variable].shift(1).rolling(w).mean()
+                model_group_initial['roll_std' + str(w)] = model_group_initial[variable].shift(1).rolling(w).std()
+
+        # Get trend feature
+        model_group_initial['trend'] = model_group_initial.index
+
+        # Drop records with NAs
+        model_group_initial = model_group_initial.copy().dropna()
+
+        # Get just the initial train set
+        train_initial = model_group_initial[model_group_initial['set'] == 'TRAIN'].copy()
+
+        # Identify all model features (lags, rolling stats, trend, and the variable itself)
+        features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+        # Create and fit the model on initial training data
+        iso_forest_model = IsolationForest(
+            n_estimators=200,
+            contamination=0.01,
+            random_state=42
+        )
+        iso_forest_model.fit(train_initial[features])
+
+        # Score all training points
+        train_initial['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train_initial[features])
+
+        # Calculate anomaly threshold
+        positive_scores = train_initial[train_initial['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+        if len(positive_scores) > 0:
+            anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+        else:
+            anomaly_threshold = 0
+
+        # Predict anomalies for training points
+        train_initial['contamination_anomaly'] = iso_forest_model.predict(train_initial[features]) # -1 = anomaly, 1 = normal
+        train_initial['IsolationForest_score_low_timeseries'] = anomaly_threshold
+        train_initial['threshold_anomaly'] = np.where(
+            train_initial['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1
+        )
+
+        # Dual threshold: both contamination and statistical threshold must agree
+        train_initial['is_IsolationForest_anomaly_timeseries'] = np.where(
+            (train_initial['contamination_anomaly'] == -1) & (train_initial['threshold_anomaly'] == -1),
+            True,
+            False
+        )
+
+        # Select relevant columns
+        train_initial_result = train_initial[[
+            variable,
+            date_column,
+            'IsolationForest_score_timeseries',
+            'IsolationForest_score_low_timeseries',
+            'is_IsolationForest_anomaly_timeseries'
+        ]]
+        all_results.append(train_initial_result)
+
+        # ===================================================================
+        # STEP 2: Walk-forward evaluation for TEST period (one-step-ahead)
+        # ===================================================================
+
         for t in list(range(eval_period - 1, -1, -1)):

             try:
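A self-contained sketch of the dual-threshold rule the docstring describes: the statistical bound is the mean of the positive scores minus three of their standard deviations (capped at 0), and a point counts as an anomaly only when that check and the contamination flag agree. The scores and the mocked contamination flag below are illustrative, not drawn from the package:

```python
import numpy as np
import pandas as pd

# Illustrative decision-function scores for a training window.
scores = pd.Series([0.10, 0.08, 0.12, 0.09, 0.11, -0.05, 0.07])

# Statistical "safety" bound: mean(positive scores) - 3 * std(positive scores), capped at 0.
positive = scores[scores > 0]
threshold = min(0, positive.mean() - 3 * positive.std()) if len(positive) else 0

# Contamination flag would come from IsolationForest.predict (-1 = anomaly);
# it is mocked here so the example stays self-contained.
contamination_anomaly = np.array([1, 1, 1, 1, 1, -1, 1])
threshold_anomaly = np.where(scores < threshold, -1, 1)

# A point is flagged only if both checks agree.
is_anomaly = (contamination_anomaly == -1) & (threshold_anomaly == -1)
print(threshold, is_anomaly.tolist())
```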
@@ -92,32 +223,73 @@ def detect_time_series_anomalies_isoforest(
                 iso_forest_model.fit(train[features])

                 train['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train[features])
-
-
+
+                # Calculate anomaly threshold
+                positive_scores = train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries']
+                if len(positive_scores) > 0:
+                    anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+                else:
+                    anomaly_threshold = 0
+
                 test['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(test[features])
                 test['contamination_anomaly'] = iso_forest_model.predict(test[features]) # -1 = anomaly, 1 = normal
-                test['
+                test['IsolationForest_score_low_timeseries'] = anomaly_threshold
                 test['threshold_anomaly'] = np.where(test['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1)
-
-                test['is_IsolationForest_anomaly_timeseries'] = np.where(
-
-
-
+
+                test['is_IsolationForest_anomaly_timeseries'] = np.where(
+                    (test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1),
+                    True,
+                    False
+                )
+                test = test[[
+                    variable,
+                    date_column,
+                    'IsolationForest_score_timeseries',
+                    'IsolationForest_score_low_timeseries',
+                    'is_IsolationForest_anomaly_timeseries'
+                ]]
+                all_results.append(test)
+
+            except Exception as e:
+                print(f"Error in iteration {t}: {e}")
                 pass
+
+        # ===================================================================
+        # STEP 3: Combine all results and merge back to original group
+        # ===================================================================
+
         try:
-
-
-
-
+            all_results_df = pd.concat(all_results, ignore_index=True)
+
+            # Merge back to original group
+            group = group.merge(
+                all_results_df[[
+                    variable,
+                    date_column,
+                    'IsolationForest_score_timeseries',
+                    'IsolationForest_score_low_timeseries',
+                    'is_IsolationForest_anomaly_timeseries'
+                ]],
+                on=[variable, date_column],
+                how='left'
+            )
+
+        except Exception as e:
+            print(f"Error in concatenating results: {e}")
             group["IsolationForest_score_timeseries"] = np.nan
+            group["IsolationForest_score_low_timeseries"] = np.nan
             group["is_IsolationForest_anomaly_timeseries"] = np.nan

-    except:
+    except Exception as e:
+        # Fallback error handling
+        try:
+            group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+            group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+        except:
+            group_id = "Unknown Group ID"
+        print(f'Isolation Forest Anomaly Detection failed for {group_id}. Error: {e}')
         group["IsolationForest_score_timeseries"] = np.nan
+        group["IsolationForest_score_low_timeseries"] = np.nan
         group["is_IsolationForest_anomaly_timeseries"] = np.nan
-        # Get string or object dtype columns from group that would identify the group
-        group_id = key_series.select_dtypes(include=['object', 'string']).columns.tolist()
-        group_id = " ".join(key_series[group_id].reset_index(drop=True).iloc[0].to_list())
-        print(f'Isolation Forest Anomaly Detection failed for {group_id}')

-    return group
+    return group
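Finally, a stripped-down sketch of the expanding-window walk-forward pattern used for the TEST period above, with the lag/rolling feature engineering omitted and toy data substituted, so it is not the package's exact loop:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Toy series: the last `eval_period` points are scored one step ahead,
# refitting on an expanding window each time.
rng = np.random.RandomState(0)
values = pd.Series(rng.normal(100, 5, size=60))
eval_period = 6

scores = {}
for t in range(eval_period - 1, -1, -1):        # oldest test point first
    split = len(values) - t - 1                 # everything before the test point
    train, test_point = values.iloc[:split], values.iloc[[split]]
    iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=42)
    iso.fit(train.to_frame())
    scores[split] = iso.decision_function(test_point.to_frame())[0]

print(scores)
```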