anomaly-pipeline 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +2 -0
- anomaly_pipeline/helpers/DB_scan.py +188 -0
- anomaly_pipeline/helpers/IQR.py +71 -0
- anomaly_pipeline/helpers/MAD.py +88 -0
- anomaly_pipeline/helpers/Preprocessing.py +116 -0
- anomaly_pipeline/helpers/STD.py +70 -0
- anomaly_pipeline/helpers/__init__.py +1 -0
- anomaly_pipeline/helpers/baseline.py +112 -0
- anomaly_pipeline/helpers/cluster_functions.py +289 -0
- anomaly_pipeline/helpers/evaluation_info.py +121 -0
- anomaly_pipeline/helpers/evaluation_plots.py +546 -0
- anomaly_pipeline/helpers/ewma.py +119 -0
- anomaly_pipeline/helpers/fb_prophet.py +94 -0
- anomaly_pipeline/helpers/help_info.py +683 -0
- anomaly_pipeline/helpers/iso_forest_general.py +50 -0
- anomaly_pipeline/helpers/iso_forest_timeseries.py +123 -0
- anomaly_pipeline/helpers/percentile.py +65 -0
- anomaly_pipeline/main.py +63 -0
- anomaly_pipeline/pipeline.py +253 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +15 -0
- anomaly_pipeline-0.1.27.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/WHEEL +5 -0
- anomaly_pipeline-0.1.27.dist-info/entry_points.txt +2 -0
- anomaly_pipeline-0.1.27.dist-info/top_level.txt +1 -0
@@ -0,0 +1,683 @@
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from IPython.display import display, Markdown
import logging
import warnings
warnings.filterwarnings("ignore")
from .percentile import detect_outliers_percentile
from .STD import detect_outliers_sd
from .MAD import detect_outliers_mad
from .IQR import detect_outliers_iqr
from .iso_forest_general import detect_outliers_isf_general
from .ewma import ewma_with_anomalies_rolling_group
from .fb_prophet import detect_time_series_anomalies_fb_walkforward
from .iso_forest_timeseries import detect_time_series_anomalies_isoforest
from .DB_scan import detect_time_series_anomalies_dbscan
from .Preprocessing import create_full_calendar_and_interpolate, print_anomaly_stats
from .evaluation_plots import (
    anomaly_eval_plot,
    anomaly_percentile_plot,
    anomaly_sd_plot,
    anomaly_mad_plot,
    anomaly_iqr_plot,
    anomaly_ewma_plot,
    anomaly_fb_plot,
    anomaly_dbscan_plot,
    anomaly_isolation_forest_timeseries_plot,
)

# Default settings used by the help examples
group_columns = ["key", "channel"]
variable = "views"
eval_period = 12
date_column = "week_start"
mad_threshold = 2
mad_scale_factor = 0.6745
alpha = 0.3
sigma = 1.5
interval_width = 0.95
freq = 'W-MON'

def help_info(topic=None):

    # example_df = get_example_df()

    if topic is None:
        help_overview()
    elif topic.lower()[:7] == 'percent':
        help_percentile()
    elif topic.lower() == 'iqr':
        help_iqr()
    elif topic.lower()[:2] == 'fb' or topic.lower()[:5] == 'proph':
        help_fb()
    elif topic.lower() == 'ewma':
        help_ewma()
    elif topic.lower()[:2] == 'db':
        help_dbscan()
    elif topic.lower()[:3] == 'iso':
        help_isofor()
    elif topic.lower()[:2] in ['st', 'sd']:
        help_sd()
    elif topic.lower()[:3] == 'mad':
        help_mad()

def get_example_df():

    # Check if example_df already exists in the notebook
    global_vars = globals()
    exists = ('example_df' in global_vars) and isinstance(global_vars['example_df'], pd.DataFrame)

    # If it doesn't exist, create it
    if not exists:

        global example_df

        views = [223006, 145101, 136508, 119284, 151332, 169419, 158795, 163725, 161911, 153131, 178292, 188910, 192736, 165486, 157370, 151250, 151699,
                 144465, 167651, 185210, 172594, 176735, 158885, 140992, 184203, 235889, 203074, 203714, 162486, 227249, 243952, 241711, 213386, 183171,
                 176070, 185944, 191282, 180852, 219299, 271454, 216265, 150586, 123755, 126039, 117597, 103758, 133977, 144088, 143186, 247731, 267901,
                 289105, 378025, 221419, 119153, 117262, 135635, 157462, 158551, 162637, 157246, 144626, 129089, 153280, 145880, 130291, 114119, 112931,
                 110593, 120172, 185307, 213343, 164825, 153140, 127525, 128465, 180317, 232471, 229766, 129962, 98732, 181722, 198247, 222167, 175792,
                 131070, 154662, 158707, 152083, 151097, 194114, 230775, 195828, 150668, 119488, 118110, 165357, 150681, 151303, 137414, 126470, 223347,
                 222285, 244610, 277318]

        example_df = pd.DataFrame({
            'key': ['PLP>appliances>refrigerators'] * len(views),
            'channel': ['raw_desktop_views'] * len(views),
            'week_start': pd.date_range(start='2023-11-27', end='2025-11-24', freq='W-MON'),
            'views': views})

        example_df = create_full_calendar_and_interpolate(example_df, group_columns, variable, date_column, freq)

        logging.getLogger('fbprophet').setLevel(logging.ERROR)
        logging.getLogger('cmdstanpy').disabled = True

        # tmp_model = Prophet(
        #     weekly_seasonality=True,
        #     yearly_seasonality=True,
        #     daily_seasonality=False
        # )
        # tmp_model.fit(example_df[['week_start', 'views']].rename(columns={'week_start': 'ds', 'views': 'y'}))

        # Run each detector on the example series
        df_percentile = detect_outliers_percentile(example_df, variable, date_column, eval_period)
        df_iqr = detect_outliers_iqr(example_df, variable, date_column, eval_period)
        df_mad = detect_outliers_mad(example_df, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
        df_std = detect_outliers_sd(example_df, variable, date_column, eval_period)
        df_ewma = ewma_with_anomalies_rolling_group(example_df, group_columns, variable, date_column, alpha, sigma, eval_period)
        df_fb = detect_time_series_anomalies_fb_walkforward(example_df, variable, date_column, eval_period, interval_width)
        df_isofor = detect_time_series_anomalies_isoforest(example_df, variable, date_column, eval_period)
        df_dbscan = detect_time_series_anomalies_dbscan(example_df, variable, date_column, eval_period)

        # Keep only the columns each detector added, then join them side by side
        orig_columns = example_df.columns.to_list()
        example_df = pd.concat([
            example_df,
            df_percentile.drop(columns=orig_columns, errors='ignore'),
            df_iqr.drop(columns=orig_columns, errors='ignore'),
            df_mad.drop(columns=orig_columns, errors='ignore'),
            df_std.drop(columns=orig_columns, errors='ignore'),
            df_ewma.drop(columns=orig_columns, errors='ignore'),
            df_fb.drop(columns=orig_columns, errors='ignore'),
            df_isofor.drop(columns=orig_columns, errors='ignore'),
            df_dbscan.drop(columns=orig_columns, errors='ignore')
        ], axis=1)

        # Scaled Scores
        # example_df['Percentile_score_scaled'] = np.where(example_df['is_Percentile_anomaly'].isna()==False,
        #     abs(example_df['views'] - (example_df['Percentile_high'] + example_df['Percentile_low'])/2)/\
        #     ((example_df['Percentile_high'] - example_df['Percentile_low'])/2) - 1, np.nan)

        # example_df['SD_score_scaled'] = np.where(example_df['is_SD_anomaly'].isna()==False,
        #     abs(example_df[variable] - (example_df['SD2_high'] + example_df['SD2_low'])/2)/\
        #     ((example_df['SD2_high'] - example_df['SD2_low'])/2) - 1, np.nan)

        # example_df['MAD_score_scaled'] = np.where(example_df['is_MAD_anomaly'].isna()==False,
        #     abs(example_df[variable] - (example_df['MAD_high'] + example_df['MAD_low'])/2)/\
        #     ((example_df['MAD_high'] - example_df['MAD_low'])/2) - 1, np.nan)

        # example_df['IQR_score_scaled'] = np.where(example_df['is_IQR_anomaly'].isna()==False,
        #     abs(example_df['views'] - (example_df['IQR_high'] + example_df['IQR_low'])/2)/\
        #     ((example_df['IQR_high'] - example_df['IQR_low'])/2) - 1, np.nan)

        # example_df['EWMA_score_scaled'] = np.where(example_df['is_EWMA_anomaly'].isna()==False,
        #     abs(example_df['views'] - (example_df['EWMA_high'] + example_df['EWMA_low'])/2)/\
        #     ((example_df['EWMA_high'] - example_df['EWMA_low'])/2) - 1, np.nan)

        # example_df['FB_score_scaled'] = np.where(example_df['is_FB_anomaly'].isna()==False,
        #     abs(example_df['views'] - (example_df['FB_high'] + example_df['FB_low'])/2)/\
        #     ((example_df['FB_high'] - example_df['FB_low'])/2) - 1, np.nan)

        # score_scaled_cols = []
        # for col in example_df.columns.to_list():
        #     if col.endswith('_scaled'):
        #         score_scaled_cols.append(col)

        # example_df['Anomaly_Score'] = example_df[score_scaled_cols].mean(axis=1)

        # example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
        #                                       np.where(example_df['Anomaly_Score'].between(0, 1), np.ceil(100*example_df['Anomaly_Score']),
        #                                       np.where(example_df['Anomaly_Score'] > 1, 100, 0)))

        # Consensus voting across the individual per-model flags
        is_anom_cols = []
        for col in example_df.columns.to_list():
            if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_Anomaly':
                is_anom_cols.append(col)

        # Numerator: models that flagged the row; denominator: models that produced any flag (non-null)
        example_df['Anomaly_Votes'] = example_df[is_anom_cols].sum(axis=1).astype(float)
        example_df['is_Anomaly'] = np.where(example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(False, True).sum(axis=1) >= 0.5, True, False)
        example_df['Anomaly_Score'] = 2 * (example_df['Anomaly_Votes']/example_df[is_anom_cols].replace(True, 1).replace(False, 1).sum(axis=1) - 0.5).astype(float)
        example_df['Anomaly_Score_Display'] = np.where(example_df['Anomaly_Score'] < 0, np.floor(100*example_df['Anomaly_Score']),
                                              np.where(example_df['Anomaly_Score'] > 0, np.ceil(100*example_df['Anomaly_Score']), 1)).astype(float)

    return example_df

def help_overview():
    display(Markdown(overview_msg))
    example_df = get_example_df()
    display(example_df[['key', 'channel', 'week_start', 'views']].tail(12))
    display(Markdown(overview_msg2))
    anomaly_eval_plot(example_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)


def help_percentile():
    display(Markdown(percentile_msg))
    example_df = get_example_df()
    anomaly_percentile_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_iqr():
    display(Markdown(iqr_msg))
    example_df = get_example_df()
    anomaly_iqr_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_mad():
    display(Markdown(mad_msg))
    example_df = get_example_df()
    anomaly_mad_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_sd():
    display(Markdown(sd_msg))
    example_df = get_example_df()
    anomaly_sd_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_ewma():
    display(Markdown(ewma_msg))
    example_df = get_example_df()
    anomaly_ewma_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_fb():
    display(Markdown(fb_msg))
    example_df = get_example_df()
    anomaly_fb_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_dbscan():
    display(Markdown(dbscan_msg))
    example_df = get_example_df()
    anomaly_dbscan_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)


def help_isofor():
    display(Markdown(isofor_msg))
    example_df = get_example_df()
    anomaly_isolation_forest_timeseries_plot(example_df, group_columns, variable, date_column, final_anomalies=False, eval_period=12)

overview_msg = """
# The Anomaly Detection Function
---

The `run_pipeline` function handles end-to-end processing, from data cleaning and interpolation to executing multiple machine learning models in parallel and aggregating their results into a final "Consensus" anomaly flag.

## Functional Overview
The pipeline takes raw master data, partitions it into groups by unique ID, applies a suite of 8 different anomaly detection methods, and then flags observations as anomalies where at least half of the models consider the observation an anomaly.

The master data DataFrame that you pass into the anomaly detection pipeline needs at least 3 columns: a unique ID, a date, and a target variable. The unique ID can be defined by multiple columns.

Here is an example of a DataFrame in which two columns comprise the unique ID ('key' and 'channel'), 'week_start' is the date column, and 'views' is the target variable:"""

overview_msg2 = """
## Core Execution Stages

### 1. Preprocessing & Interpolation
Before modeling, the function interpolates target-variable values for missing dates:
* Fill gaps in the `variable` column to prevent model crashes.

### 2. Statistical Baseline Models (Local Execution)
The pipeline first runs four computationally light models sequentially on each group:
* **Percentile & IQR:** Non-parametric bounds detection.
* **SD (Standard Deviation) & MAD (Median Absolute Deviation):** Variance-based detection.

### 3. Parallel Machine Learning Suite (`process_group`)
To maximize performance, the pipeline uses `joblib.Parallel` to run intensive models across all available CPU cores. The `process_group` utility acts as a **router**, sending data to the correct engine based on the model key:
* **FB (Prophet):** Walk-forward time-series forecasting.
* **EWMA:** Exponentially weighted moving averages.
* **ISF (Isolation Forest):** Unsupervised isolation of anomalies.
* **DBSCAN:** Density-based spatial clustering.

### 4. Majority Voting (Ensemble Logic)
The power of this pipeline lies in its **Consensus Model**. After all models finish, the pipeline calculates (see the sketch below):
> **`Anomaly_Votes`**: The sum of flags across all 8-9 methods.
>
> **`is_Anomaly`**: A final boolean set to **True** only if at least **4 models** agree that the point is an outlier.
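To make the voting rule concrete, here is a minimal sketch of majority voting over per-model flags (illustrative only, with hypothetical data; not the pipeline's exact implementation):

```python
import numpy as np
import pandas as pd

# Hypothetical per-model flags: 1.0 = flagged, 0.0 = not flagged, NaN = model produced no flag
flags = pd.DataFrame({
    'is_IQR_anomaly': [1.0, 0.0, 1.0, 0.0],
    'is_MAD_anomaly': [1.0, 0.0, 0.0, 0.0],
    'is_SD_anomaly':  [1.0, 1.0, 1.0, 0.0],
    'is_FB_anomaly':  [0.0, 0.0, 1.0, np.nan],
})

votes = flags.sum(axis=1)               # number of models that flagged each row
n_models = flags.notna().sum(axis=1)    # number of models that voted at all
is_anomaly = votes / n_models >= 0.5    # consensus: at least half agree
print(pd.DataFrame({'Anomaly_Votes': votes, 'is_Anomaly': is_anomaly}))
```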
## Key Output Columns
* **`refresh_date`**: The timestamp of when the pipeline was executed.
* **`Anomaly_Votes`**: Total count of models that flagged the row.
* **`is_Anomaly`**: The final "Gold Standard" anomaly flag.
* **Individual Model Flags**: Columns like `is_FB_anomaly`, `is_IQR_anomaly`, etc., for granular auditing.

## Usage Context
Use `run_pipeline` when you need a **highly reliable, automated output**. By combining statistical, forecasting, and clustering models, the pipeline reduces "false positives" often generated by single-model approaches.

---
### Primary Hyperparameters
| Parameter | Default | Description |
| :--- | :--- | :--- |
| **`eval_period`** | `12` | The number of recent weeks to evaluate for anomalies. |
| **`alpha` / `sigma`** | `0.3` / `1.5` | Sensitivity settings for the EWMA model. |
| **`interval_width`** | `0.90` | The confidence interval for the Prophet (FB) model. |
| **`n_jobs`** | `-1` | Utilizes all available processor cores for parallelization. |

---
## Evaluation Plot
The plot below shows an example of anomalies identified by the process:
"""

percentile_msg = """
# PERCENTILE MODEL
---

The `detect_outliers_percentile` function is a robust anomaly detection tool designed to identify **statistical outliers** in time series or grouped data using a dynamic, **expanding window percentile approach**.

## Functional Overview
The function operates by partitioning the data into an initial training set and a subsequent evaluation period. It establishes **"normal" behavior** based on the 5th and 95th percentiles of the available historical data, flagging any value that falls outside these bounds as an anomaly.

## Core Logic Stages

### 1. Data Preparation and Validation
> **Minimum Threshold:** The function requires at least **10 data points** to run; otherwise, it returns an empty DataFrame to prevent statistically insignificant results.
>
> **Copying:** It creates a copy of the input group to ensure the original data remains unaltered during the calculation process.

### 2. Initial Training Block
* **Static Baseline:** For the first part of the data (everything before the `eval_period`), the function calculates a single static baseline using the 5th and 95th percentiles of the entire training block.
* **Classification:** It applies these fixed bounds to the training rows, labeling them using a helper `classify` function and assigning a boolean `is_Percentile_anomaly` flag.

### 3. Expanding Window Evaluation
* **Sequential Testing:** For each data point in the evaluation period (the last *n* points specified by `eval_period`), the function recalculates the percentiles using **all previously seen data points**.
* **Dynamic Adaptation:** As the loop progresses, the "training set" grows. This allows the model to adapt to gradual shifts in the data distribution, as the thresholds for the current point are informed by every point that came before it.
* **Real-time Simulation:** By calculating the bounds for point $i$ based only on points $0$ to $i-1$, the function simulates how the model would perform in a live environment (see the sketch below).
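As an illustration, here is a minimal sketch of the expanding-window percentile logic described above (simplified; the packaged function adds labeling and grouping):

```python
import numpy as np
import pandas as pd

def expanding_percentile_flags(values, eval_period=12, low_q=5, high_q=95):
    # Flag each of the last `eval_period` points using bounds from all prior data
    values = np.asarray(values, dtype=float)
    rows = []
    for i in range(len(values) - eval_period, len(values)):
        history = values[:i]                               # points 0 .. i-1 only
        low, high = np.percentile(history, [low_q, high_q])
        rows.append({'value': values[i],
                     'Percentile_low': low, 'Percentile_high': high,
                     'is_Percentile_anomaly': not (low <= values[i] <= high)})
    return pd.DataFrame(rows)

rng = np.random.default_rng(0)
series = np.append(rng.normal(100, 5, 40), 160)            # spike at the end
print(expanding_percentile_flags(series, eval_period=5))
```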
## Key Output Columns
The function appends the following columns to the returned DataFrame:
* **`Percentile_low` / `Percentile_high`**: The specific thresholds used to evaluate that row.
* **`Percentile_anomaly`**: A categorical label (likely "High," "Low," or "Normal") generated by the external `classify` function.
* **`is_Percentile_anomaly`**: A boolean flag indicating whether the value was outside the 5%-95% range.

## Usage Context
This function is particularly useful for detecting spikes or drops in metrics where the underlying distribution might **drift slowly over time**. By using percentiles rather than standard deviations, it is more resilient to extreme historical outliers that might otherwise skew a mean-based threshold.

---
## Evaluation Plot
The plot below shows an example of how the Percentile model sets bounds and anomaly regions:
"""

iqr_msg = """
# IQR MODEL (Interquartile Range)
---

The `detect_outliers_iqr` function is a statistical anomaly detection tool that identifies outliers by calculating the **Interquartile Range (IQR)** through a dynamic, **expanding window approach**.

## Functional Overview
The function partitions data into a baseline **"training" set** and an **"evaluation" period**. It identifies **"normal" data** as values falling within:
> $$[Q1 - 1.5 \\times IQR, Q3 + 1.5 \\times IQR]$$

Any data point exceeding these calculated boundaries is flagged as a statistical anomaly.

## Core Logic Stages

### 1. Data Preparation and Validation
* **Minimum Threshold:** To ensure statistical significance, the function requires at least **10 data points**; if the threshold isn't met, it returns an empty DataFrame.
* **Safe Copying:** It operates on a **copy** of the input group to protect the original dataset from unintended modifications.

### 2. Initial Training Block
* **Static Baseline:** For the initial block (all data before the `eval_period`), the function calculates a single set of baseline quartiles ($Q1$ and $Q3$) and the resulting $IQR$.
* **Fixed Boundaries:** The lower bound is set to $max(Q1 - 1.5 \\times IQR, 0)$ and the upper bound to $Q3 + 1.5 \\times IQR$.
* **Batch Classification:** These fixed bounds are applied to all rows in the training set, assigning them a **"TRAIN"** label and a boolean `is_IQR_anomaly` flag.

### 3. Expanding Window Evaluation
* **Incremental Recalculation:** For every point in the evaluation period (the last $n$ points), the function recalculates $Q1$, $Q3$, and $IQR$ using **all previously observed data**.
* **Dynamic Adaptation:** As the loop iterates, the training window **"expands."** This allows the model to adjust its expectations of "normal" as more data becomes available.
* **Live Simulation:** By testing point $i$ against thresholds derived from points $0$ to $i-1$, the function accurately simulates how the outlier detection would behave in a production environment (see the sketch below).
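A minimal sketch of the fence calculation described above (illustrative only):

```python
import numpy as np

def iqr_bounds(history):
    # Tukey fences: [Q1 - 1.5*IQR, Q3 + 1.5*IQR], lower fence clipped at zero
    q1, q3 = np.percentile(history, [25, 75])
    iqr = q3 - q1
    return max(q1 - 1.5 * iqr, 0), q3 + 1.5 * iqr

history = [120, 135, 128, 140, 133, 129, 138, 131, 127, 136]
low, high = iqr_bounds(history)
print(f"IQR_low={low:.1f}, IQR_high={high:.1f}, 300 anomalous: {not (low <= 300 <= high)}")
```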
## Key Output Columns
The function appends several analytical columns to the returned DataFrame:
* **`Q1` / `Q3` / `IQR`**: The specific quartiles and range used for that row's calculation.
* **`IQR_low` / `IQR_high`**: The calculated "fences" (bounds). The lower bound is clipped at zero.
* **`set`**: Categorizes the row as either **"TRAIN"** or **"TEST"**.
* **`IQR_anomaly`**: A descriptive label (e.g., "High," "Low," or "Normal").
* **`is_IQR_anomaly`**: A boolean flag identifying if the value is an outlier.

## Usage Context
The IQR method is a **classic, non-parametric approach** to anomaly detection. It is particularly effective for datasets where you **cannot assume a normal (Gaussian) distribution**.

By using the expanding window, this function is more robust than a simple static boxplot, as it accounts for a growing history of data while remaining less sensitive to extreme outliers than mean-based methods (like Z-Score).

---
## Evaluation Plot
The plot below shows an example of how the IQR model sets bounds and anomaly regions:
"""

fb_msg = """
# Facebook Prophet Walk-Forward Model
---

The `detect_time_series_anomalies_fb_walkforward` function is a sophisticated forecasting tool designed for **iterative anomaly detection**. It utilizes the Facebook Prophet library to perform a **walk-forward validation**, forecasting one data point at a time and expanding the training set as it progresses.

## Functional Overview
Unlike standard batch forecasting, this function operates by simulating a real-world scenario where the model is updated as soon as new data arrives. It establishes a **cutoff date** based on the specified `eval_period`, then iteratively predicts the next point, compares it to the observed value, and incorporates that value back into the training history.

## Core Logic Stages

### 1. Data Preparation and Cutoff
* **Standardization:** The input data is sorted by date and converted to **datetime objects** to ensure proper time-series alignment.
* **Partitioning:** The dataset is split into an **Initial Training Set** (all data before the cutoff) and an **Evaluation Set** (the rolling forecast window).

### 2. Walk-Forward Loop (Sequential Testing)
* **Model Fitting:** For every point in the evaluation set, a new **Prophet model** is initialized with weekly and yearly seasonality enabled.
* **One-Step Forecast:** The model generates a prediction (`yhat`) and an uncertainty interval (`yhat_lower`, `yhat_upper`) specifically for the **next single point**.
* **Dynamic Training Expansion:** After each prediction, the actual observed value is appended to the training data. This ensures the model learns from the most recent information before making the next prediction.
* **Robust Error Handling:** If the Prophet fit fails, the function falls back to a **baseline persistence model** (last observed value) to prevent pipeline failure. A sketch of the loop follows this list.
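A minimal sketch of the walk-forward loop described above, assuming the `prophet` package (simplified; the packaged function adds error handling and the persistence fallback):

```python
import pandas as pd
from prophet import Prophet

def walkforward_flags(df, eval_period=12, interval_width=0.95):
    # df has columns 'ds' (datetime) and 'y'; flags the last `eval_period` points
    df = df.sort_values('ds').reset_index(drop=True)
    train = df.iloc[:len(df) - eval_period].copy()
    results = []
    for i in range(len(df) - eval_period, len(df)):
        model = Prophet(weekly_seasonality=True, yearly_seasonality=True,
                        daily_seasonality=False, interval_width=interval_width)
        model.fit(train)
        fc = model.predict(df.iloc[[i]][['ds']]).iloc[0]   # one-step forecast
        y = df.at[i, 'y']
        results.append({'ds': df.at[i, 'ds'], 'y': y, 'FB_forecast': fc['yhat'],
                        'FB_low': fc['yhat_lower'], 'FB_high': fc['yhat_upper'],
                        'FB_residual': y - fc['yhat'],
                        'is_FB_anomaly': not (fc['yhat_lower'] <= y <= fc['yhat_upper'])})
        train = pd.concat([train, df.iloc[[i]]], ignore_index=True)  # expand history
    return pd.DataFrame(results)
```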
### 3. Anomaly Classification
* **Uncertainty Bounds:** Anomalies are defined by the `interval_width` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
* **Residual Calculation:** The function computes the **FB_residual** (Actual - Forecast) to quantify the magnitude of deviations.

## Key Output Columns
The function appends the following columns to the returned DataFrame:
* **`FB_forecast`**: The point estimate predicted by Prophet for that date.
* **`FB_low` / `FB_high`**: The dynamic boundaries based on the specified uncertainty interval.
* **`FB_residual`**: The difference between the actual observed metric and the forecast.
* **`FB_anomaly`**: A categorical label designating the deviation as **"high"** or **"low"**.
* **`is_FB_anomaly`**: A boolean flag identifying outliers in the evaluation region.

## Usage Context
This approach is highly effective for metrics with **strong seasonality and complex trends**. Because it uses a walk-forward loop, it is significantly more accurate than a static forecast for long evaluation periods, as it corrects itself based on the most recent trends. It is ideal for detecting "sudden" shifts that standard statistical models (like Z-Score) might miss.

---
### Evaluation Strategy
This function strictly ignores the training region for anomaly reporting, ensuring that all reported anomalies are based on "out-of-sample" performance where the model had no prior knowledge of the specific data point being tested.

---
## Evaluation Plot
The plot below shows an example of how the FB Prophet model sets bounds and anomaly regions:
"""

dbscan_msg = """
# DBSCAN Walk-Forward Anomaly Detection
---

The `detect_time_series_anomalies_dbscan` function implements a **density-based clustering** approach for time-series anomaly detection. It utilizes an **iterative walk-forward validation** strategy to identify data points that exist in "low-density" regions of the feature space.

## Functional Overview
This function transforms a univariate time series into a high-dimensional feature space using **dynamic lags** and **rolling statistics**. It then applies the **DBSCAN** (Density-Based Spatial Clustering of Applications with Noise) algorithm to distinguish between dense clusters of "normal" behavior and sparse "noise" points (anomalies).

## Core Logic & Helper Utilities

### 1. Dynamic Feature Engineering (`get_dynamic_lags`)
Instead of using fixed lags, the function uses the **Autocorrelation Function (ACF)** to find the 10 most significant seasonal patterns in the data.
* **Baseline:** Always includes lags 1, 2, and 3 to capture immediate momentum.
* **Significance:** Uses a 75% confidence interval ($\\alpha=0.25$) to identify meaningful historical dependencies.

### 2. Automated Parameter Tuning (`find_optimal_epsilon`)
DBSCAN is highly sensitive to the **Epsilon ($\\epsilon$)** parameter (the neighborhood radius).
* **Proxy Elbow Method:** The function automatically calculates $\\epsilon$ by analyzing the distance to the $k$-th nearest neighbor for all training points.
* **Density Threshold:** It sets $\\epsilon$ at the **95th percentile** of these distances, ensuring that 95% of training data is considered "dense" while the most isolated 5% are candidates for noise. A sketch of this tuning step follows this list.
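A minimal sketch of the epsilon proxy and the nearest-neighbor novelty check described above, using scikit-learn (illustrative only, with synthetic stand-in features):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 5))     # stand-in for the scaled training feature matrix
k = 5

# Epsilon proxy: 95th percentile of each training point's k-th neighbor distance
nn = NearestNeighbors(n_neighbors=k).fit(X_train)
distances, _ = nn.kneighbors(X_train)
eps = np.percentile(distances[:, -1], 95)

# Novelty check: a new point is flagged if its k-th training neighbor lies beyond eps
x_new = rng.normal(size=(1, 5)) * 4     # deliberately far from the training cloud
dist_new, _ = nn.kneighbors(x_new)
score = dist_new[0, -1] - eps           # positive => low-density region => anomaly
print(f"eps={eps:.3f}, dbscan_score={score:.3f}, is_anomaly={score > 0}")
```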
### 3. Walk-Forward Iteration
For each period in the `eval_period`:
* **Feature Construction:** Builds a matrix containing the variable, its dynamic lags, rolling means, rolling standard deviations, and a linear trend component.
* **Scaling:** Fits a `StandardScaler` **only on training data** to prevent data leakage.
* **Novelty Detection:** Since DBSCAN cannot "predict" on new points, the function uses a **Nearest Neighbors proxy**. If the distance from a new test point to its $k$-th neighbor in the training set is greater than the trained $\\epsilon$, it is flagged as an anomaly.

## Key Output Columns
* **`dbscan_score`**: The distance from the point to the $\\epsilon$ boundary (positive values indicate anomalies).
* **`is_DBSCAN_anomaly`**: A boolean flag identifying outliers.
* **Generated Features**: Includes all dynamic lags (`lagX`) and rolling statistics (`roll_mean_W`) used during the fit.

## Usage Context
DBSCAN is exceptionally powerful for detecting **contextual anomalies**: points that might look "normal" in value but are "weird" given their recent history or seasonal context. Because it is density-based, it can find anomalies in non-linear or multi-modal distributions where simple percentile or Z-score methods would fail.

---
### Performance Note
This model is computationally more intensive than statistical methods due to the iterative re-fitting of the `NearestNeighbors` and `DBSCAN` models. It is best suited for high-priority metrics where accuracy is more critical than processing speed.

---
## Evaluation Plot
The plot below shows an example of how the DBSCAN model flags anomalies:
"""

ewma_msg = """
# EWMA Rolling Anomaly Detection
---

The `ewma_with_anomalies_rolling_group` function implements a **statistically weighted** approach to identifying outliers. It uses an **Expanding Window** (Walk-Forward) strategy to adapt to recent trends while maintaining a memory of historical data.

## Functional Overview
This function calculates the **Exponentially Weighted Moving Average (EWMA)**, which assigns higher importance to recent observations. By combining this forecast with a dynamic standard deviation "envelope," the function identifies points that deviate significantly from the expected trend.

## Core Logic Components

### 1. Forecast Engine (`ewma_forecast`)
* **Weighting Mechanism:** Uses an `alpha` parameter (between 0 and 1) to determine the "decay" of information. A **higher alpha** makes the model more sensitive to recent changes.
* **Calculation:** Employs the formula:
$$EWMA_t = \\alpha \\cdot Y_t + (1 - \\alpha) \\cdot EWMA_{t-1}$$
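A minimal sketch of this recursion and the resulting sigma envelope (illustrative only):

```python
import numpy as np

def ewma_forecast(values, alpha=0.3):
    # Recursive EWMA: each step blends the new observation with the running average
    ewma = values[0]
    for y in values[1:]:
        ewma = alpha * y + (1 - alpha) * ewma
    return ewma

history = np.array([100.0, 104.0, 98.0, 102.0, 130.0, 101.0])
forecast = ewma_forecast(history)
envelope = 1.5 * history.std()          # sigma * standard deviation
low, high = max(forecast - envelope, 0), forecast + envelope
print(f"EWMA_forecast={forecast:.1f}, EWMA_low={low:.1f}, EWMA_high={high:.1f}")
```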
### 2. The Rolling Anomaly Loop
The function partitions data into **TRAIN** and **TEST** sets and iterates through the evaluation period:
* **Expanding Training Set:** For every evaluation point, the function uses all preceding data to re-calculate the baseline.
* **Dynamic Thresholding:**
    * **Upper Limit:** `Forecast + (Sigma * Standard Deviation)`
    * **Lower Limit:** `max(Forecast - (Sigma * Standard Deviation), 0)`
* **Iterative Evaluation:** It forecasts exactly **one point ahead**, checks for an anomaly, and then moves that point into the training set for the next iteration.

## Key Output Columns
The function returns a concatenated DataFrame containing:
* **`EWMA_forecast`**: The predicted value for that timestamp.
* **`STD`**: The standard deviation used to calculate the threshold.
* **`EWMA_high` / `EWMA_low`**: The dynamic boundaries (the "envelope") for the test period.
* **`set`**: Labels data as either **"TRAIN"** (historical baseline) or **"TEST"** (anomaly detection window).
* **`is_EWMA_anomaly`**: A boolean flag indicating if the actual value fell outside the limits.

## Usage Context
EWMA is ideal for **streaming-style data** or metrics that exhibit **level shifts**. Because it weights recent data more heavily than a simple moving average, it is faster to adapt to new "normals" while still filtering out minor noise.

---
### Parameter Tuning
* **`alpha`**: Adjust this to control how quickly the model "forgets" old data (typical range: `0.1 - 0.3`).
* **`sigma`**: Adjust this to control sensitivity. A **lower sigma** results in more anomalies, while a **higher sigma** (e.g., `3.0`) only flags extreme outliers.

---
## Evaluation Plot
The plot below shows an example of how the EWMA model sets bounds and anomaly regions:
"""

isofor_msg = """
# Isolation Forest Time-Series Anomaly Detection
---

The `detect_time_series_anomalies_isoforest` function implements an **unsupervised machine learning** approach to outlier detection. Unlike traditional statistical models that define "normal" regions, this model explicitly identifies anomalies by **isolating** them in a high-dimensional feature space.

## Functional Overview
This function utilizes a **walk-forward validation** strategy. For every evaluation point, it dynamically engineers a unique feature set, fits a forest of decision trees, and determines if the current observation is an outlier based on how easily it can be isolated from historical data.

## Core Logic & Helper Utilities

### 1. Dynamic Feature Engineering (`get_dynamic_lags`)
To capture the temporal structure of the data, the model doesn't just look at the raw value; it looks at the **context**.
* **Autocorrelation (ACF):** The function calculates the **10 most significant lags** based on the data's historical patterns.
* **Momentum:** It always includes lags 1, 2, and 3 to ensure immediate short-term trends are captured.
* **Rolling Statistics:** It automatically calculates **rolling means** and **standard deviations** at multiple scales (quarter-lag, half-lag, and full-lag intervals).

### 2. Isolation Forest Model Configuration
The model builds **200 trees** (`n_estimators`) to ensure a stable anomaly score.
* **Contamination:** A baseline assumption that **1%** of the data is inherently noisy.
* **Decision Function:** The model calculates an anomaly score where lower, more negative values indicate a higher likelihood of being an outlier.

### 3. Dual-Threshold Validation
To reduce "false positives," the function uses two layers of verification:
1. **Contamination Anomaly:** The standard output from the sklearn model based on the 1% threshold.
2. **Statistical Threshold:** A custom "safety" bound calculated as:
> $$Mean(Positive Scores) - 3 \\times Std(Positive Scores)$$

**Result:** A point is only flagged as `True` if **both** the ML model and the statistical threshold agree it is an anomaly (see the sketch below).
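A minimal sketch of this dual-threshold check with scikit-learn (illustrative only, with synthetic stand-in features):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X_train = rng.normal(size=(300, 4))      # stand-in for the engineered lag/rolling features

model = IsolationForest(n_estimators=200, contamination=0.01, random_state=0).fit(X_train)

# Layer 1: the contamination-based flag from sklearn (-1 means anomaly)
x_new = np.array([[6.0, 6.0, 6.0, 6.0]])
contamination_flag = model.predict(x_new)[0] == -1

# Layer 2: statistical bound derived from the positive training scores
train_scores = model.decision_function(X_train)
positive = train_scores[train_scores > 0]
stat_threshold = positive.mean() - 3 * positive.std()
score = model.decision_function(x_new)[0]

is_anomaly = contamination_flag and (score < stat_threshold)  # both layers must agree
print(f"score={score:.3f}, threshold={stat_threshold:.3f}, is_anomaly={is_anomaly}")
```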
## Key Output Columns
* **`IsolationForest_timeseries_score`**: The decision score (anomaly score).
* **`is_IsolationForest_timeseries_anomaly`**: The final boolean flag for anomalies.
* **Engineered Features**: All `lagX`, `roll_meanX`, and `roll_stdX` columns created during the process.

## Usage Context
Isolation Forest is exceptionally powerful for **multi-dimensional anomalies**. Because it considers lags, rolling stats, and trend simultaneously, it can detect "subtle" anomalies where the value might look normal, but the **relationship** between the value and its recent history is broken.

---
### Implementation Strategy
The function handles the "test" points one-by-one in a loop. After each prediction, the training set expands to include the latest observed value, ensuring the forest is always aware of the most recent data trends before predicting the next point.

---
## Evaluation Plot
The plot below shows an example of how the Isolation Forest model flags anomalies:
"""

mad_msg = """
# MAD Anomaly Detection Model
---

**Median Absolute Deviation with Expanding Window**

The `detect_outliers_mad` function is a non-parametric outlier detection tool. Unlike methods based on the standard deviation, this model uses the Median and MAD, making it significantly more robust against data that contains extreme outliers or non-normal distributions.

## Functional Overview
The function identifies anomalies by calculating how far a data point deviates from the median. It utilizes an expanding window approach to ensure that as the dataset grows, the definition of "normal" behavior adapts dynamically to the historical context.

## Core Logic Stages

### 1. Preprocessing & Validation
* **Sample Size Check:** Requires a minimum of 10 data points. If the group is too small, it returns an empty DataFrame to avoid biased statistical results.
* **Deep Copy:** Operates on a `group.copy()` to ensure the original input data remains untouched.

### 2. Initial Training Block
* **Baseline Calculation:** For the first part of the series (pre-evaluation period), it establishes a static baseline.
* **The MAD Formula:** It calculates the Median Absolute Deviation: $MAD = median(|x_i - median(x)|)$.
* **Thresholding:** It uses a `mad_scale_factor` (default 0.6745) to make the MAD comparable to a standard deviation for a normal distribution.
* **Bounds:**
    * `MAD_high`: $Median + (Threshold \\times \\frac{MAD}{Scale})$
    * `MAD_low`: $max(Median - (Threshold \\times \\frac{MAD}{Scale}), 0)$

### 3. Expanding Window Evaluation
* **Incremental Testing:** For each point in the evaluation period, the function recalculates the Median and MAD using all data available up to that point (see the sketch below).
* **Real-time Simulation:** This simulates a "production" environment where each new weekly point is tested against the entirety of its known history.
* **Zero-Variance Handling:** If MAD is 0 (all historical values are identical), the bounds collapse to the median value to avoid division errors.
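A minimal sketch of the MAD bound calculation described above (illustrative only):

```python
import numpy as np

def mad_bounds(history, threshold=2, scale=0.6745):
    # Median +/- threshold * MAD/scale, with the lower bound clipped at zero
    med = np.median(history)
    mad = np.median(np.abs(np.asarray(history) - med))
    if mad == 0:                        # zero-variance guard: bounds collapse to the median
        return med, med
    half_width = threshold * mad / scale
    return max(med - half_width, 0), med + half_width

history = [120, 118, 122, 119, 121, 500, 120, 118, 121, 119]
low, high = mad_bounds(history)
print(f"MAD_low={low:.1f}, MAD_high={high:.1f}")  # the 500 spike barely moves the bounds
```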
## Key Output Columns
* **`MAD_low` / `MAD_high`**: The bounds used to evaluate that row.
* **`MAD_anomaly`**: A categorical label (e.g., "High," "Low," or "Normal").
* **`is_MAD_anomaly`**: A boolean flag identifying if the value is an outlier.

## Usage Context
The MAD model is the "gold standard" for univariate outlier detection in robust statistics. It is highly recommended for:
- Data with large, extreme spikes that would skew a Mean-based (SD) model.
- Datasets that are not normally distributed.
- Scenarios where you need a conservative, reliable boundary that isn't easily shifted by a single bad data point.

---
## Evaluation Plot
The plot below shows an example of how the MAD model sets bounds and anomaly regions:
"""

sd_msg = """
# Standard-Deviation-Based Outlier Detection (Expanding Window)

## **Function:** `detect_outliers_sd`

This function detects **anomalies in a time series** using a **mean ± 2 standard deviation (SD)** rule, applied in a **train-test, expanding-window framework**.

---

## What the Function Does

### 1. Minimum Data Requirement
- Requires **at least 10 observations**
- Returns an empty DataFrame if insufficient data is provided

---

## Training Phase
*(Initial fixed window)*

- Uses all observations **prior to the evaluation period**
- Computes:
  - **Mean**
  - **Standard Deviation**
  - **Lower bound:** `max(mean - 2 * SD, 0)`
  - **Upper bound:** `mean + 2 * SD`
- Flags anomalies where values fall **outside the 2-SD range**
- Labels rows as **TRAIN**

---

## Evaluation Phase
*(Expanding window)*

For each step in the evaluation period:
- Expands the training window to include all prior observations
- Recomputes **mean and SD dynamically**
- Recalculates anomaly bounds
- Tests the current observation against updated bounds
- Labels rows as **TEST**

A sketch of the bound calculation follows.
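A minimal sketch of the 2-SD bound calculation described above (illustrative only):

```python
import numpy as np

def sd_bounds(history):
    # Mean +/- 2*SD, with the lower bound clipped at zero
    mean, sd = np.mean(history), np.std(history)
    return max(mean - 2 * sd, 0), mean + 2 * sd

history = [150, 148, 155, 152, 149, 151, 147, 153, 150, 154]
low, high = sd_bounds(history)
print(f"SD2_low={low:.1f}, SD2_high={high:.1f}, 200 anomalous: {not (low <= 200 <= high)}")
```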
---

## Anomaly Classification

Each observation receives:
- **`SD_anomaly`**: categorical label via `classify()`
- **`is_SD_anomaly`**: boolean flag
  - `True` if outside ±2 SD
  - `False` otherwise

---

## Output Columns Added

- **Mean**
- **SD**
- **SD2_low**
- **SD2_high**
- **set** (`TRAIN` or `TEST`)
- **SD_anomaly**
- **is_SD_anomaly**

---
## Evaluation Plot
The plot below shows an example of how the STD model sets bounds and anomaly regions:
"""