anomaly-pipeline 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +2 -0
- anomaly_pipeline/helpers/DB_scan.py +188 -0
- anomaly_pipeline/helpers/IQR.py +71 -0
- anomaly_pipeline/helpers/MAD.py +88 -0
- anomaly_pipeline/helpers/Preprocessing.py +116 -0
- anomaly_pipeline/helpers/STD.py +70 -0
- anomaly_pipeline/helpers/__init__.py +1 -0
- anomaly_pipeline/helpers/baseline.py +112 -0
- anomaly_pipeline/helpers/cluster_functions.py +289 -0
- anomaly_pipeline/helpers/evaluation_info.py +121 -0
- anomaly_pipeline/helpers/evaluation_plots.py +546 -0
- anomaly_pipeline/helpers/ewma.py +119 -0
- anomaly_pipeline/helpers/fb_prophet.py +94 -0
- anomaly_pipeline/helpers/help_info.py +683 -0
- anomaly_pipeline/helpers/iso_forest_general.py +50 -0
- anomaly_pipeline/helpers/iso_forest_timeseries.py +123 -0
- anomaly_pipeline/helpers/percentile.py +65 -0
- anomaly_pipeline/main.py +63 -0
- anomaly_pipeline/pipeline.py +253 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +15 -0
- anomaly_pipeline-0.1.27.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/WHEEL +5 -0
- anomaly_pipeline-0.1.27.dist-info/entry_points.txt +2 -0
- anomaly_pipeline-0.1.27.dist-info/top_level.txt +1 -0
anomaly_pipeline/helpers/iso_forest_general.py
ADDED

@@ -0,0 +1,50 @@

import pandas as pd
from sklearn.ensemble import IsolationForest

def detect_outliers_isf_general(group, variable, contamination=0.03, random_state=42, eval_period=12):
    n = len(group)
    if n < 10:
        return pd.DataFrame(columns=group.columns)

    group = group.copy()
    train_size = n - eval_period

    # Initialize columns
    group['set'] = ""
    group['IsolationForest_score_general'] = 0.0
    group['is_IsolationForest_anomaly_general'] = False

    # --- 1. HANDLE TRAINING DATA (Initial Block) ---
    # Baseline ISF using all data available before eval_period
    initial_train = group[[variable]].iloc[:train_size]

    iso = IsolationForest(contamination=contamination, random_state=random_state)

    # Fit on the initial block; decision_function gives the raw anomaly score,
    # predict() == -1 flags the anomalies
    iso.fit(initial_train)
    group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.decision_function(initial_train)
    group.loc[group.index[:train_size], 'is_IsolationForest_anomaly_general'] = iso.predict(initial_train) == -1
    group.loc[group.index[:train_size], 'set'] = "TRAIN"

    # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
    # Iterate through the eval period, growing the training set one point at a time
    for i in range(train_size, n):
        # Data available up to this point (expanding window)
        current_train = group[[variable]].iloc[:i]

        # Re-fit the model on all data known up to point i
        iso_expanding = IsolationForest(contamination=contamination, random_state=random_state)
        iso_expanding.fit(current_train)

        # Test the current point i
        current_point = group[[variable]].iloc[[i]]

        group.iloc[i, group.columns.get_loc('IsolationForest_score_general')] = iso_expanding.decision_function(current_point)[0]
        group.iloc[i, group.columns.get_loc('is_IsolationForest_anomaly_general')] = iso_expanding.predict(current_point)[0] == -1
        group.iloc[i, group.columns.get_loc('set')] = "TEST"

    # Cast boolean column properly
    group['is_IsolationForest_anomaly_general'] = group['is_IsolationForest_anomaly_general'].astype(bool)

    return group
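A minimal usage sketch for this helper. The toy dataframe, the store_id/week_start/units column names, and the injected spike are illustrative assumptions, not part of the package; only the function signature comes from the file above.

import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.iso_forest_general import detect_outliers_isf_general

# Hypothetical weekly series for two stores; 'units' is the variable being scored
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "store_id": ["A"] * 40 + ["B"] * 40,
    "week_start": list(pd.date_range("2024-01-01", periods=40, freq="W-MON")) * 2,
    "units": rng.normal(100, 5, 80),
})
df.loc[70, "units"] = 250  # inject an obvious spike into store B

# Apply the detector per group, mirroring how the pipeline calls it group by group
scored = pd.concat(
    detect_outliers_isf_general(g, "units", contamination=0.03, random_state=42, eval_period=12)
    for _, g in df.groupby("store_id")
)
print(scored[["store_id", "week_start", "units",
              "IsolationForest_score_general", "is_IsolationForest_anomaly_general", "set"]].tail())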
anomaly_pipeline/helpers/iso_forest_timeseries.py
ADDED

@@ -0,0 +1,123 @@

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from statsmodels.tsa.stattools import acf

def get_dynamic_lags(series: pd.Series) -> list:

    n = len(series)

    # Determine max lags (the smaller of 50% of the data and a hard cap of 60)
    nlags = min(int(n * 0.5), 60)

    if nlags < 5:
        return [1, 2, 3]

    # Calculate ACF and confidence intervals, then keep the 10 most significant lags
    autocorrelations, confint = acf(series.dropna(), nlags=nlags, alpha=0.25, fft=True)
    autocorr_values = autocorrelations[1:]
    conf_limit = confint[1:, 1] - autocorr_values
    is_significant = np.abs(autocorr_values) > conf_limit
    significant_autocorr = autocorr_values[is_significant]
    significant_lags_indices = np.where(is_significant)[0] + 1
    ranked_indices = np.argsort(np.abs(significant_autocorr))[::-1]
    top_lags_indices = ranked_indices[:10]
    top_lags = significant_lags_indices[top_lags_indices].tolist()
    base_lags = [1, 2, 3]
    dynamic_lags = sorted(list(set(base_lags + top_lags)))[:10]

    return dynamic_lags

def detect_time_series_anomalies_isoforest(
    group,
    variable,
    date_column,
    eval_period,
):

    group[date_column] = pd.to_datetime(group[date_column])
    group = group.copy().sort_values(date_column).reset_index(drop=True)

    '''
    Iterate over each of the evaluation periods, fitting the model to all the data before the evaluation period
    and then getting the predicted anomaly score for the given evaluation period
    '''
    try:
        test_anom = []

        for t in list(range(eval_period - 1, -1, -1)):

            try:

                # Boundary between the rolling train and rolling forecast regions
                cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)

                # Get train set to determine lags
                model_group = group.copy()
                train = model_group[model_group[date_column] <= cutoff_date].copy()
                lags = get_dynamic_lags(train[variable])

                # Create lag features on the entire model_group DF
                for lag in lags:
                    model_group[f'lag{lag}'] = model_group[variable].shift(lag)

                # Get rolling stats features for the entire model_group DF
                rolling_stats_features = []
                for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
                    if w >= 3:
                        rolling_stats_features.append('roll_mean' + str(w))
                        rolling_stats_features.append('roll_std' + str(w))
                        model_group['roll_mean' + str(w)] = model_group[variable].shift(1).rolling(w).mean()
                        model_group['roll_std' + str(w)] = model_group[variable].shift(1).rolling(w).std()

                # Get trend feature
                model_group['trend'] = group.index

                # Drop records with NAs
                model_group = model_group.copy().dropna()

                # Split into train and test (train and test now both have all the features)
                train = model_group[model_group[date_column] <= cutoff_date].copy()
                test = model_group[model_group[date_column] == cutoff_date].copy()

                # Identify all model features (lags, rolling stats, trend, and the variable itself)
                features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]

                # Create and fit the model
                iso_forest_model = IsolationForest(
                    n_estimators=200,
                    contamination=0.01,
                    random_state=42
                )
                iso_forest_model.fit(train[features])

                train['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train[features])
                anomaly_threshold = min(
                    0,
                    train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].mean()
                    - 3 * train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].std()
                )
                test['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(test[features])
                test['contamination_anomaly'] = iso_forest_model.predict(test[features])  # -1 = anomaly, 1 = normal
                test['anomaly_threshold'] = anomaly_threshold
                test['threshold_anomaly'] = np.where(test['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1)

                test['is_IsolationForest_anomaly_timeseries'] = np.where((test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1), True, False)
                test = test[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']]
                test_anom.append(test)
            except Exception:
                pass
        try:
            test_anom = pd.concat(test_anom)
            group = group.merge(test_anom[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']], on=[variable, date_column], how='left')
        except Exception:
            print("Error in Isolation Forest process")
            group["IsolationForest_score_timeseries"] = np.nan
            group["is_IsolationForest_anomaly_timeseries"] = np.nan

    except Exception:
        group["IsolationForest_score_timeseries"] = np.nan
        group["is_IsolationForest_anomaly_timeseries"] = np.nan
        # Get string or object dtype columns from group that identify the group
        group_id = group.select_dtypes(include=['object', 'string']).columns.tolist()
        group_id = " ".join(group[group_id].reset_index(drop=True).iloc[0].to_list())
        print(f'Isolation Forest Anomaly Detection failed for {group_id}')

    return group
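A small sketch of how get_dynamic_lags behaves on a synthetic weekly series with a 4-period cycle. The series itself is an assumption made up for illustration; the function and its behaviour come from the file above.

import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.iso_forest_timeseries import get_dynamic_lags

# Synthetic series: a clear cycle every 4 points plus noise, so lag 4 (and its
# multiples) should rank among the most significant autocorrelations
rng = np.random.default_rng(1)
t = np.arange(104)
series = pd.Series(10 + 3 * np.sin(2 * np.pi * t / 4) + rng.normal(0, 0.5, len(t)))

lags = get_dynamic_lags(series)
print(lags)  # always contains the base lags [1, 2, 3]; at most 10 lags in total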
anomaly_pipeline/helpers/percentile.py
ADDED

@@ -0,0 +1,65 @@

import pandas as pd
import numpy as np

from .Preprocessing import classify


# Anomaly category columns (optional, keep if you still want string labels)


def detect_outliers_percentile(group, variable, date_column, eval_period):
    n = len(group)
    if n < 10:
        # Optional: log specific keys if they exist in your scope
        return pd.DataFrame(columns=group.columns)

    group = group.copy()
    # Explicitly ensure date_column is datetime right at the start
    group[date_column] = pd.to_datetime(group[date_column])
    train_size = n - eval_period

    # --- 1. HANDLE TRAINING DATA (Initial Block) ---
    # Calculate baseline percentile bounds using all data available before eval_period
    initial_train = group[variable].iloc[:train_size]

    low = initial_train.quantile(0.05)
    high = initial_train.quantile(0.95)

    # Assign initial bounds to the training rows
    group.loc[group.index[:train_size], 'set'] = "TRAIN"
    group.loc[group.index[:train_size], 'Percentile_low'] = low
    group.loc[group.index[:train_size], 'Percentile_high'] = high
    group.loc[group.index[:train_size], 'Percentile_anomaly'] = group[variable].iloc[:train_size].apply(
        lambda x: classify(x, low, high)
    )
    group.loc[group.index[:train_size], 'is_Percentile_anomaly'] = (
        (group[variable].iloc[:train_size] < low) |
        (group[variable].iloc[:train_size] > high)
    )

    # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
    # Iterate through the eval period, growing the training set one point at a time
    for i in range(train_size, n):
        # Data available up to this point (expanding)
        current_train = group[variable].iloc[:i]

        LOW = current_train.quantile(0.05)
        HIGH = current_train.quantile(0.95)

        # Test the current point i
        current_val = group[variable].iloc[i]
        group.iloc[i, group.columns.get_loc('set')] = "TEST"
        group.iloc[i, group.columns.get_loc('Percentile_low')] = LOW
        group.iloc[i, group.columns.get_loc('Percentile_high')] = HIGH
        group.iloc[i, group.columns.get_loc('Percentile_anomaly')] = classify(current_val, LOW, HIGH)
        group.iloc[i, group.columns.get_loc('is_Percentile_anomaly')] = (current_val < LOW) or (current_val > HIGH)

    # Cast boolean column properly
    group['is_Percentile_anomaly'] = group['is_Percentile_anomaly'].astype(bool)
    # FINAL SAFETY CHECK
    group[date_column] = pd.to_datetime(group[date_column])

    return group
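The expanding-window bounds computed in the evaluation loop above are the same as pandas' expanding quantiles shifted by one step; a short sketch of that equivalence, assuming a made-up toy series:

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
s = pd.Series(rng.normal(100, 5, 52))

# For evaluation point i, the helper uses quantiles of s.iloc[:i];
# shift(1) on an expanding quantile reproduces exactly those bounds
low = s.expanding().quantile(0.05).shift(1)
high = s.expanding().quantile(0.95).shift(1)

i = 45
assert np.isclose(low.iloc[i], s.iloc[:i].quantile(0.05))
assert np.isclose(high.iloc[i], s.iloc[:i].quantile(0.95))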
anomaly_pipeline/main.py
ADDED
@@ -0,0 +1,63 @@

from .pipeline import run_pipeline

def timeseries_anomaly_detection(master_data, group_columns, variable,
                                 date_column="week_start", freq="W-MON",
                                 max_records=104, min_records=15,
                                 contamination=0.03, random_state=42,
                                 alpha=0.3, sigma=1.5, eval_period=12,
                                 interval_width=0.90, mad_threshold=2, mad_scale_factor=0.6745):

    """
    Performs anomaly detection on grouped time-series data.

    This function identifies outliers within specific groups of data by analyzing
    historical trends, applying statistical thresholds, and calculating
    prediction intervals.

    Args:
        master_data (pd.DataFrame): The input dataset containing the time series.
        group_columns (list[str]): Columns used to partition the data (e.g., ['store_id', 'item_id']).
        variable (str): The target numerical column to analyze for anomalies.
        date_column (str): The column containing datetime information. Defaults to 'week_start'.
        freq (str): Frequency of the time series (pandas offset alias). Defaults to 'W-MON'.
        max_records (int): Maximum historical records to consider for the model. Defaults to 104.
        min_records (int): Minimum records required to perform detection. Defaults to 15.
        contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
        random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
        alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
        sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
        eval_period (int): Number of recent periods to evaluate for anomalies. Defaults to 12.
        interval_width (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
        mad_threshold (float): Cutoff passed to the MAD-based detector. Defaults to 2.
        mad_scale_factor (float): Scale factor passed to the MAD-based detector. Defaults to 0.6745.

    Returns:
        pd.DataFrame: The original dataframe appended with anomaly flags and scores.
    """

    return run_pipeline(
        master_data=master_data,
        group_columns=group_columns,
        variable=variable,
        date_column=date_column,
        freq=freq,
        max_records=max_records,
        min_records=min_records,
        contamination=contamination,
        random_state=random_state,
        alpha=alpha,
        sigma=sigma,
        eval_period=eval_period,
        interval_width=interval_width,
        mad_threshold=mad_threshold,
        mad_scale_factor=mad_scale_factor
    )

print("Anomaly pipeline successfully invoked via python -m!")

# change test_weeks to eval_period: automate min_records based on eval_period,
# max_records = max_records + eval_records
# freq_daily: max_records based on frequency (for version 2) 104 for weekly
# split all the 5 functions and parametrize all the variables
# change interval_width name to prophet_CI
# change FB_anomaly column to high, low, and none instead of -1, 1, 0
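A hedged end-to-end usage sketch for this public entry point. The dataframe, the store_id/item_id/week_start/units column names, and the injected values are illustrative assumptions; only the function name and its parameters come from the file above, and running it requires the package's dependencies (prophet, scikit-learn, statsmodels) to be installed.

import numpy as np
import pandas as pd
from anomaly_pipeline.main import timeseries_anomaly_detection

# Hypothetical two years of weekly sales for two store/item combinations
rng = np.random.default_rng(4)
weeks = pd.date_range("2023-01-02", periods=104, freq="W-MON")
frames = []
for store, item in [("S1", "I1"), ("S1", "I2")]:
    frames.append(pd.DataFrame({
        "store_id": store,
        "item_id": item,
        "week_start": weeks,
        "units": rng.normal(100, 8, len(weeks)),
    }))
master_data = pd.concat(frames, ignore_index=True)

result = timeseries_anomaly_detection(
    master_data=master_data,
    group_columns=["store_id", "item_id"],
    variable="units",
    date_column="week_start",
    freq="W-MON",
    eval_period=12,
)
print(result[["store_id", "item_id", "week_start", "units", "Anomaly_Votes", "is_Anomaly"]].tail())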
anomaly_pipeline/pipeline.py
ADDED

@@ -0,0 +1,253 @@

import pandas as pd
import numpy as np
from datetime import date
from joblib import Parallel, delayed
from .helpers.percentile import detect_outliers_percentile
from .helpers.STD import detect_outliers_sd
from .helpers.MAD import detect_outliers_mad
from .helpers.IQR import detect_outliers_iqr
from .helpers.iso_forest_general import detect_outliers_isf_general
from .helpers.ewma import ewma_with_anomalies_rolling_group
from .helpers.fb_prophet import detect_time_series_anomalies_fb_walkforward
from .helpers.iso_forest_timeseries import detect_time_series_anomalies_isoforest
from .helpers.DB_scan import detect_time_series_anomalies_dbscan
from .helpers.Preprocessing import create_full_calendar_and_interpolate, print_anomaly_stats

def process_group(model, name, group, group_columns, variable,
                  date_column, alpha, sigma, eval_period, interval_width, contamination, random_state):

    if model == "ISF_general":
        return detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)

    if model == "EWMA":
        return ewma_with_anomalies_rolling_group(
            group, group_columns, variable, date_column, alpha, sigma, eval_period
        )

    if model == "FB":
        return detect_time_series_anomalies_fb_walkforward(
            group, variable, date_column, eval_period, interval_width
        )

    if model == 'ISF_timeseries':
        return detect_time_series_anomalies_isoforest(
            group, variable, date_column, eval_period
        )

    if model == 'DBSCAN':
        return detect_time_series_anomalies_dbscan(
            group, variable, date_column, eval_period
        )


def run_pipeline(master_data, group_columns, variable,
                 date_column, freq,
                 max_records, min_records,
                 contamination, random_state,
                 alpha, sigma, eval_period,
                 interval_width, mad_threshold, mad_scale_factor):

    # preprocess calendar
    final_data = create_full_calendar_and_interpolate(
        master_data,
        group_columns,
        variable,
        date_column,
        freq
    )

    groups = list(final_data.groupby(group_columns))

    # The model-based detectors below run in parallel (use all cores: n_jobs=-1)

    ## Percentile / SD / MAD / IQR
    results_percentile = []
    results_SD = []
    results_IQR = []
    results_MAD = []
    for name, group in groups:
        # percentile
        res_percentile = detect_outliers_percentile(group, variable, date_column, eval_period)
        results_percentile.append(res_percentile)

        # SD
        res_SD = detect_outliers_sd(group, variable, date_column, eval_period)
        results_SD.append(res_SD)

        # MAD
        res_MAD = detect_outliers_mad(group, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
        results_MAD.append(res_MAD)

        # IQR
        res_IQR = detect_outliers_iqr(group, variable, date_column, eval_period)
        results_IQR.append(res_IQR)

    anomaly_key_channel_percentile = pd.concat(results_percentile, ignore_index=True)

    #print("anomaly_key_channel_percentile data frame created")
    #print(anomaly_key_channel_percentile.head())

    anomaly_key_channel_SD = pd.concat(results_SD, ignore_index=True)
    SD_cols = group_columns + [date_column] + ['Mean', 'SD', 'SD2_low', 'SD2_high', 'SD_anomaly',
                                               'is_SD_anomaly']
    anomaly_key_channel_SD_final = anomaly_key_channel_SD[SD_cols]

    #print("anomaly_key_channel_SD data frame created")
    #print(anomaly_key_channel_SD.head())

    anomaly_key_channel_MAD = pd.concat(results_MAD, ignore_index=True)
    MAD_cols = group_columns + [date_column] + ['Median', 'MAD', 'MAD_low', 'MAD_high', 'is_MAD_anomaly',
                                                'MAD_anomaly']
    anomaly_key_channel_MAD_final = anomaly_key_channel_MAD[MAD_cols]

    #print("anomaly_key_channel_MAD data frame created")
    #print(anomaly_key_channel_MAD.head())

    anomaly_key_channel_IQR = pd.concat(results_IQR, ignore_index=True)
    IQR_cols = group_columns + [date_column] + ['Q1', 'Q3', 'IQR', 'IQR_low', 'IQR_high', 'IQR_anomaly',
                                                'is_IQR_anomaly']
    anomaly_key_channel_IQR_final = anomaly_key_channel_IQR[IQR_cols]

    #print("anomaly_key_channel_IQR data frame created")
    #print(anomaly_key_channel_IQR.head())


    ## ISF_general
    results_ISF_general = Parallel(n_jobs=-1, verbose=0)(
        delayed(process_group)('ISF_general', name, group, group_columns, variable, date_column,
                               alpha, sigma, eval_period, interval_width, contamination, random_state)
        for name, group in groups)

    # Combine results back
    anomaly_key_channel_ISF_general = (
        pd.concat(results_ISF_general)
        .sort_values(by=group_columns + [date_column])
    )
    #print("anomaly_key_channel_ISF_general data frame created")
    #print(anomaly_key_channel_ISF_general.head())

    ## EWMA
    results_EWMA = Parallel(n_jobs=-1, verbose=0)(
        delayed(process_group)('EWMA', name, group, group_columns, variable, date_column,
                               alpha, sigma, eval_period, interval_width, contamination, random_state)
        for name, group in groups)

    # Combine results back
    anomaly_key_channel_EWMA = (
        pd.concat(results_EWMA)
        .sort_values(by=group_columns + [date_column])
    )
    #print("anomaly_key_channel_EWMA data frame created")
    #print(anomaly_key_channel_EWMA.head())
    EWMA_cols = group_columns + [date_column] + ['alpha', 'sigma', 'EWMA_forecast',
                                                 'STD', 'EWMA_high', 'EWMA_low', 'is_EWMA_anomaly']

    anomaly_key_channel_EWMA_final = anomaly_key_channel_EWMA[EWMA_cols]


    ## FB Prophet

    results_fb = Parallel(n_jobs=-1, verbose=0)(
        delayed(process_group)('FB', name, group, group_columns, variable, date_column,
                               alpha, sigma, eval_period, interval_width, contamination, random_state)
        for name, group in groups)

    # Combine results back
    anomaly_key_channel_fb = (
        pd.concat(results_fb)
        .sort_values(by=group_columns + [date_column])
    )

    #print("anomaly_key_channel_fb data frame created")
    #print(anomaly_key_channel_fb.head())
    FB_cols = group_columns + [date_column] + ["FB_forecast", "FB_low", "FB_high",
                                               "FB_residual", "FB_anomaly", "is_FB_anomaly"]

    anomaly_key_channel_fb_final = anomaly_key_channel_fb[FB_cols]


    ## Isolation Forest timeseries
    results_ISF_timeseries = Parallel(n_jobs=-1, verbose=0)(
        delayed(process_group)('ISF_timeseries', name, group, group_columns, variable, date_column,
                               alpha, sigma, eval_period, interval_width, contamination, random_state)
        for name, group in groups)

    # Combine results back
    anomaly_key_channel_ISF_timeseries = (
        pd.concat(results_ISF_timeseries)
        .sort_values(by=group_columns + [date_column])
    )
    #print(anomaly_key_channel_ISF_timeseries.head())
    ISF_cols = group_columns + [date_column] + ["IsolationForest_score_timeseries", "is_IsolationForest_anomaly_timeseries"]
    anomaly_key_channel_ISF_timeseries_final = anomaly_key_channel_ISF_timeseries[ISF_cols]

    #print("anomaly_key_channel_ISF_timeseries data frame created")
    #print(anomaly_key_channel_ISF_timeseries.head())

    ## DBSCAN
    results_DB = Parallel(n_jobs=-1, verbose=0)(
        delayed(process_group)('DBSCAN', name, group, group_columns, variable, date_column,
                               alpha, sigma, eval_period, interval_width, contamination, random_state)
        for name, group in groups)

    # Combine results back
    anomaly_key_channel_DB = (
        pd.concat(results_DB)
        .sort_values(by=group_columns + [date_column])
    )

    #print("anomaly_key_channel_DB data frame created")
    #print(anomaly_key_channel_DB.head())

    DB_cols = group_columns + [date_column] + ["dbscan_score", "is_DBSCAN_anomaly"]
    anomaly_key_channel_DB_final = anomaly_key_channel_DB[DB_cols]

    # combine ISF general and timeseries data frames
    anomaly_key_channel_ISF = anomaly_key_channel_ISF_general.merge(anomaly_key_channel_ISF_timeseries_final,
                                                                    on=group_columns + [date_column], how='inner')

    # Score logic: rows in the TRAIN set take the general ISF score, the rest take the timeseries ISF score
    anomaly_key_channel_ISF['IsolationForest_score'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
                                                                anomaly_key_channel_ISF['IsolationForest_score_general'],
                                                                anomaly_key_channel_ISF['IsolationForest_score_timeseries'])

    # Flag logic: rows in the TRAIN set take the general ISF flag, the rest take the timeseries ISF flag
    anomaly_key_channel_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
                                                                     anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
                                                                     anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])

    ISF_cols = group_columns + [date_column] + ['IsolationForest_score', 'is_IsolationForest_anomaly']
    anomaly_key_channel_ISF_final = anomaly_key_channel_ISF[ISF_cols]

    #print("anomaly_key_channel_ISF data frame created")
    #print(anomaly_key_channel_ISF.head())


    # combine all the data frames

    anomaly = anomaly_key_channel_percentile.merge(anomaly_key_channel_SD_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_MAD_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_IQR_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_EWMA_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_fb_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_ISF_final, on=group_columns + [date_column], how='inner')
    anomaly = anomaly.merge(anomaly_key_channel_DB_final, on=group_columns + [date_column], how='inner')

    # ---- Unified anomaly flag (majority voting) ----
    anomaly_flags = [
        'is_Percentile_anomaly',
        'is_SD_anomaly', 'is_MAD_anomaly',
        'is_IQR_anomaly',
        'is_EWMA_anomaly', 'is_FB_anomaly', 'is_IsolationForest_anomaly', 'is_DBSCAN_anomaly']

    anomaly['Anomaly_Votes'] = anomaly[anomaly_flags].sum(axis=1)
    # Majority rule: anomaly if flagged by at least half (4 of 8) of the methods
    anomaly['is_Anomaly'] = anomaly['Anomaly_Votes'] >= 4

    # Add refresh_date as the first column
    anomaly.insert(0, 'refresh_date', pd.to_datetime(date.today()))

    print(anomaly.head())

    print_anomaly_stats(anomaly, group_columns)

    return anomaly
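A tiny sketch of the majority-voting rule at the end of run_pipeline, using a made-up flags frame (the flag values are illustrative; the column names and the >= 4 threshold come from the code above).

import pandas as pd

# Eight boolean detector flags per row; at least 4 votes marks the row as an anomaly
flags = pd.DataFrame({
    "is_Percentile_anomaly":      [True,  False],
    "is_SD_anomaly":              [True,  False],
    "is_MAD_anomaly":             [True,  True],
    "is_IQR_anomaly":             [True,  False],
    "is_EWMA_anomaly":            [False, True],
    "is_FB_anomaly":              [False, False],
    "is_IsolationForest_anomaly": [True,  False],
    "is_DBSCAN_anomaly":          [False, True],
})
votes = flags.sum(axis=1)          # 5 votes and 3 votes
print((votes >= 4).tolist())       # [True, False]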
anomaly_pipeline-0.1.27.dist-info/METADATA
ADDED

@@ -0,0 +1,15 @@

Metadata-Version: 2.4
Name: anomaly_pipeline
Version: 0.1.27
Requires-Dist: pandas
Requires-Dist: numpy<2
Requires-Dist: joblib
Requires-Dist: prophet
Requires-Dist: scikit-learn
Requires-Dist: google-cloud-bigquery
Requires-Dist: google-cloud-storage
Requires-Dist: statsmodels
Requires-Dist: plotly
Requires-Dist: pandas-gbq
Requires-Dist: gcsfs
Dynamic: requires-dist
anomaly_pipeline-0.1.27.dist-info/RECORD
ADDED

@@ -0,0 +1,24 @@

anomaly_pipeline/__init__.py,sha256=ED-UPADjbdS8xjK41KmWVYcFIn6q_cN-SwBx-dRI-nM,77
anomaly_pipeline/main.py,sha256=khiatXxr01XYHB8SrIfyTnlaCu008MA6ORGiI_2Tjr4,2925
anomaly_pipeline/pipeline.py,sha256=3Lf9b0Vok-mqWDLhhZeN9emgx5i30stPrU8XOmKpmEw,11204
anomaly_pipeline/helpers/DB_scan.py,sha256=80PLlubpcwY6dOUx5rm569hvFlGNa1rtvjs74US9oIk,8134
anomaly_pipeline/helpers/IQR.py,sha256=VlYU6Yf-4KQmVroLvzwd220jn5BUNJEchsVE4_KxKm4,2824
anomaly_pipeline/helpers/MAD.py,sha256=XDG8r9o1JNi7YZ2NKwNzqmu_Oyz2OPP2rThCuw8WZhs,3377
anomaly_pipeline/helpers/Preprocessing.py,sha256=VsAohcAW1wTKDdNAF1xNF4j4I2gyZ8rOC1HjyK0NpGk,3933
anomaly_pipeline/helpers/STD.py,sha256=SZ1UaS_Aa5ay6qWNzKpBXpQIloUuPlliOrfr7yHba4k,2769
anomaly_pipeline/helpers/__init__.py,sha256=aDAAxiNAusL4rwcn9XbkUIApp3i02UXolB_CWvbbY_0,32
anomaly_pipeline/helpers/baseline.py,sha256=h9t_LWcAw17P9qmoRQMceukGzOOr-gFLuHfVbipQB7M,3824
anomaly_pipeline/helpers/cluster_functions.py,sha256=Nhk2YdKVynrKywEILg_5B2xD4zrCZ_ICWw3oOdTDHuA,13040
anomaly_pipeline/helpers/evaluation_info.py,sha256=SXa1LkznNQXTOcFCbryRmRJMSNC_Fa2CU-HhFnyTIKY,6219
anomaly_pipeline/helpers/evaluation_plots.py,sha256=xfyVlE7B4E376EL4AF8A4T5kUfqzPShGOSy548psT6M,21230
anomaly_pipeline/helpers/ewma.py,sha256=YprdcvR17EQ4X9pJo5OusaD3jNaaoHvQLHRHHt25CGk,3562
anomaly_pipeline/helpers/fb_prophet.py,sha256=-ivBIgMBPT4DG-hbGXPMB1-aiEBfLw2LQvy6eXKzELQ,3182
anomaly_pipeline/helpers/help_info.py,sha256=QuRd206KQ8etRnlODH9Ek_zmXUvHSBwVQtukqf0iKSc,37012
anomaly_pipeline/helpers/iso_forest_general.py,sha256=nonZl2wcLyHe0E50mqQUw_IB3tuMochmZKQNd0xMFVk,2350
anomaly_pipeline/helpers/iso_forest_timeseries.py,sha256=SWf6g0mwLohIRdMvGfMCAcfWi5FPPokiV7dM8Un5qpE,5900
anomaly_pipeline/helpers/percentile.py,sha256=eLk0PgY7m7z7VKTLfXg8ykKii0ciAJvlGOYXpv84mOE,2523
anomaly_pipeline-0.1.27.dist-info/METADATA,sha256=YIIJMpsDchA8L2Jp0T4wBXpxwcL5r-UiJ35gLP6BRCs,371
anomaly_pipeline-0.1.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
anomaly_pipeline-0.1.27.dist-info/entry_points.txt,sha256=c7aMFN_VdyQk_gKp9S2-bz4AF3eBActUectAElnEdMo,92
anomaly_pipeline-0.1.27.dist-info/top_level.txt,sha256=3QhrLt05iNbxIQhnAA0vmIkRQje4Hc_STGY_Tukx3Vg,17
anomaly_pipeline-0.1.27.dist-info/RECORD,,
anomaly_pipeline-0.1.27.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@

anomaly_pipeline