anomaly-pipeline 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +2 -0
- anomaly_pipeline/helpers/DB_scan.py +188 -0
- anomaly_pipeline/helpers/IQR.py +71 -0
- anomaly_pipeline/helpers/MAD.py +88 -0
- anomaly_pipeline/helpers/Preprocessing.py +116 -0
- anomaly_pipeline/helpers/STD.py +70 -0
- anomaly_pipeline/helpers/__init__.py +1 -0
- anomaly_pipeline/helpers/baseline.py +112 -0
- anomaly_pipeline/helpers/cluster_functions.py +289 -0
- anomaly_pipeline/helpers/evaluation_info.py +121 -0
- anomaly_pipeline/helpers/evaluation_plots.py +546 -0
- anomaly_pipeline/helpers/ewma.py +119 -0
- anomaly_pipeline/helpers/fb_prophet.py +94 -0
- anomaly_pipeline/helpers/help_info.py +683 -0
- anomaly_pipeline/helpers/iso_forest_general.py +50 -0
- anomaly_pipeline/helpers/iso_forest_timeseries.py +123 -0
- anomaly_pipeline/helpers/percentile.py +65 -0
- anomaly_pipeline/main.py +63 -0
- anomaly_pipeline/pipeline.py +253 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +15 -0
- anomaly_pipeline-0.1.27.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/WHEEL +5 -0
- anomaly_pipeline-0.1.27.dist-info/entry_points.txt +2 -0
- anomaly_pipeline-0.1.27.dist-info/top_level.txt +1 -0
anomaly_pipeline/helpers/DB_scan.py
@@ -0,0 +1,188 @@
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from statsmodels.tsa.stattools import acf


def get_dynamic_lags(series: pd.Series) -> list:

    n = len(series)

    # Determine max lags (the smaller of 50% of the data and a hard cap of 60)
    nlags = min(int(n * 0.5), 60)

    if nlags < 5:
        return [1, 2, 3]

    # Calculate ACF and confidence intervals, keep the 10 most significant lags
    autocorrelations, confint = acf(series.dropna(), nlags=nlags, alpha=0.25, fft=True)
    autocorr_values = autocorrelations[1:]
    conf_limit = confint[1:, 1] - autocorr_values
    is_significant = np.abs(autocorr_values) > conf_limit
    significant_autocorr = autocorr_values[is_significant]
    significant_lags_indices = np.where(is_significant)[0] + 1
    ranked_indices = np.argsort(np.abs(significant_autocorr))[::-1]
    top_lags_indices = ranked_indices[:10]
    top_lags = significant_lags_indices[top_lags_indices].tolist()
    base_lags = [1, 2, 3]
    dynamic_lags = sorted(list(set(base_lags + top_lags)))[:10]

    return dynamic_lags


def find_optimal_epsilon(X_scaled: np.ndarray, k: int) -> float:
    """
    Finds the optimal epsilon by calculating the distance to the k-th nearest neighbor
    and taking a high percentile (90-95th) of those distances as the cutoff.
    This serves as a programmatic proxy for the 'elbow' method in a rolling window.
    """
    if len(X_scaled) < k:
        return 1.0  # Fallback

    # Find the distance to the k-th (min_samples) neighbor for every point.
    # n_neighbors is k+1 because the first distance is 0 (to itself).
    neigh = NearestNeighbors(n_neighbors=k + 1)
    neigh.fit(X_scaled)

    # distances matrix: [n_samples, k+1]
    distances, indices = neigh.kneighbors(X_scaled)

    # We are interested in the distance to the k-th neighbor (index k).
    # This k-distance is the radius required for a point to be a core point's neighbor.
    k_distances = distances[:, k]

    # The elbow is hard to find programmatically. A robust proxy for the density
    # threshold is to take a high percentile (e.g., 95th) of the k-distances.
    # This sets epsilon such that 95% of the *training* points would be considered
    # part of a cluster's neighborhood.
    optimal_eps = np.percentile(k_distances, 95)

    # Ensure a minimum value if data is extremely sparse
    return max(optimal_eps, 0.1)


def detect_time_series_anomalies_dbscan(
    group,
    variable,
    date_column,
    eval_period,
):

    group[date_column] = pd.to_datetime(group[date_column])
    group = group.copy().sort_values(date_column).reset_index(drop=True)

    # --- Default DBSCAN Parameters ---
    # This parameter often needs tuning, but it is a reasonable starting point:
    DEFAULT_EPS = 0.5  # Neighborhood radius (critical parameter)

    try:
        test_anom = []

        for t in list(range(eval_period - 1, -1, -1)):

            try:
                # Boundary between rolling train and rolling forecast region
                cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)

                # Get train set to determine lags
                model_group = group.copy()
                train = model_group[model_group[date_column] <= cutoff_date].copy()
                lags = get_dynamic_lags(train[variable])

                # Create lag features and rolling stats for the entire DF
                rolling_stats_features = []
                for lag in lags:
                    model_group[f'lag{lag}'] = model_group[variable].shift(lag)

                for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
                    if w >= 3:
                        rolling_stats_features.extend([f'roll_mean_{w}', f'roll_std_{w}'])
                        model_group[f'roll_mean_{w}'] = model_group[variable].shift(1).rolling(w).mean()
                        model_group[f'roll_std_{w}'] = model_group[variable].shift(1).rolling(w).std()

                model_group['trend'] = group.index
                model_group = model_group.copy().dropna()

                # Split into train and test
                train = model_group[model_group[date_column] <= cutoff_date].copy()
                test = model_group[model_group[date_column] == cutoff_date].copy()

                # Identify all model features (lags, rolling stats, trend, and the variable itself)
                features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]

                # Fit the scaler ONLY on the training data to avoid data leakage
                scaler = StandardScaler()

                # Fit the scaler on the train data features
                scaler.fit(train[features])

                # Transform both train and test sets
                train_scaled = scaler.transform(train[features])
                test_scaled = scaler.transform(test[features])

                # Determine min_samples based on feature space dimension
                min_samples = max(2 * len(features), 3)

                # Find optimal epsilon
                calculated_eps = find_optimal_epsilon(train_scaled, k=min_samples)

                # --- DBSCAN MODEL ---
                dbscan_model = DBSCAN(
                    eps=calculated_eps,
                    min_samples=min_samples,
                    n_jobs=-1
                )

                # Fit DBSCAN on the scaled training data
                dbscan_model.fit(train_scaled)

                # DBSCAN has no predict() method for new points, so membership of the
                # test point is approximated with a k-distance check against the
                # training data: if its min_samples-th neighbor is farther than eps,
                # DBSCAN would have labelled it noise.
                neigh = NearestNeighbors(n_neighbors=min_samples)
                neigh.fit(train_scaled)

                # Find the distance of the test point to its nearest neighbors in the train set
                distances, indices = neigh.kneighbors(test_scaled)

                # Anomaly check: if the distance to the min_samples-th neighbor is > eps, it's noise.
                # Use the distance to the k-th neighbor (index min_samples - 1).
                k_distance = distances[:, min_samples - 1]

                # Flag as anomaly if the k-distance is greater than the trained eps threshold
                test['dbscan_score'] = k_distance - calculated_eps
                test['is_DBSCAN_anomaly'] = np.where(k_distance > calculated_eps, True, False)

                test = test[[variable, date_column, 'dbscan_score', 'is_DBSCAN_anomaly']]
                test_anom.append(test)

            except Exception as e:
                print(f"Error in iteration {t}: {e}")
                pass

        try:
            test_anom = pd.concat(test_anom)
            group = group.merge(test_anom[[variable, date_column, 'dbscan_score', 'is_DBSCAN_anomaly']], on=[variable, date_column], how='left')
            # group["is_DBSCAN_anomaly"] = group["is_DBSCAN_anomaly"].fillna(False)
        except:
            print("Error in DBSCAN process")
            group['dbscan_score'] = np.nan
            group["is_DBSCAN_anomaly"] = np.nan

    except Exception as e:
        # Fallback error handling: derive a readable group identifier from the
        # string-typed columns so the failure can be traced back to its group
        try:
            group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
            group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
        except:
            group_id = "Unknown Group ID"
        print(f'DBSCAN Anomaly Detection failed for {group_id}. Error: {e}')
        group['dbscan_score'] = np.nan
        group["is_DBSCAN_anomaly"] = np.nan

    return group
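
As a rough usage sketch (not taken from the package; the store/week/sales column names, the synthetic data, and eval_period=8 are assumptions), the detector is written to operate on a single group at a time, which pairs naturally with pandas groupby().apply():

# Illustrative only: column names, data, and eval_period are assumed, not package defaults.
import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.DB_scan import detect_time_series_anomalies_dbscan

rng = np.random.default_rng(0)
dates = pd.date_range("2022-01-03", periods=156, freq="W-MON")
df = pd.concat(
    [pd.DataFrame({"store": s, "week": dates,
                   "sales": rng.normal(100, 10, len(dates)).clip(min=0)})
     for s in ["A", "B"]],
    ignore_index=True,
)
# Inject an obvious spike into store A's final week of the evaluation window
df.loc[(df["store"] == "A") & (df["week"] == dates[-1]), "sales"] = 300

scored = (
    df.groupby("store", group_keys=False)
      .apply(detect_time_series_anomalies_dbscan,
             variable="sales", date_column="week", eval_period=8)
)
print(scored[["store", "week", "sales", "dbscan_score", "is_DBSCAN_anomaly"]].tail(8))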

anomaly_pipeline/helpers/IQR.py
@@ -0,0 +1,71 @@
import pandas as pd
import numpy as np
from .Preprocessing import classify

def detect_outliers_iqr(group, variable, date_column, eval_period):
    n = len(group)
    if n < 10:
        return pd.DataFrame(columns=group.columns)

    group = group.copy()
    # Explicitly ensure date_column is datetime right at the start
    group[date_column] = pd.to_datetime(group[date_column])
    train_size = n - eval_period

    # --- 1. HANDLE TRAINING DATA (Initial Block) ---
    # Calculate baseline IQR using all data available before eval_period
    initial_train = group[variable].iloc[:train_size]

    q1 = initial_train.quantile(0.25)
    q3 = initial_train.quantile(0.75)
    iqr = q3 - q1

    low = max(q1 - 1.5 * iqr, 0)
    high = q3 + 1.5 * iqr

    # Assign initial bounds to the training rows
    group.loc[group.index[:train_size], 'Q1'] = q1
    group.loc[group.index[:train_size], 'Q3'] = q3
    group.loc[group.index[:train_size], 'IQR'] = iqr
    group.loc[group.index[:train_size], 'IQR_low'] = low
    group.loc[group.index[:train_size], 'IQR_high'] = high
    group.loc[group.index[:train_size], 'set'] = "TRAIN"
    group.loc[group.index[:train_size], 'IQR_anomaly'] = group[variable].iloc[:train_size].apply(
        lambda x: classify(x, low, high)
    )
    group.loc[group.index[:train_size], 'is_IQR_anomaly'] = (
        (group[variable].iloc[:train_size] < low) |
        (group[variable].iloc[:train_size] > high)
    )

    # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
    # Iterate through the eval period, increasing the training set one point at a time
    for i in range(train_size, n):
        # Data available up to this point (expanding)
        current_train = group[variable].iloc[:i]

        Q1 = current_train.quantile(0.25)
        Q3 = current_train.quantile(0.75)
        IQR = Q3 - Q1

        lower_q = max(Q1 - 1.5 * IQR, 0)
        upper_q = Q3 + 1.5 * IQR

        # Test the current point i
        current_val = group[variable].iloc[i]
        group.iloc[i, group.columns.get_loc('Q1')] = Q1
        group.iloc[i, group.columns.get_loc('Q3')] = Q3
        group.iloc[i, group.columns.get_loc('IQR')] = IQR
        group.iloc[i, group.columns.get_loc('IQR_low')] = lower_q
        group.iloc[i, group.columns.get_loc('IQR_high')] = upper_q
        group.iloc[i, group.columns.get_loc('set')] = "TEST"
        group.iloc[i, group.columns.get_loc('IQR_anomaly')] = classify(current_val, lower_q, upper_q)
        group.iloc[i, group.columns.get_loc('is_IQR_anomaly')] = (current_val < lower_q) or (current_val > upper_q)

    # Cast boolean column properly
    group['is_IQR_anomaly'] = group['is_IQR_anomaly'].astype(bool)
    # FINAL SAFETY CHECK
    group[date_column] = pd.to_datetime(group[date_column])

    return group
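
A minimal sketch of how the expanding window behaves (synthetic data and column names are assumptions, not package defaults): the last eval_period rows are labelled TEST, and each is compared against Tukey fences computed only from the observations before it.

# Illustrative only: "week"/"units" and the toy series are assumed.
import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.IQR import detect_outliers_iqr

dates = pd.date_range("2024-01-01", periods=30, freq="W-MON")
values = np.r_[np.random.default_rng(1).normal(50, 5, 29), 120.0]  # spike in the final week
df = pd.DataFrame({"week": dates, "units": values})

out = detect_outliers_iqr(df, variable="units", date_column="week", eval_period=6)
print(out.loc[out["set"] == "TEST",
              ["week", "units", "IQR_low", "IQR_high", "is_IQR_anomaly"]])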

anomaly_pipeline/helpers/MAD.py
@@ -0,0 +1,88 @@
import pandas as pd
import numpy as np
from .Preprocessing import classify


def detect_outliers_mad(group, variable, date_column, mad_threshold, mad_scale_factor, eval_period):
    n = len(group)
    if n < 10:
        return pd.DataFrame(columns=group.columns)

    group = group.copy()
    # Explicitly ensure date_column is datetime right at the start
    group[date_column] = pd.to_datetime(group[date_column])
    train_size = n - eval_period

    # Initialize columns to store the expanding window metrics
    group['Median'] = np.nan
    group['MAD'] = np.nan
    group['MAD_low'] = np.nan
    group['MAD_high'] = np.nan
    group['set'] = ""
    group['is_MAD_anomaly'] = False

    # --- 1. HANDLE TRAINING DATA (Initial Block) ---
    initial_train = group[variable].iloc[:train_size]
    median = initial_train.median()
    mad = np.median(np.abs(initial_train - median))

    if mad == 0:
        lower_mad = median
        upper_mad = median
    else:
        margin = mad_threshold * mad / mad_scale_factor
        lower_mad = max(median - margin, 0)
        upper_mad = median + margin

    # Assign baseline values to the training block
    train_idx = group.index[:train_size]
    group.loc[train_idx, 'Median'] = median
    group.loc[train_idx, 'MAD'] = mad
    group.loc[train_idx, 'MAD_low'] = lower_mad
    group.loc[train_idx, 'MAD_high'] = upper_mad
    group.loc[train_idx, 'set'] = "TRAIN"
    group.loc[train_idx, 'MAD_anomaly'] = group[variable].iloc[:train_size].apply(
        lambda x: classify(x, lower_mad, upper_mad)
    )
    group.loc[train_idx, 'is_MAD_anomaly'] = (group[variable].iloc[:train_size] < lower_mad) | \
                                             (group[variable].iloc[:train_size] > upper_mad)

    # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
    for i in range(train_size, n):
        # Recursive growth: use all data up to the current point i
        current_train = group[variable].iloc[:i]

        curr_median = current_train.median()
        curr_mad = np.median(np.abs(current_train - curr_median))

        if curr_mad == 0:
            lower_mad = curr_median
            upper_mad = curr_median
        else:
            margin = mad_threshold * curr_mad / mad_scale_factor
            lower_mad = max(curr_median - margin, 0)
            upper_mad = curr_median + margin

        # Test current point i
        current_val = group[variable].iloc[i]

        group.iloc[i, group.columns.get_loc('Median')] = curr_median
        group.iloc[i, group.columns.get_loc('MAD')] = curr_mad
        group.iloc[i, group.columns.get_loc('MAD_low')] = lower_mad
        group.iloc[i, group.columns.get_loc('MAD_high')] = upper_mad
        group.iloc[i, group.columns.get_loc('set')] = "TEST"
        group.iloc[i, group.columns.get_loc('MAD_anomaly')] = classify(current_val, lower_mad, upper_mad)
        group.iloc[i, group.columns.get_loc('is_MAD_anomaly')] = (current_val < lower_mad) or (current_val > upper_mad)

    # If you have your classify function available:
    # group['MAD_anomaly'] = group.apply(lambda row: classify(row[variable], row['MAD_low'], row['MAD_high']), axis=1)

    group['is_MAD_anomaly'] = group['is_MAD_anomaly'].astype(bool)
    # FINAL SAFETY CHECK
    group[date_column] = pd.to_datetime(group[date_column])

    return group
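
The bound formula above can be worked through by hand. The small sketch below is not package code; it hard-codes the 2.5 threshold and 0.6745 consistency constant that also appear in baseline.py. Dividing MAD by 0.6745 makes the margin comparable to a multiple of the standard deviation for roughly normal data, while staying insensitive to the outlier itself.

# Illustrative only: threshold and scale factor assumed as 2.5 and 0.6745.
import numpy as np

x = np.array([10, 11, 9, 10, 12, 10, 11, 50], dtype=float)  # one extreme value
median = np.median(x)
mad = np.median(np.abs(x - median))            # robust spread, barely moved by the 50
margin = 2.5 * mad / 0.6745                    # mad_threshold * MAD / mad_scale_factor
lower, upper = max(median - margin, 0), median + margin
print(lower, upper, x[(x < lower) | (x > upper)])  # only the 50 falls outside the fences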

anomaly_pipeline/helpers/Preprocessing.py
@@ -0,0 +1,116 @@
import pandas as pd
import numpy as np
from datetime import datetime

def classify(val, lower, upper):
    if val < lower:
        return 'low'
    elif val > upper:
        return 'high'
    else:
        return 'none'

def create_full_calendar_and_interpolate(
    master_data,
    group_columns,
    variable,
    date_column,
    freq
):
    """
    Creates a complete date range for each group at the given frequency,
    merges it with the master data, marks missing rows,
    and fills missing values using linear interpolation.

    Parameters
    ----------
    master_data : pd.DataFrame
    group_columns : list
        One or multiple columns that define a group.
    variable : str
        Column used to detect missing records.
    date_column : str
        Name of the date column (must be datetime-like).
    freq : str
        Frequency for calendar generation (e.g. 'W-MON' for weekly Mondays).
    """

    # Ensure datetime
    master_data[date_column] = pd.to_datetime(master_data[date_column])

    full_group_data = []

    for group_key, group in master_data.groupby(group_columns):

        # ---- Step 1: Create full calendar for this group ----
        min_date = group[date_column].min()
        max_date = group[date_column].max()

        full_dates = pd.date_range(start=min_date, end=max_date, freq=freq)

        # Build calendar DF dynamically using group_columns
        calendar_dict = {col: group_key[i] if isinstance(group_key, tuple) else group_key
                         for i, col in enumerate(group_columns)}
        calendar_dict[date_column] = full_dates

        full_calendar = pd.DataFrame(calendar_dict)

        # ---- Step 2: Join with actual group data ----
        merged = full_calendar.merge(
            group,
            on=group_columns + [date_column],
            how="left"
        )

        # ---- Step 3: Mark missing rows based on the target variable ----
        merged["is_missing_record"] = merged[variable].isna()

        # ---- Step 4: Interpolate numeric columns ----
        numeric_cols = merged.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            merged[col] = merged[col].interpolate(method="linear", limit_direction="both")

        full_group_data.append(merged)

    final_df = pd.concat(full_group_data, ignore_index=True)
    # print(f"The number of records missing {final_df['is_missing_record'].sum()}")
    return final_df


def print_anomaly_stats(df, group_columns):
    # Calculate global stats
    total_records = len(df)
    # Ensure is_Anomaly is treated as boolean for counting
    total_anomalies = df['is_Anomaly'].fillna(False).astype(bool).sum()
    anomaly_rate = (total_anomalies / total_records) * 100

    print("\n" + "="*45)
    print(f"{'ANOMALY DETECTION SUMMARY':^45}")
    print("="*45)
    print(f"{'Total Records:':<25} {total_records:,}")
    print(f"{'Total Anomalies:':<25} {total_anomalies:,}")
    print(f"{'Overall Anomaly Rate:':<25} {anomaly_rate:.2f}%")
    print("-" * 45)

    # --- Top groups by anomaly rate ---
    print(f"Top 5 Groups by Anomaly Rate ({' > '.join(group_columns)}):")

    # 1. Group by keys
    # 2. Calculate mean (rate) and sum (to show absolute numbers too)
    group_stats = df.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum']).sort_values(by='mean', ascending=False).head(5)

    for label, row in group_stats.iterrows():
        # Handle single vs multiple group columns for clean printing
        group_label = label if isinstance(label, str) else " | ".join(map(str, label))
        rate_pct = row['mean'] * 100
        count = int(row['sum'])

        # Print the rate % and the absolute count in brackets for context
        print(f"  - {group_label:<25} : {rate_pct:>6.2f}% ({count:>3} anomalies)")

    print("="*45 + "\n")
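
A small illustration (hypothetical store/week/units columns, not package data) of what the calendar rebuild does with a single missing week: the row is re-created from the full calendar, flagged in is_missing_record, and its numeric value is filled by linear interpolation.

# Illustrative only: the input frame and column names are assumed.
import pandas as pd
from anomaly_pipeline.helpers.Preprocessing import create_full_calendar_and_interpolate

raw = pd.DataFrame({
    "store": ["A"] * 4,
    "week": pd.to_datetime(["2024-01-01", "2024-01-08", "2024-01-22", "2024-01-29"]),
    "units": [10.0, 12.0, 16.0, 18.0],   # the week of 2024-01-15 is missing
})

full = create_full_calendar_and_interpolate(
    raw, group_columns=["store"], variable="units", date_column="week", freq="W-MON"
)
print(full[["store", "week", "units", "is_missing_record"]])
# The 2024-01-15 row is re-created with is_missing_record=True and units interpolated to 14.0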

anomaly_pipeline/helpers/STD.py
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
from .Preprocessing import classify

def detect_outliers_sd(group, variable, date_column, eval_period):
    n = len(group)
    # Check the minimum size requirement
    if n < 10:
        return pd.DataFrame(columns=group.columns)

    group = group.copy()
    # Explicitly ensure date_column is datetime right at the start
    group[date_column] = pd.to_datetime(group[date_column])
    train_size = n - eval_period

    # --- 1. HANDLE TRAINING DATA (Initial Block) ---
    # Calculate baseline mean and SD using all data available before eval_period
    initial_train = group[variable].iloc[:train_size]

    # SD-based bounds
    mean = initial_train.mean()
    std = initial_train.std()

    lower_2sd = max(mean - 2 * std, 0)
    upper_2sd = mean + 2 * std

    # Assign initial bounds to the training rows
    group.loc[group.index[:train_size], "Mean"] = mean
    group.loc[group.index[:train_size], 'SD'] = std
    group.loc[group.index[:train_size], 'SD2_low'] = lower_2sd
    group.loc[group.index[:train_size], 'SD2_high'] = upper_2sd
    group.loc[group.index[:train_size], 'set'] = "TRAIN"
    group.loc[group.index[:train_size], 'SD_anomaly'] = group[variable].iloc[:train_size].apply(
        lambda x: classify(x, lower_2sd, upper_2sd)
    )
    group.loc[group.index[:train_size], 'is_SD_anomaly'] = (
        (group[variable].iloc[:train_size] < lower_2sd) |
        (group[variable].iloc[:train_size] > upper_2sd)
    )

    # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
    # Iterate through the eval period, increasing the training set one point at a time
    for i in range(train_size, n):
        # Data available up to this point (expanding)
        current_train = group[variable].iloc[:i]

        MEAN = current_train.mean()
        STD = current_train.std()

        LOWER_2SD = max(MEAN - 2 * STD, 0)
        UPPER_2SD = MEAN + 2 * STD

        # Test the current point i
        current_val = group[variable].iloc[i]
        group.iloc[i, group.columns.get_loc("Mean")] = MEAN
        group.iloc[i, group.columns.get_loc('SD')] = STD
        group.iloc[i, group.columns.get_loc('SD2_low')] = LOWER_2SD
        group.iloc[i, group.columns.get_loc('SD2_high')] = UPPER_2SD
        group.iloc[i, group.columns.get_loc('set')] = "TEST"
        group.iloc[i, group.columns.get_loc('SD_anomaly')] = classify(current_val, LOWER_2SD, UPPER_2SD)
        group.iloc[i, group.columns.get_loc('is_SD_anomaly')] = (current_val < LOWER_2SD) or (current_val > UPPER_2SD)

    # Cast boolean column properly
    group['is_SD_anomaly'] = group['is_SD_anomaly'].astype(bool)
    # FINAL SAFETY CHECK
    group[date_column] = pd.to_datetime(group[date_column])

    return group
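
For completeness, a short sketch (synthetic data and assumed column names, not package defaults) of the SD detector; its shape mirrors the IQR and MAD helpers, with mean +/- 2*SD bounds recomputed on the expanding window for each evaluation point.

# Illustrative only: "week"/"orders" and eval_period=4 are assumed.
import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.STD import detect_outliers_sd

dates = pd.date_range("2024-01-01", periods=26, freq="W-MON")
vals = 100.0 + np.random.default_rng(2).normal(0, 3, 26)
vals[-1] = 200.0  # clear spike inside the evaluation window
df = pd.DataFrame({"week": dates, "orders": vals})

flagged = detect_outliers_sd(df, variable="orders", date_column="week", eval_period=4)
print(flagged.loc[flagged["set"] == "TEST",
                  ["week", "orders", "SD2_low", "SD2_high", "is_SD_anomaly"]])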

anomaly_pipeline/helpers/__init__.py
@@ -0,0 +1 @@
from .help_info import help_info

anomaly_pipeline/helpers/baseline.py
@@ -0,0 +1,112 @@
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from .Preprocessing import classify

# Anomaly category columns (optional, keep if you still want string labels)


def remove_outliers_iqr_and_sd(group, variable, contamination=0.03, random_state=42):
    if len(group) < 10:
        # Return empty DataFrame to exclude this group entirely
        print(f"A group with only {len(group)} records was found; dropping it from the analysis")
        return pd.DataFrame(columns=group.columns)
    # Quantile-based bounds
    min_value = group[variable].min()
    max_value = group[variable].max()
    Q1 = group[variable].quantile(0.25)
    Q3 = group[variable].quantile(0.75)
    median = group[variable].quantile(0.5)
    IQR = Q3 - Q1
    low_percentile = group[variable].quantile(0.05)
    high_percentile = group[variable].quantile(0.95)
    lower_q = max(Q1 - 1.5 * IQR, 0)
    upper_q = Q3 + 1.5 * IQR

    group["MIN_value"] = min_value
    group["MAX_value"] = max_value
    group["Percentile_low"] = low_percentile
    group["Percentile_high"] = high_percentile

    # SD-based bounds
    mean = group[variable].mean()
    std = group[variable].std()

    lower_1sd = max(mean - 1 * std, 0)
    upper_1sd = mean + 1 * std
    group["Mean"] = mean
    group["SD"] = std
    group['SD1_low'] = lower_1sd
    group['SD1_high'] = upper_1sd

    lower_2sd = max(mean - 2 * std, 0)
    upper_2sd = mean + 2 * std
    group['SD2_low'] = lower_2sd
    group['SD2_high'] = upper_2sd

    lower_3sd = max(mean - 3 * std, 0)
    upper_3sd = mean + 3 * std
    group['SD3_low'] = lower_3sd
    group['SD3_high'] = upper_3sd

    # MAD-based bounds
    abs_dev = np.abs(group[variable] - median)
    mad = np.median(abs_dev)
    threshold_v1 = 2.5
    threshold_v2 = 2.5
    scale_factor = 0.6745

    if mad == 0:
        lower_mad_v1 = median
        upper_mad_v1 = median
        lower_mad_v2 = median
        upper_mad_v2 = median
    else:
        margin_v1 = threshold_v1 * mad / scale_factor
        lower_mad_v1 = max(median - margin_v1, 0)
        upper_mad_v1 = median + margin_v1
        margin_v2 = threshold_v2 * mad / scale_factor
        lower_mad_v2 = max(median - margin_v2, 0)
        upper_mad_v2 = median + margin_v2

    group["Median"] = median
    group['MAD'] = mad
    # group['MAD2.5_low'] = lower_mad_v1
    # group['MAD2.5_high'] = upper_mad_v1
    group['MAD_low'] = lower_mad_v2
    group['MAD_high'] = upper_mad_v2

    group["Q1"] = Q1
    group["Q3"] = Q3
    group["IQR"] = IQR
    group['IQR_low'] = lower_q
    group['IQR_high'] = upper_q

    """
    # ---- Isolation Forest ----
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    preds = iso.fit_predict(group[[variable]])
    scores = iso.decision_function(group[[variable]])

    group["IsolationForest_score"] = scores
    """

    group['Percentile_anomaly'] = group[variable].apply(lambda val: classify(val, low_percentile, high_percentile))
    group['SD_anomaly'] = group[variable].apply(lambda val: classify(val, lower_2sd, upper_2sd))
    group['MAD_anomaly'] = group[variable].apply(lambda val: classify(val, lower_mad_v2, upper_mad_v2))
    group['IQR_anomaly'] = group[variable].apply(lambda val: classify(val, lower_q, upper_q))

    # Boolean anomaly flags

    group['is_Percentile_anomaly'] = (group[variable] < low_percentile) | (group[variable] > high_percentile)
    group['is_SD_anomaly'] = (group[variable] < lower_2sd) | (group[variable] > upper_2sd)
    group['is_MAD_anomaly'] = (group[variable] < lower_mad_v2) | (group[variable] > upper_mad_v2)
    group['is_IQR_anomaly'] = (group[variable] < lower_q) | (group[variable] > upper_q)
    # group["is_IsolationForest_anomaly"] = preds == -1

    return group
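
A usage sketch (assumed region/revenue columns and synthetic data, not package defaults) of the baseline profiler: applied per group, it appends the descriptive bounds and one boolean flag per method to every row of that group.

# Illustrative only: grouping column, metric name, and data are assumed.
import numpy as np
import pandas as pd
from anomaly_pipeline.helpers.baseline import remove_outliers_iqr_and_sd

rng = np.random.default_rng(3)
df = pd.DataFrame({
    "region": np.repeat(["north", "south"], 52),
    "revenue": rng.gamma(shape=5.0, scale=20.0, size=104),
})

baseline = (
    df.groupby("region", group_keys=False)
      .apply(remove_outliers_iqr_and_sd, variable="revenue")
)
print(baseline.filter(regex="^is_").sum())  # anomaly counts per detection method
print(baseline[["IQR_low", "IQR_high", "SD2_low", "SD2_high",
                "MAD_low", "MAD_high"]].drop_duplicates())  # one row of bounds per region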