anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/Preprocessing.py
CHANGED

@@ -15,102 +15,303 @@ def create_full_calendar_and_interpolate(
     group_columns,
     variable,
     date_column,
-    freq
+    freq,
+    min_records,
+    max_records
 ):
-    """
-    Creates a complete weekly date range for each group,
-    merges with the master data, marks missing rows,
-    and fills missing values using linear interpolation.
-
-    Parameters
-    ----------
-    master_data : pd.DataFrame
-    group_columns : list
-        One or multiple columns that define a group.
-    date_column : str
-        Name of the date column (must be datetime-like)
-    missing_check_cols : list
-        Columns used to detect missing values.
-        If None → ALL numeric columns will be used.
-    freq : str
-        Frequency for calendar generation (default weekly Mondays).
-    """
-
-    # Ensure datetime
     master_data[date_column] = pd.to_datetime(master_data[date_column])
-
+
     full_group_data = []
+    success_metrics = []
+    dropped_metrics = []

     for group_key, group in master_data.groupby(group_columns):
-
-        #
-
-
-
+        # Create a dictionary of the group keys for structured reporting
+        # This maps {col1: val1, col2: val2}
+        current_group_info = {
+            col: group_key[i] if isinstance(group_key, (tuple, list)) else group_key
+            for i, col in enumerate(group_columns)
+        }
+
+        # 1. Calendar Generation
+        min_date, max_date = group[date_column].min(), group[date_column].max()
         full_dates = pd.date_range(start=min_date, end=max_date, freq=freq)
+
+        if max_records is not None and len(full_dates) > max_records:
+            full_dates = full_dates[-max_records:]

-        #
-        calendar_dict =
-            for i, col in enumerate(group_columns)}
+        # 2. Expansion
+        calendar_dict = current_group_info.copy()
         calendar_dict[date_column] = full_dates
-
         full_calendar = pd.DataFrame(calendar_dict)

-        #
-        merged = full_calendar.merge(
-            group,
-            on=group_columns + [date_column],
-            how="left"
-        )
-
-        # ---- Step 3: Mark missing rows based on selected columns ----
-        merged["is_missing_record"] = merged[variable].isna()
+        # 3. Merge
+        merged = full_calendar.merge(group, on=group_columns + [date_column], how="left")

+        total_len = len(merged)
+        interpolated_count = merged[variable].isna().sum()
+        interpolation_rate = interpolated_count / total_len if total_len > 0 else 0
+
+        # --- Check 1: Min Records ---
+        if total_len < min_records:
+            drop_entry = current_group_info.copy()
+            drop_entry.update({
+                "reason": "Below Min Records",
+                "details": f"Total records {total_len} < {min_records}",
+                "dropped_records": total_len
+            })
+            dropped_metrics.append(drop_entry)
+            continue

-        #
-
+        # --- Check 2: Max Interpolation Rate ---
+        if interpolation_rate > 0.25:
+            drop_entry = current_group_info.copy()
+            drop_entry.update({
+                "reason": "High Interpolation",
+                "details": f"{interpolation_rate:.1%} > 25%",
+                "dropped_records": total_len
+            })
+            dropped_metrics.append(drop_entry)
+            continue

-
-
+        # --- Success: Interpolate ---
+        merged["is_missing_record"] = merged[variable].isna()
+        merged[variable] = merged[variable].interpolate(method="linear", limit_direction="both")

+        success_entry = current_group_info.copy()
+        success_entry.update({
+            "initial_records": len(group),
+            "interpolated_count": interpolated_count,
+            "final_records": total_len,
+            "interpolation_pct": round(interpolation_rate * 100, 2)
+        })
+        success_metrics.append(success_entry)
         full_group_data.append(merged)

-
-
-
+    # Convert lists of dicts to DataFrames
+    final_df = pd.concat(full_group_data, ignore_index=True) if full_group_data else pd.DataFrame()
+    success_report = pd.DataFrame(success_metrics)
+    exclusion_report = pd.DataFrame(dropped_metrics)
+
+    return final_df, success_report, exclusion_report
+
+
+def print_anomaly_stats(final_results, success_report, exclusion_report, group_columns, interpolation_method="linear"):
+    # 1. Calculate Global Counts
+    total_records = len(final_results)
+    total_anomalies = final_results['is_Anomaly'].fillna(False).astype(bool).sum()
+    anomaly_rate = (total_anomalies / total_records) * 100 if total_records > 0 else 0
+
+    # 2. Extract specific stats from reports
+    # If exclusion_report is passed but empty, len() returns 0
+    num_excluded = len(exclusion_report)
+    total_groups = len(success_report) + num_excluded
+    evaluated_groups = len(success_report)
+
+    # Interpolation stats
+    total_interpolated_records = success_report['interpolated_count'].sum() if not success_report.empty else 0
+    groups_with_interpolation = success_report[success_report['interpolated_count'] > 0].shape[0] if not success_report.empty else 0
+
+    # 3. Handle Exclusion stats (check if empty to avoid filtering errors)
+    if num_excluded > 0:
+        missing_data_exclusions = exclusion_report[exclusion_report['reason'] == "High Interpolation"].shape[0]
+        insufficient_history_exclusions = exclusion_report[exclusion_report['reason'] == "Below Min Records"].shape[0]
+    else:
+        missing_data_exclusions = 0
+        insufficient_history_exclusions = 0

+    # --- START PRINTING ---
+    print("\n" + "="*55)
+    print(f"{'ANOMALY DETECTION EXECUTIVE SUMMARY':^55}")
+    print("="*55)
+
+    stats_table = [
+        ["Total Groups", f"{total_groups:,}"],
+        ["Total Records", f"{total_records:,}"],
+        ["Evaluated Groups", f"{evaluated_groups:,}"],
+        ["Evaluated Records", f"{total_records:,}"],
+        ["Evaluated Anomalies", f"{total_anomalies:,}"],
+        ["Anomaly Rate", f"{anomaly_rate:.2f}%"]
+    ]
+
+    for label, val in stats_table:
+        print(f"{label:<25} : {val:>25}")
+
+    print("-" * 55)

-
-
-
-
-
-
+    # Interpolation Details
+    print(f"INTERPOLATION REPORT:")
+    print(f"{total_interpolated_records:,} records were missing from {groups_with_interpolation} groups")
+    print(f"Values were interpolated using the {interpolation_method} method.")
+    print(f"Total {total_interpolated_records:,} additional records are added to the data.")
+
+    # Show 5 examples of interpolated records
+    if 'is_interpolated' in final_results.columns:
+        interpolated_samples = final_results[final_results['is_interpolated'] == True].head(5)
+        if not interpolated_samples.empty:
+            print("\nExample Interpolated Records:")
+            # Only show group columns, timestamp (ds), and value (y)
+            cols_to_show = group_columns + ['ds', 'y']
+            print(interpolated_samples[cols_to_show].to_string(index=False))
+
+    print("-" * 55)

-
-    print(f"
-
-
-
-
-
+    # Exclusion Details
+    print(f"EXCLUSION SUMMARY:")
+    if num_excluded > 0:
+        print(f"- {missing_data_exclusions} groups had >25% missing data and could not be interpolated.")
+        print(f"- {insufficient_history_exclusions} groups lacked the minimum historical data to train.")
+        print(f"See exclusion_report for full list of IDs.")
+    else:
+        print("- No groups were excluded from this run.")

-
-    print(f"Top 5 Groups by Anomaly Rate ({' > '.join(group_columns)}):")
+    print("-" * 55)

-    #
-
-    group_stats =
+    # Group Breakdown
+    print(f"TOP 5 GROUPS BY ANOMALY RATE ({' > '.join(group_columns)}):")
+    group_stats = final_results.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum']).sort_values(by='mean', ascending=False).head(5)

     for label, row in group_stats.iterrows():
-        # Handle
-        group_label = label if isinstance(label, str) else " | ".join(map(str, label))
-
-
+        # Handle tuple-based index for multi-grouping
+        group_label = label if isinstance(label, (str, int)) else " | ".join(map(str, label))
+        print(f" - {group_label:<30} : {row['mean']*100:>6.2f}% ({int(row['sum'])} anomalies)")
+
+    print("="*55 + "\n")
+
+
+def calculate_ensemble_scores(df, variable):
+    """
+    Calculates the normalized consensus score across all anomaly models.
+    """
+
+    # Identify all columns that are model flags (is_..._anomaly)
+    anomaly_flags = [col for col in df.columns if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_Anomaly']
+
+    # 1. Total Votes (Count of True)
+    df['Anomaly_Votes'] = df[anomaly_flags].sum(axis=1).astype(int)
+
+    # 2. Total Models active for that row (Count of non-NaN values)
+    df['Vote_Cnt'] = df[anomaly_flags].notna().sum(axis=1).astype(int)
+
+    # 3. Anomaly Votes Score Display (x out of N)
+    df['Anomaly_Votes_Display'] = df['Anomaly_Votes'].astype(int).astype(str) + " out of " + df['Vote_Cnt'].astype(int).astype(str)
+
+    # 5. Final Boolean Consensus (e.g., majority rule)
+    df['is_Anomaly'] = df['Anomaly_Votes'] / df['Vote_Cnt'] >= 0.5
+
+    # 6. Scale all the model scores to be between -1 and 1
+    try:
+        df['Percentile_score_scaled'] = np.where(df['is_Percentile_anomaly'].isna()==False,
+            abs(df[variable] - (df['Percentile_high'] + df['Percentile_low'])/2)/((df['Percentile_high'] - df['Percentile_low'])/2) - 1,
+            np.nan)
+        df['Percentile_score_scaled'] = df['Percentile_score_scaled']/abs(df['Percentile_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['SD_score_scaled'] = np.where(df['is_SD_anomaly'].isna()==False,
+            abs(df[variable] - (df['SD2_high'] + df['SD2_low'])/2)/((df['SD2_high'] - df['SD2_low'])/2) - 1,
+            np.nan)
+        df['SD_score_scaled'] = df['SD_score_scaled']/abs(df['SD_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['MAD_score_scaled'] = np.where(df['is_MAD_anomaly'].isna()==False,
+            abs(df[variable] - (df['MAD_high'] + df['MAD_low'])/2)/((df['MAD_high'] - df['MAD_low'])/2) - 1,
+            np.nan)
+        df['MAD_score_scaled'] = df['MAD_score_scaled']/abs(df['MAD_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['IQR_score_scaled'] = np.where(df['is_IQR_anomaly'].isna()==False,
+            abs(df[variable] - (df['IQR_high'] + df['IQR_low'])/2)/((df['IQR_high'] - df['IQR_low'])/2) - 1,
+            np.nan)
+        df['IQR_score_scaled'] = df['IQR_score_scaled']/abs(df['IQR_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['EWMA_score_scaled'] = np.where(df['is_EWMA_anomaly'].isna()==False,
+            abs(df[variable] - (df['EWMA_high'] + df['EWMA_low'])/2)/((df['EWMA_high'] - df['EWMA_low'])/2) - 1,
+            np.nan)
+        df['EWMA_score_scaled'] = df['EWMA_score_scaled']/abs(df['EWMA_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['FB_score_scaled'] = np.where(df['is_FB_anomaly'].isna()==False,
+            abs(df[variable] - (df['FB_high'] + df['FB_low'])/2)/((df['FB_high'] - df['FB_low'])/2) - 1,
+            np.nan)
+        df['FB_score_scaled'] = df['FB_score_scaled']/abs(df['FB_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['IsoForest_score_scaled'] = np.where(df['is_IsolationForest_anomaly'].isna()==False,
+            df['IsolationForest_score'] - df['IsolationForest_score_low'],
+            np.nan)
+        df['IsoForest_score_scaled'] = df['IsoForest_score_scaled']/abs(df['IsoForest_score_scaled']).max()
+    except:
+        pass
+
+    try:
+        df['dbscan_score_scaled'] = np.where(df['is_DBSCAN_anomaly'].isna()==False, df['dbscan_score_high'] - df['dbscan_score'], np.nan)
+        df['dbscan_score_scaled'] = df['dbscan_score_scaled']/abs(df['dbscan_score_scaled']).max()
+    except:
+        pass
+
+    score_scaled_cols = []
+    for col in df.columns.to_list():
+        if '_score_scaled' in col:
+            score_scaled_cols.append(col)
+
+    df['Anomaly_Score'] = df[score_scaled_cols].mean(axis=1)
+    # Rescale all non anomalies between 0 and 0.5 and anomalies between 0.5 and 1.0
+    if len(df[df['is_Anomaly'] == True]) >= 1:
+        # df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] = ((df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] + 1) * 0.245) + 0.51

-
-
-
+        is_anomaly_min = df[df['is_Anomaly'] == True]['Anomaly_Score'].min()
+        is_anomaly_max = df[df['is_Anomaly'] == True]['Anomaly_Score'].max()
+        # Scale to [0, 0.49] based on actual data range
+        if is_anomaly_max == is_anomaly_min:
+            df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] = df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] * 0 + 0.51
+        else:
+            df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] = (((df.loc[df['is_Anomaly'] == True, 'Anomaly_Score'] - is_anomaly_min) / (is_anomaly_max - is_anomaly_min)) * 0.48) + 0.52

-
+    if len(df[df['is_Anomaly'] == False]) >= 1:
+        not_anomaly_min = df[df['is_Anomaly'] == False]['Anomaly_Score'].min()
+        not_anomaly_max = df[df['is_Anomaly'] == False]['Anomaly_Score'].max()
+        # Scale to [0, 0.49] based on actual data range
+        if not_anomaly_max == not_anomaly_min:
+            df.loc[df['is_Anomaly'] == False, 'Anomaly_Score'] = df.loc[df['is_Anomaly'] == False, 'Anomaly_Score'] * 0  # Default to 0 if constant
+        else:
+            df.loc[df['is_Anomaly'] == False, 'Anomaly_Score'] = ((df.loc[df['is_Anomaly'] == False, 'Anomaly_Score'] - not_anomaly_min) / (not_anomaly_max - not_anomaly_min)) * 0.48
+
+    df['Anomaly_Score_Display'] = np.where(df['is_Anomaly'] == True, np.ceil(100 * df['Anomaly_Score']), np.floor(100 * df['Anomaly_Score'])).astype(int)
+
+    # 7. Reposition is_Anomaly column to the end
+    df['is_Anomaly'] = df.pop('is_Anomaly')
+
+    return df

+
+
+def min_records_extraction(freq, eval_period):
+    freq_upper = freq.upper()
+
+    if freq_upper.startswith('W'):
+        annual_count = 52
+    elif freq_upper.startswith('D') or freq_upper.startswith('B'):
+        annual_count = 365
+    elif freq_upper.startswith('M'):
+        annual_count = 12
+    else:
+        # Fallback to weekly if custom/unknown
+        annual_count = 52
+
+    # Logic: 1 year for min, 2 years for max
+    min_records = annual_count + eval_period
+    #max_records = (2 * annual_count) + eval_period
+
+    return min_records
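For orientation, a minimal usage sketch of the reworked preprocessing API follows. It assumes anomaly-pipeline 0.1.61 is installed and that these functions are importable from anomaly_pipeline.helpers.Preprocessing as the hunk suggests; the column names ("store", "week", "sales") and the weekly frequency are illustrative, not taken from the package.

    import pandas as pd
    from anomaly_pipeline.helpers.Preprocessing import (
        create_full_calendar_and_interpolate,
        min_records_extraction,
    )

    # Hypothetical weekly data for a single group; column names are placeholders.
    master_data = pd.DataFrame({
        "store": ["A"] * 80,
        "week": pd.date_range("2023-01-02", periods=80, freq="W-MON"),
        "sales": [100.0 + i for i in range(80)],
    })

    eval_period = 12
    # Weekly frequency -> 52 records per year + the evaluation window = 64
    min_records = min_records_extraction(freq="W-MON", eval_period=eval_period)

    final_df, success_report, exclusion_report = create_full_calendar_and_interpolate(
        master_data,
        group_columns=["store"],
        variable="sales",
        date_column="week",
        freq="W-MON",
        min_records=min_records,
        max_records=None,
    )
    print(success_report)    # per-group interpolation stats
    print(exclusion_report)  # groups dropped by the min-records / interpolation gates

Groups shorter than min_records, or needing more than 25% of their calendar rows interpolated, land in exclusion_report rather than in final_df, so callers can report them instead of silently losing them.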
anomaly_pipeline/helpers/STD.py
CHANGED

@@ -3,6 +3,70 @@ import numpy as np
 from .Preprocessing import classify

 def detect_outliers_sd(group, variable, date_column, eval_period):
+
+    """
+    # 📈 Standard-Deviation–Based Outlier Detection (Expanding Window)
+
+    ## **Function:** `detect_outliers_sd`
+
+    This function detects anomalies in a time series using a mean ± 2 standard deviation (SD) rule, applied in a train–test, expanding-window framework.
+
+    ---
+
+    ## 🔍 **What the Function Does**
+
+    ### **1. Minimum Data Requirement**
+    - Requires **at least 10 observations**
+    - Returns an empty DataFrame if insufficient data is provided
+
+    ---
+
+    ## 🏋️ **Training Phase**
+    *(Initial fixed window)*
+
+    - Uses all observations **prior to the evaluation period**
+    - Computes:
+      - **Mean**
+      - **Standard Deviation**
+      - **Lower bound:** `max(mean − 2 × SD, 0)`
+      - **Upper bound:** `mean + 2 × SD`
+    - Flags anomalies where values fall **outside the 2-SD range**
+    - Labels rows as **TRAIN**
+
+    ---
+
+    ## 🔁 **Evaluation Phase**
+    *(Expanding window)*
+
+    For each step in the evaluation period:
+    - Expands the training window to include all prior observations
+    - Recomputes **mean and SD dynamically**
+    - Recalculates anomaly bounds
+    - Tests the current observation against updated bounds
+    - Labels rows as **TEST**
+
+    ---
+
+    ## 🚨 **Anomaly Classification**
+
+    Each observation receives:
+    - **`SD_anomaly`** → categorical label via `classify()`
+    - **`is_SD_anomaly`** → boolean flag
+      - `True` if outside ±2 SD
+      - `False` otherwise
+
+    ---
+
+    ## 📊 **Output Columns Added**
+
+    - **Mean**
+    - **SD**
+    - **SD2_low**
+    - **SD2_high**
+    - **set** (`TRAIN` or `TEST`)
+    - **SD_anomaly**
+    - **is_SD_anomaly**"""
+
     n = len(group)
     # checking the min_size requirements
     if n < 10:
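The new docstring above describes the rule in prose; the sketch below is a self-contained re-implementation of the same mean ± 2·SD expanding-window idea for illustration only. It is not the package's detect_outliers_sd (which also assigns the SD_anomaly label via classify() and emits the Mean/SD columns); only the bound logic and the TRAIN/TEST split follow the docstring.

    import pandas as pd

    def sd_bounds_expanding(values: pd.Series, eval_period: int) -> pd.DataFrame:
        """Illustrative mean +/- 2*SD rule with a fixed train window and an
        expanding window over the evaluation period (not the packaged code)."""
        n = len(values)
        train_end = n - eval_period
        rows = []
        for i in range(n):
            # TRAIN rows share the fixed initial window; TEST rows expand it to
            # include every observation before the current one.
            window = values.iloc[:train_end] if i < train_end else values.iloc[:i]
            mean, sd = window.mean(), window.std()
            low, high = max(mean - 2 * sd, 0), mean + 2 * sd
            rows.append({
                "value": values.iloc[i],
                "SD2_low": low,
                "SD2_high": high,
                "set": "TRAIN" if i < train_end else "TEST",
                "is_SD_anomaly": not (low <= values.iloc[i] <= high),
            })
        return pd.DataFrame(rows)

    # A stable series with one spike inside the 4-point evaluation window.
    series = pd.Series([10, 11, 9, 10, 12, 10, 9, 11, 10, 10] * 2 + [10.5, 9.8, 30.0, 10.2])
    print(sd_bounds_expanding(series, eval_period=4).tail())

Running the sketch flags only the 30.0 observation in the TEST segment, which is the behavior the docstring describes for points outside the ±2 SD band.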
anomaly_pipeline/helpers/__init__.py
CHANGED

@@ -1 +1,13 @@
-from .
+from .help_anomaly import help_anomaly, get_example_df
+from .evaluation_info import evaluation_info
+from .evaluation_plots import (
+    anomaly_overview_plot,
+    anomaly_percentile_plot,
+    anomaly_sd_plot,
+    anomaly_mad_plot,
+    anomaly_iqr_plot,
+    anomaly_ewma_plot,
+    anomaly_fb_plot,
+    anomaly_dbscan_plot,
+    anomaly_isolation_forest_plot
+)
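Because helpers/__init__.py now re-exports these symbols, downstream code can import them from the subpackage directly. A hedged example, assuming 0.1.61 is installed:

    # Flat imports enabled by the expanded helpers/__init__.py in 0.1.61.
    from anomaly_pipeline.helpers import (
        help_anomaly,        # from the renamed module: help_info.py -> help_anomaly.py
        get_example_df,
        evaluation_info,
        anomaly_overview_plot,
    )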
anomaly_pipeline/helpers/evaluation_info.py
CHANGED

@@ -1,8 +1,15 @@
 import pandas as pd
 import numpy as np
 from IPython.display import display, Markdown
-from
-
+from .evaluation_plots import (anomaly_overview_plot,
+                               anomaly_percentile_plot,
+                               anomaly_sd_plot,
+                               anomaly_mad_plot,
+                               anomaly_iqr_plot,
+                               anomaly_ewma_plot,
+                               anomaly_fb_plot,
+                               anomaly_dbscan_plot,
+                               anomaly_isolation_forest_plot)


 def evaluation_info(
@@ -10,7 +17,7 @@ def evaluation_info(
     group_columns,
     variable,
     date_column,
-    eval_period
+    eval_period,
     models_to_plot=[]
 ):

@@ -42,7 +49,7 @@ def evaluation_info(
     interpolation_msg = ""

     no_eval_groups = (
-        eval_df.groupby(
+        eval_df.groupby(group_columns)['is_Anomaly']\
         .agg(is_all_na=lambda x: x.isna().all(), historical_data_points='size')\
         .reset_index()
     )
@@ -81,7 +88,7 @@ To increase the chance of evaluating these records, lower the `eval_period` para
     display(Markdown(eval_msg1))

     if interpolated_cnt >= 1:
-        display(eval_df[eval_df['is_missing_record'] == True].sample(5))
+        display(eval_df[eval_df['is_missing_record'] == True].sample(min(interpolated_cnt, 5)))

     display(Markdown(eval_msg2))

@@ -102,20 +109,21 @@ To increase the chance of evaluating these records, lower the `eval_period` para

     for model in models_to_plot:
         if model == 'overall':
-
+            anomaly_overview_plot(group_df, group_columns, variable, date_column, eval_period=12, show_anomaly_scores_on_main_plot=False)
         elif model == 'percentile':
-            anomaly_percentile_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column,
-        elif model == 'iqr':
-            anomaly_iqr_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, final_anomalies=False, eval_period=12)
-        elif model == 'mad':
-            anomaly_mad_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, final_anomalies=False, eval_period=12)
+            anomaly_percentile_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
         elif model == 'std':
-            anomaly_sd_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column,
+            anomaly_sd_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
+        elif model == 'mad':
+            anomaly_mad_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
+        elif model == 'iqr':
+            anomaly_iqr_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
         elif model == 'ewma':
-            anomaly_ewma_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column,
+            anomaly_ewma_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
         elif model == 'prophet':
-            anomaly_fb_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column,
-        elif model == 'dbscan':
-            anomaly_dbscan_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, final_anomalies=False, eval_period=12)
+            anomaly_fb_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
         elif model == 'isolation_forest':
-
+            anomaly_isolation_forest_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
+        elif model == 'dbscan':
+            anomaly_dbscan_plot(group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, eval_period=eval_period, final_anomalies=False)
+
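Taken together, these hunks thread the caller's eval_period into the per-model plot calls (several previously hard-coded eval_period=12) and cap the interpolated-record sample at the number of rows actually available. A hypothetical call is sketched below; only group_columns, variable, date_column, eval_period, and models_to_plot are visible in this diff, so the first positional argument and the column names are assumptions.

    from anomaly_pipeline.helpers import evaluation_info

    def show_reports(final_results):
        """final_results: assumed to be the scored DataFrame from a prior pipeline run."""
        evaluation_info(
            final_results,
            group_columns=["store"],   # placeholder column names
            variable="sales",
            date_column="week",
            eval_period=12,
            models_to_plot=["overall", "std", "ewma", "prophet"],
        )

Note that in this version the 'overall' branch still calls anomaly_overview_plot with eval_period=12, while the other model branches forward the user-supplied eval_period and set final_anomalies=False.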