anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/percentile.py
CHANGED

@@ -7,7 +7,52 @@ from .Preprocessing import classify
 # Anomaly category columns (optional, keep if you still want string labels)
 
 
-def detect_outliers_percentile(group, variable,date_column,eval_period):
+def detect_outliers_percentile(group, variable, date_column, eval_period):
+
+    """# 📈 PERCENTILE MODEL
+    ---
+
+    The `detect_outliers_percentile` function is a robust anomaly detection tool designed to identify **statistical outliers** in
+    time series or grouped data using a dynamic, **expanding window percentile approach**.
+
+    ## 📋 Functional Overview
+    The function operates by partitioning the data into an initial training set and a subsequent evaluation period. It establishes
+    **"normal" behavior** based on the 5th and 95th percentiles of the available historical data, flagging any value that falls
+    outside these bounds as an anomaly.
+
+    ## 🧠 Core Logic Stages
+
+    ### 1. Data Preparation and Validation
+    > **Minimum Threshold:** The function requires at least **10 data points** to run; otherwise, it returns an empty DataFrame to
+    prevent statistically insignificant results.
+    >
+    > **Copying:** It creates a copy of the input group to ensure the original data remains unaltered during the calculation process.
+
+    ### 2. Initial Training Block
+    * **Static Baseline:** For the first part of the data (everything before the `eval_period`), the function calculates a single
+    static baseline using the 5th and 95th percentiles of the entire training block.
+    * **Classification:** It applies these fixed bounds to the training rows, labeling them using a helper `classify` function and
+    assigning a boolean `is_Percentile_anomaly` flag.
+
+    ### 3. Expanding Window Evaluation
+    * **Sequential Testing:** For each data point in the evaluation period (the last *n* points specified by `eval_period`), the
+    function recalculates the percentiles using **all previously seen data points**.
+    * **Dynamic Adaptation:** As the loop progresses, the "training set" grows. This allows the model to adapt to gradual shifts in
+    the data distribution, as the thresholds for the current point are informed by every point that came before it.
+    * **Real-time Simulation:** By calculating the bounds for point $i$ based only on points $0$ to $i-1$, the function simulates how
+    the model would perform in a live environment.
+
+    ## 📤 Key Output Columns
+    The function appends the following columns to the returned DataFrame:
+    * **`Percentile_low` / `Percentile_high`**: The specific thresholds used to evaluate that row.
+    * **`Percentile_anomaly`**: A categorical label (likely "High," "Low," or "Normal") generated by the external `classify` function.
+    * **`is_Percentile_anomaly`**: A boolean flag indicating whether the value was outside the 5%–95% range.
+
+    ## 💡 Usage Context
+    This function is particularly useful for detecting spikes or drops in metrics where the underlying distribution might **drift
+    slowly over time**. By using percentiles rather than standard deviations, it is more resilient to extreme historical outliers
+    that might otherwise skew a mean-based threshold."""
+
     n = len(group)
     if n < 10:
         # Optional: log specific keys if they exist in your scope
@@ -61,5 +106,3 @@ def detect_outliers_percentile(group, variable,date_column,eval_period):
     group[date_column] = pd.to_datetime(group[date_column])
 
     return group
-
-
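The new docstring describes an expanding-window percentile scheme: fixed 5th/95th bounds for the training block, then per-point bounds recomputed from all prior observations during the evaluation period. A minimal sketch of that logic on a plain numeric series (illustrative only; the shipped function also handles grouping and delegates labeling to the package's `classify` helper, both omitted here):

```python
import numpy as np
import pandas as pd

def expanding_percentile_flags(values: pd.Series, eval_period: int) -> pd.DataFrame:
    """Flag points outside the 5th-95th percentile band, expanding-window style."""
    n = len(values)
    records = []
    # Static baseline: one pair of bounds for the whole training block
    train = values.iloc[: n - eval_period]
    low, high = np.percentile(train, [5, 95])
    for v in train:
        records.append((v, low, high, not (low <= v <= high)))
    # Expanding window: bounds for point i come from points 0..i-1 only
    for i in range(n - eval_period, n):
        low, high = np.percentile(values.iloc[:i], [5, 95])
        v = values.iloc[i]
        records.append((v, low, high, not (low <= v <= high)))
    return pd.DataFrame(records, columns=["value", "Percentile_low",
                                          "Percentile_high", "is_Percentile_anomaly"])

flags = expanding_percentile_flags(pd.Series(range(20)), eval_period=2)
print(flags.tail(3))  # the last two rows were judged against expanding bounds
```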
anomaly_pipeline/main.py
CHANGED
@@ -1,63 +1,182 @@
 from .pipeline import run_pipeline
+import pandas as pd
 
-def timeseries_anomaly_detection(
-
-
-
-
+def timeseries_anomaly_detection(
+    master_data=None,
+    group_columns = None,
+    variable= None,
+    date_column = None,
+    freq="W-MON",
+    min_records=None,
+    max_records =None,
+    contamination=0.03,
+    random_state=42,
+    alpha=0.3,
+    sigma=1.5,
+    eval_period=1,
+    prophet_CI=0.90,
+    mad_threshold=2,
+    mad_scale_factor=0.6745
+):
 
     """
     Performs anomaly detection on grouped time-series data.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Timeseries_anomaly_detection is designed to identify anomalous values on a single column that is time-ordered. The data should have a time component. Currently, we support daily, weekly, and monthly data. Data for missing time units is interpolated. Maximum interpolation is 25% of the series. Combines 8 models (Statistical + ML) to provide a robust Anomaly_Score and a final is_Anomaly consensus. The pipeline utilizes an ensemble of the following methodologies:
+
+    - Statistical: Percentile (5th/95th), Standard Deviation (SD), Median Absolute Deviation (MAD), and Interquartile Range (IQR).
+
+    - Time-Series Specific: EWMA (Exponentially Weighted Moving Average) and FB Prophet (Walk-forward validation).
+
+    - Machine Learning: Isolation Forest (General & Time-series optimized) and DBSCAN.
+
+    For more information, ask for help on each specific model, such as below:
+
+    ```python
+
+    from anomaly_pipeline import help_anomaly
+    help_anomaly('fb')
+
+    ```
+
+    # Mandatory Columns:
+    - master_data: Input DataFrame containing variables, dates, and group identifiers.
+    - group_columns: List of columns used to segment the data (e.g., ['Region', 'Product']).
+    - variable (numeric): The numerical target column to analyze for outliers.
+    - date_column: The datetime column representing the time axis.
+
+    # Default arguments:
+    - freq (str): Frequency of the time series (Pandas offset alias). Defaults to 'W-MON'.
+    - min_records: Minimum history required per group. Default is None; If None, extracts based on freq (1 Year + eval_period). Ex: if freq is weekly and eval_period is 1: min_records = 52+1.
+    - max_records: Maximum history to retain per group. Default is None; if provided, filters for the most recent N records.
+    - contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
+    - random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
+    - alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
+    - sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
+    - eval_period: The number of trailing records in each group to evaluate for anomalies.
+    - prophet_CI (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
 
     Returns:
-        pd.DataFrame
+        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+            - final_results: The main dataframe containing original data, interpolated values,
+              forecasts, residuals, and anomaly flags (e.g., is_FB_anomaly, is_IQR_anomaly).
+            - success_report: A summary table for successful groups showing 'initial_record_count',
+              'interpolated_record_count', and 'interpolation_pct'.
+            - exclusion_report: A diagnostic table listing groups dropped from the analysis
+              and the specific reason (e.g., "Insufficient records" or "High Interpolation").
+
     """
+    # making robust with input parameters
+    if isinstance(group_columns, str):
+        group_columns = [group_columns]
+
+
+
+    # --- 1. MANDATORY PARAMETER VALIDATION ---
+    required_params = {
+        "master_data": master_data,
+        "group_columns": group_columns,
+        "variable": variable,
+        "date_column": date_column
+    }
+
+    missing_params = [name for name, val in required_params.items() if val is None]
 
-
+    if missing_params:
+        print("\n" + "!"*60)
+        print("❌ ERROR: MISSING REQUIRED PARAMETERS")
+        print("The following parameters are required to run the detection:")
+        for param in missing_params:
+            print(f"  - {param}")
+
+        print("\n💡 HINT: Use help(timeseries_anomaly_detection) to see detailed")
+        print("descriptions and expected formats for each parameter.")
+        print("!"*60 + "\n")
+        return  # Exit early
+
+
+    # --- 2. MANDATORY COLUMN VALIDATION ---
+    mandatory_cols = group_columns + [variable, date_column]
+    missing_cols = [col for col in mandatory_cols if col not in master_data.columns]
+
+    if missing_cols:
+        raise ValueError(
+            f"CRITICAL ERROR: Mandatory columns missing from input DataFrame: {missing_cols}. "
+            f"Please ensure group_columns, variable, and date_column are correctly spelled."
+        )
+        return  # Exit early
+
+    # Check if the variable is numeric
+    if not pd.api.types.is_numeric_dtype(master_data[variable]):
+        raise TypeError(f"CRITICAL: The variable '{variable}' must be numeric, but found {master_data[variable].dtype}.")
+
+    # --- 3. EXECUTE PIPELINE ---
+    # Store results in a local variable first
+    final_df, success_report, exclusion_report = run_pipeline(
         master_data=master_data,
         group_columns=group_columns,
         variable=variable,
         date_column=date_column,
         freq=freq,
-        max_records=max_records,
         min_records=min_records,
+        max_records=max_records,
         contamination=contamination,
        random_state=random_state,
        alpha=alpha,
        sigma=sigma,
        eval_period=eval_period,
-
-        mad_threshold
-        mad_scale_factor
-
+        prophet_CI=prophet_CI,
+        mad_threshold=mad_threshold,
+        mad_scale_factor=mad_scale_factor
     )
-
-    print("Anomaly pipeline successfully invoked via python -m!")
 
-
-    #
-    #
-
-
-
+    import inspect
+    # Inside your timeseries_anomaly_detection function:
+    # 1. Get the line of code that called this function
+    frame = inspect.currentframe().f_back
+    call_line = ""
+    if frame and inspect.getframeinfo(frame).code_context:
+        call_line = inspect.getframeinfo(frame).code_context[0].strip()
+
+    # 2. Check if the user assigned the result to variables
+    # We split by the function name and check the part before it (index 0)
+    is_assigned = False
+    if "timeseries_anomaly_detection" in call_line:
+        prefix = call_line.split("timeseries_anomaly_detection")[0]
+        # If there is exactly one '=', it's an assignment
+        if prefix.count("=") == 1:
+            is_assigned = True
+
+    # 3. If NOT assigned, trigger the "Auto-Save" to the global namespace
+    if not is_assigned:
+        from IPython import get_ipython
+        shell = get_ipython()
+        if shell:
+            shell.user_ns['final_results'] = final_df
+            shell.user_ns['success_report'] = success_report
+            shell.user_ns['exclusion_report'] = exclusion_report
 
+            print("\n" + "*"*60)
+            print("🚀 AUTO-SAVE: Variables were not assigned.")
+            print("The outputs have been saved globally for you as:")
+            print("  - final_results, success_report, exclusion_report")
+            print("*"*60 + "\n")
+
+    # 4. Final return logic
+    if is_assigned:
+        # Determine if the user assigned to a single variable or multiple
+        prefix = call_line.split("=")[0].strip()
+
+        # If there's no comma in the assignment prefix, they used a single variable
+        if "," not in prefix:
+            print(f"\n💡 INFO: You assigned the output to a single variable: '{prefix}'")
+            print(f"   This variable is a tuple containing 3 DataFrames. Access them via:")
+            print(f"   1. Results Data: {prefix}[0]")
+            print(f"   2. Success Report: {prefix}[1]")
+            print(f"   3. Exclusion List: {prefix}[2]")
+            print(f"   Or unpack them: final_df, success, exclusion = {prefix}\n")
+
+        return final_df, success_report, exclusion_report
+    else:
+        # Return None so Jupyter doesn't print the "wall of text"
+        return None
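The new signature and docstring above define the package's public entry point and its three-DataFrame return. A hypothetical call against weekly data, with made-up file and column names, might look like this (a sketch, not taken from the package's docs):

```python
import pandas as pd
from anomaly_pipeline import timeseries_anomaly_detection

# Hypothetical input: one row per region/product/week with a numeric measure
df = pd.read_csv("weekly_sales.csv")

final_results, success_report, exclusion_report = timeseries_anomaly_detection(
    master_data=df,
    group_columns=["Region", "Product"],  # example grouping from the docstring
    variable="units_sold",                # hypothetical numeric target column
    date_column="week_start",             # hypothetical datetime column
    freq="W-MON",
    eval_period=4,
)
print(exclusion_report)  # groups that were dropped, and why
```

Note that the assignment matters: per the `inspect`-based logic above, calling the function in an IPython session without assigning the result stores `final_results`, `success_report`, and `exclusion_report` directly into the user namespace and returns `None`.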
anomaly_pipeline/pipeline.py
CHANGED
@@ -11,10 +11,21 @@ from .helpers.ewma import ewma_with_anomalies_rolling_group
 from .helpers.fb_prophet import detect_time_series_anomalies_fb_walkforward
 from .helpers.iso_forest_timeseries import detect_time_series_anomalies_isoforest
 from .helpers.DB_scan import detect_time_series_anomalies_dbscan
-from .helpers.Preprocessing import create_full_calendar_and_interpolate,
+from .helpers.Preprocessing import (create_full_calendar_and_interpolate,
+                                    print_anomaly_stats,
+                                    calculate_ensemble_scores,
+                                    min_records_extraction)
+
+from .helpers.evaluation_plots import (summary_pie_plot,
+                                       anomaly_stacked_bar_plot,
+                                       avg_anomaly_score_plot,
+                                       anomaly_overview_plot)
+
+from IPython.display import display, Markdown
+
 
 def process_group(model, name, group, group_columns, variable,
-                  date_column, alpha, sigma, eval_period,
+                  date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state):
 
     if model == "ISF_general":
         return detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
@@ -26,7 +37,7 @@ def process_group(model, name, group, group_columns, variable,
 
     if model == "FB":
         return detect_time_series_anomalies_fb_walkforward(
-            group, variable, date_column, eval_period,
+            group, variable, date_column, eval_period, prophet_CI
         )
 
     if model == 'ISF_timeseries':
@@ -41,19 +52,28 @@ def process_group(model, name, group, group_columns, variable,
 
 
 def run_pipeline(master_data, group_columns, variable,
-                 date_column, freq,
-                 max_records, min_records,
+                 date_column, freq, min_records,max_records,
                  contamination, random_state,
                  alpha, sigma, eval_period,
-
+                 prophet_CI, mad_threshold, mad_scale_factor):
+
+    if min_records is None:
+        min_records = min_records_extraction(freq,eval_period)
+        print(f"Min records needed to run an anomaly pipeline for a group is {min_records}")
+
+    if max_records is not None:
+        max_records = max_records + eval_period
+        print(f"Max records used to run an anomaly pipeline for a group is {max_records}")
 
     # preprocess calendar
-    final_data = create_full_calendar_and_interpolate(
+    final_data, success_report, exclusion_report = create_full_calendar_and_interpolate(
        master_data,
        group_columns,
        variable,
        date_column,
-       freq
+       freq,
+       min_records,
+       max_records
    )
 
    groups = list(final_data.groupby(group_columns))
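`min_records_extraction` lives in `Preprocessing` and its body is not part of this diff; going by the `main.py` docstring ("1 Year + eval_period", e.g. `min_records = 52+1` for weekly data), its behavior is presumably along these lines. This is a guess for orientation, not the packaged code:

```python
def min_records_extraction(freq: str, eval_period: int) -> int:
    # Hypothetical reconstruction: roughly one year of records for the
    # given pandas offset alias, plus the evaluation period.
    periods_per_year = {"D": 365, "W": 52, "M": 12}  # daily, weekly, monthly
    for prefix, per_year in periods_per_year.items():
        if freq.upper().startswith(prefix):
            return per_year + eval_period
    return 52 + eval_period  # default to the weekly assumption

print(min_records_extraction("W-MON", 1))  # 53, matching the docstring example
```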
@@ -113,7 +133,7 @@ def run_pipeline(master_data, group_columns, variable,
 
 
     ## ISF_general
-    results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period,
+    results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
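This hunk, and the matching EWMA, FB Prophet, Isolation Forest, and DBSCAN hunks below, all thread `prophet_CI`, `contamination`, and `random_state` through the same joblib fan-out. The pattern in isolation, with a stand-in worker in place of the real `process_group`:

```python
from joblib import Parallel, delayed

def process_group(model, name, group):
    # Stand-in worker: the real dispatcher runs one detector per group
    return (model, name, sum(group))

groups = [("east", [1, 2, 3]), ("west", [4, 5])]

# One task per group; n_jobs=-1 spreads them across all available cores
results = Parallel(n_jobs=-1, verbose=0)(
    delayed(process_group)("EWMA", name, group) for name, group in groups
)
print(results)  # [('EWMA', 'east', 6), ('EWMA', 'west', 9)]
```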
@@ -127,7 +147,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## EWMA
     results_EWMA = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('EWMA', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -138,15 +158,14 @@ def run_pipeline(master_data, group_columns, variable,
     #print("anomaly_key_channel_EWMA data frame created")
     #print(anomaly_key_channel_EWMA.head())
     EWMA_cols = group_columns+[date_column]+['alpha', 'sigma', 'EWMA_forecast',
-                                             'STD', 'EWMA_high', 'EWMA_low','is_EWMA_anomaly']
+                                             'STD', 'EWMA_high', 'EWMA_low',"EWMA_residual", "EWMA_anomaly",'is_EWMA_anomaly']
 
     anomaly_key_channel_EWMA_final = anomaly_key_channel_EWMA[EWMA_cols]
 
 
-
-
+    ## FB
     results_fb = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('FB', name, group,group_columns, variable,date_column,
-                                                alpha, sigma, eval_period,
+                                                alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -166,7 +185,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## Isolation Forest timeseries
     results_ISF_timeseries = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('ISF_timeseries', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
 
 
     # Combine results back
@@ -175,7 +194,7 @@ def run_pipeline(master_data, group_columns, variable,
         .sort_values(by=group_columns+[date_column])
     )
     #print(anomaly_key_channel_ISF_timeseries.head())
-    ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "is_IsolationForest_anomaly_timeseries"]
+    ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
     anomaly_key_channel_ISF_timeseries_final = anomaly_key_channel_ISF_timeseries[ISF_cols]
 
     #print("anomaly_key_channel_ISF_timeseries data frame created")
@@ -184,7 +203,7 @@ def run_pipeline(master_data, group_columns, variable,
     ## DB Scan
     results_DB = Parallel(n_jobs=-1, verbose=0)(
         delayed(process_group)('DBSCAN', name, group,group_columns, variable, date_column,
-                               alpha, sigma, eval_period,
+                               alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
 
     # Combine results back
     anomaly_key_channel_DB= (
@@ -196,7 +215,7 @@ def run_pipeline(master_data, group_columns, variable,
     #print("anomaly_key_channel_DB data frame created")
     #print(anomaly_key_channel_DB.head())
 
-    DB_cols = group_columns+[date_column]+["dbscan_score", "is_DBSCAN_anomaly"]
+    DB_cols = group_columns+[date_column]+["dbscan_score", "dbscan_score_high", "is_DBSCAN_anomaly"]
     anomaly_key_channel_DB_final = anomaly_key_channel_DB[DB_cols]
 
     # combine ISF general and timeseries data frames
@@ -209,12 +228,16 @@ def run_pipeline(master_data, group_columns, variable,
                                                             anomaly_key_channel_ISF['IsolationForest_score_general'],
                                                             anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
 
+    anomaly_key_channel_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                                    anomaly_key_channel_ISF['IsolationForest_score_low_general'],
+                                                                    anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
+
     # Column 2 Logic: If 'type' is train, take from 'IsolationForest_general', else take from 'IsolationForest_timeseries'
     anomaly_key_channel_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
 
-    ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'is_IsolationForest_anomaly']
+    ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'IsolationForest_score_low', 'is_IsolationForest_anomaly']
     anomaly_key_channel_ISF_final = anomaly_key_channel_ISF[ISF_cols]
 
 
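The stitching added here picks the general-model score for TRAIN rows and the time-series score for evaluation rows. On toy data the `np.where` pattern reduces to:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "set": ["TRAIN", "TRAIN", "EVAL"],
    "score_general": [0.10, 0.20, 0.30],
    "score_timeseries": [0.70, 0.80, 0.90],
})
# Row-wise selection: general score on TRAIN rows, time-series score elsewhere
df["IsolationForest_score"] = np.where(df["set"] == "TRAIN",
                                       df["score_general"],
                                       df["score_timeseries"])
print(df["IsolationForest_score"].tolist())  # [0.1, 0.2, 0.9]
```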
@@ -231,23 +254,72 @@ def run_pipeline(master_data, group_columns, variable,
     anomaly = anomaly.merge(anomaly_key_channel_fb_final, on= group_columns+[date_column], how= 'inner')
     anomaly = anomaly.merge(anomaly_key_channel_ISF_final, on= group_columns+[date_column], how= 'inner')
     anomaly = anomaly.merge(anomaly_key_channel_DB_final, on= group_columns+[date_column], how= 'inner')
+    anomaly_final = calculate_ensemble_scores(anomaly, variable)
+    globals()['anomaly_df'] = anomaly_final
+    #print(anomaly_final.head())
+    #print(f"Successfully processed {len(success_report)} groups.")
+    #print(f"Excluded {len(exclusion_report)} groups due to low quality.")
+
+    print_anomaly_stats(anomaly_final, success_report, exclusion_report,group_columns,interpolation_method="linear")
 
-
-
-
-
-
+    # Plot summary charts
+    # ------------------------------
+
+    # Get data for pie chart
+    pie_chart_df = anomaly_final['is_Anomaly'].value_counts().reset_index()
+    pie_chart_df['is_Anomaly'] = np.where(pie_chart_df['is_Anomaly'] == True, 'Anomalous Records', 'Evaluated Records')
+    pie_chart_df = pie_chart_df.rename(columns={'is_Anomaly': 'Records'})
+    if len(exclusion_report) > 0:
+        pie_chart_df = pd.concat([pie_chart_df,
+                                  pd.DataFrame({'Records': ['Dropped Records'], 'count': [exclusion_report['dropped_records'].sum()]})])
+        exclusion_report = exclusion_report.drop(columns='dropped_records')
+    print("")
+    summary_pie_plot(pie_chart_df, title=f"Anomaly Detection Summary for {len(master_data[group_columns].drop_duplicates())} Groups")
+    anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line=variable)
+    anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line='Anomaly_Score')
+    avg_anomaly_score_plot(anomaly_final, group_columns, date_column)
+
+    top_5_anomaly_groups = anomaly_final.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum', 'count']).reset_index()\
+                                        .sort_values('mean', ascending=False).reset_index(drop=True).head(5)
 
-
-
-
-
-
+    eval_plots_msg = f"""
+---
+### Overall Evaluation Plots of the {len(top_5_anomaly_groups)} Groups with the Highest Anomaly Rates
+
+Here is how to view detailed plots of individual anomaly detection models per group.\n
+Start with the main (first) DataFrame returned from the timeseries_anomaly_detection function.\n
+Suppose you called that DataFrame anomaly_df, that the group_columns are 'taxonomy' and 'channel', and that you want to see all the plots for the group where 'taxonomy' = 'tools' and 'channel' = 'mobile'.
+Then you could run this code block:\n
+
+```python
+from anomaly_pipeline import evaluation_info
+from anomaly_pipeline import help_anomaly
+
+group_values = ['tools', 'mobile']
+mask = anomaly_df[group_columns].eq(group_values).all(axis=1)
+group_df = anomaly_df[mask]
+
+evaluation_info(
+    group_df,
+    group_columns,
+    variable,
+    date_column,
+    eval_period)
+```
+---
+"""
+
+    display(Markdown(eval_plots_msg))
 
+    group_nbr = 1
+    for group_key, group in top_5_anomaly_groups.groupby(group_columns, sort=False):
+        anomaly_rate = group['mean'].values[0]
+        group_df = anomaly_final.merge(group[group_columns], on=group_columns, how='inner')
+        group_id = group_df[group_columns].drop_duplicates().astype(str).apply(lambda x: ' -- '.join(x), axis=1).values[0]
+        group_msg = f"""#### #{group_nbr}, Anomaly Rate: {anomaly_rate:.1%}, Group: {group_id}"""
+        display(Markdown(group_msg))
+        anomaly_overview_plot(group_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)
+        group_nbr += 1
 
-
+    return anomaly_final, success_report, exclusion_report
 
-    return anomaly