anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,52 @@ from .Preprocessing import classify
7
7
  # Anomaly category columns (optional, keep if you still want string labels)
8
8
 
9
9
 
10
- def detect_outliers_percentile(group, variable,date_column,eval_period):
10
+ def detect_outliers_percentile(group, variable, date_column, eval_period):
11
+
12
+ """# 📈 PERCENTILE MODEL
13
+ ---
14
+
15
+ The `detect_outliers_percentile` function is a robust anomaly detection tool designed to identify **statistical outliers** in
16
+ time series or grouped data using a dynamic, **expanding window percentile approach**.
17
+
18
+ ## 📋 Functional Overview
19
+ The function operates by partitioning the data into an initial training set and a subsequent evaluation period. It establishes
20
+ **"normal" behavior** based on the 5th and 95th percentiles of the available historical data, flagging any value that falls
21
+ outside these bounds as an anomaly.
22
+
23
+ ## 🧠 Core Logic Stages
24
+
25
+ ### 1. Data Preparation and Validation
26
+ > **Minimum Threshold:** The function requires at least **10 data points** to run; otherwise, it returns an empty DataFrame to
27
+ prevent statistically insignificant results.
28
+ >
29
+ > **Copying:** It creates a copy of the input group to ensure the original data remains unaltered during the calculation process.
30
+
31
+ ### 2. Initial Training Block
32
+ * **Static Baseline:** For the first part of the data (everything before the `eval_period`), the function calculates a single
33
+ static baseline using the 5th and 95th percentiles of the entire training block.
34
+ * **Classification:** It applies these fixed bounds to the training rows, labeling them using a helper `classify` function and
35
+ assigning a boolean `is_Percentile_anomaly` flag.
36
+
37
+ ### 3. Expanding Window Evaluation
38
+ * **Sequential Testing:** For each data point in the evaluation period (the last *n* points specified by `eval_period`), the
39
+ function recalculates the percentiles using **all previously seen data points**.
40
+ * **Dynamic Adaptation:** As the loop progresses, the "training set" grows. This allows the model to adapt to gradual shifts in
41
+ the data distribution, as the thresholds for the current point are informed by every point that came before it.
42
+ * **Real-time Simulation:** By calculating the bounds for point $i$ based only on points $0$ to $i-1$, the function simulates how
43
+ the model would perform in a live environment.
44
+
45
+ ## 📤 Key Output Columns
46
+ The function appends the following columns to the returned DataFrame:
47
+ * **`Percentile_low` / `Percentile_high`**: The specific thresholds used to evaluate that row.
48
+ * **`Percentile_anomaly`**: A categorical label (likely "High," "Low," or "Normal") generated by the external `classify` function.
49
+ * **`is_Percentile_anomaly`**: A boolean flag indicating whether the value was outside the 5%–95% range.
50
+
51
+ ## 💡 Usage Context
52
+ This function is particularly useful for detecting spikes or drops in metrics where the underlying distribution might **drift
53
+ slowly over time**. By using percentiles rather than standard deviations, it is more resilient to extreme historical outliers
54
+ that might otherwise skew a mean-based threshold."""
55
+
11
56
  n = len(group)
12
57
  if n < 10:
13
58
  # Optional: log specific keys if they exist in your scope
@@ -61,5 +106,3 @@ def detect_outliers_percentile(group, variable,date_column,eval_period):
61
106
  group[date_column] = pd.to_datetime(group[date_column])
62
107
 
63
108
  return group
64
-
65
-
anomaly_pipeline/main.py CHANGED
@@ -1,63 +1,182 @@
1
1
  from .pipeline import run_pipeline
2
+ import pandas as pd
2
3
 
3
- def timeseries_anomaly_detection(master_data, group_columns, variable,
4
- date_column="week_start", freq="W-MON",
5
- max_records=104, min_records=15,
6
- contamination=0.03, random_state=42,
7
- alpha=0.3, sigma=1.5, eval_period=12,
8
- interval_width=0.90, mad_threshold = 2, mad_scale_factor = 0.6745):
4
+ def timeseries_anomaly_detection(
5
+ master_data=None,
6
+ group_columns = None,
7
+ variable= None,
8
+ date_column = None,
9
+ freq="W-MON",
10
+ min_records=None,
11
+ max_records =None,
12
+ contamination=0.03,
13
+ random_state=42,
14
+ alpha=0.3,
15
+ sigma=1.5,
16
+ eval_period=1,
17
+ prophet_CI=0.90,
18
+ mad_threshold=2,
19
+ mad_scale_factor=0.6745
20
+ ):
9
21
 
10
22
  """
11
23
  Performs anomaly detection on grouped time-series data.
12
24
 
13
- This function identifies outliers within specific groups of data by analyzing
14
- historical trends, applying statistical thresholds, and calculating
15
- prediction intervals.
16
-
17
- Args:
18
- master_data (pd.DataFrame): The input dataset containing the time series.
19
- group_columns (list[str]): Columns used to partition the data (e.g., ['store_id', 'item_id']).
20
- variable (str): The target numerical column to analyze for anomalies.
21
- date_column (str): The column containing datetime information. Defaults to 'week_start'.
22
- freq (str): Frequency of the time series (Pandas offset alias). Defaults to 'W-MON'.
23
- max_records (int): Maximum historical records to consider for the model. Defaults to 104.
24
- min_records (int): Minimum records required to perform detection. Defaults to 15.
25
- contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
26
- random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
27
- alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
28
- sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
29
- eval_periods (int): Number of recent periods to evaluate for anomalies. Defaults to 12.
30
- interval_width (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
25
+ `timeseries_anomaly_detection` identifies anomalous values in a single, time-ordered column. The data must have a time component; daily, weekly, and monthly data are currently supported. Values for missing time units are interpolated, up to a maximum of 25% of the series. The function combines 8 models (statistical + ML) to provide a robust Anomaly_Score and a final is_Anomaly consensus. The pipeline uses an ensemble of the following methodologies:
26
+
27
+ - Statistical: Percentile (5th/95th), Standard Deviation (SD), Median Absolute Deviation (MAD), and Interquartile Range (IQR).
28
+
29
+ - Time-Series Specific: EWMA (Exponentially Weighted Moving Average) and FB Prophet (Walk-forward validation).
30
+
31
+ - Machine Learning: Isolation Forest (General & Time-series optimized) and DBSCAN.
32
+
33
+ For more information, ask for help on each specific model, such as below:
34
+
35
+ ```python
36
+
37
+ from anomaly_pipeline import help_anomaly
38
+ help_anomaly('fb')
39
+
40
+ ```
41
+
42
+ # Mandatory Parameters:
43
+ - master_data: Input DataFrame containing variables, dates, and group identifiers.
44
+ - group_columns: List of columns used to segment the data (e.g., ['Region', 'Product']).
45
+ - variable (numeric): The numerical target column to analyze for outliers.
46
+ - date_column: The datetime column representing the time axis.
47
+
48
+ # Default arguments:
49
+ - freq (str): Frequency of the time series (Pandas offset alias). Defaults to 'W-MON'.
50
+ - min_records: Minimum history required per group. Defaults to None, in which case it is derived from freq as one year of history plus eval_period. Example: with weekly data and eval_period = 1, min_records = 52 + 1 = 53.
51
+ - max_records: Maximum history to retain per group. Defaults to None; if provided, eval_period is added to it and only the most recent records up to that total are kept.
52
+ - contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
53
+ - random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
54
+ - alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
55
+ - sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
56
+ - eval_period (int): The number of trailing records in each group to evaluate for anomalies. Defaults to 1.
57
+ - prophet_CI (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
31
58
 
32
59
  Returns:
33
- pd.DataFrame: The original dataframe appended with anomaly flags and scores.
60
+ tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
61
+ - final_results: The main dataframe containing original data, interpolated values,
62
+ forecasts, residuals, and anomaly flags (e.g., is_FB_anomaly, is_IQR_anomaly).
63
+ - success_report: A summary table for successful groups showing 'initial_record_count',
64
+ 'interpolated_record_count', and 'interpolation_pct'.
65
+ - exclusion_report: A diagnostic table listing groups dropped from the analysis
66
+ and the specific reason (e.g., "Insufficient records" or "High Interpolation").
67
+
34
68
  """
69
+ # Normalize inputs: accept a single group column passed as a plain string
70
+ if isinstance(group_columns, str):
71
+ group_columns = [group_columns]
72
+
73
+
74
+
75
+ # --- 1. MANDATORY PARAMETER VALIDATION ---
76
+ required_params = {
77
+ "master_data": master_data,
78
+ "group_columns": group_columns,
79
+ "variable": variable,
80
+ "date_column": date_column
81
+ }
82
+
83
+ missing_params = [name for name, val in required_params.items() if val is None]
35
84
 
36
- return run_pipeline(
85
+ if missing_params:
86
+ print("\n" + "!"*60)
87
+ print("❌ ERROR: MISSING REQUIRED PARAMETERS")
88
+ print("The following parameters are required to run the detection:")
89
+ for param in missing_params:
90
+ print(f" - {param}")
91
+
92
+ print("\n💡 HINT: Use help(timeseries_anomaly_detection) to see detailed")
93
+ print("descriptions and expected formats for each parameter.")
94
+ print("!"*60 + "\n")
95
+ return # Exit early
96
+
97
+
98
+ # --- 2. MANDATORY COLUMN VALIDATION ---
99
+ mandatory_cols = group_columns + [variable, date_column]
100
+ missing_cols = [col for col in mandatory_cols if col not in master_data.columns]
101
+
102
+ if missing_cols:
103
+ raise ValueError(
104
+ f"CRITICAL ERROR: Mandatory columns missing from input DataFrame: {missing_cols}. "
105
+ f"Please ensure group_columns, variable, and date_column are correctly spelled."
106
+ )
107
+ return # Exit early
108
+
109
+ # Check if the variable is numeric
110
+ if not pd.api.types.is_numeric_dtype(master_data[variable]):
111
+ raise TypeError(f"CRITICAL: The variable '{variable}' must be numeric, but found {master_data[variable].dtype}.")
112
+
113
+ # --- 3. EXECUTE PIPELINE ---
114
+ # Store results in a local variable first
115
+ final_df, success_report, exclusion_report = run_pipeline(
37
116
  master_data=master_data,
38
117
  group_columns=group_columns,
39
118
  variable=variable,
40
119
  date_column=date_column,
41
120
  freq=freq,
42
- max_records=max_records,
43
121
  min_records=min_records,
122
+ max_records=max_records,
44
123
  contamination=contamination,
45
124
  random_state=random_state,
46
125
  alpha=alpha,
47
126
  sigma=sigma,
48
127
  eval_period=eval_period,
49
- interval_width=interval_width,
50
- mad_threshold = mad_threshold,
51
- mad_scale_factor = mad_scale_factor
52
-
128
+ prophet_CI=prophet_CI,
129
+ mad_threshold=mad_threshold,
130
+ mad_scale_factor=mad_scale_factor
53
131
  )
54
-
55
- print("Anomaly pipeline successfully invoked via python -m!")
56
132
 
57
- # change test_weeks to eval_periods: automate min_records based on eval_periods,
58
- # max_records = max_records + eval_records
59
- # freq_daily: max_records based on frequency (for version 2) 104 for weekly
60
- # split all the 5 functions and parametrize all the variables
61
- # change interval_width name to prophet_CI
62
- # change FB_anomaly column to high low and none insted of -1, 1, 0
133
+ import inspect
134
+ # Inside your timeseries_anomaly_detection function:
135
+ # 1. Get the line of code that called this function
136
+ frame = inspect.currentframe().f_back
137
+ call_line = ""
138
+ if frame and inspect.getframeinfo(frame).code_context:
139
+ call_line = inspect.getframeinfo(frame).code_context[0].strip()
140
+
141
+ # 2. Check if the user assigned the result to variables
142
+ # We split by the function name and check the part before it (index 0)
143
+ is_assigned = False
144
+ if "timeseries_anomaly_detection" in call_line:
145
+ prefix = call_line.split("timeseries_anomaly_detection")[0]
146
+ # If there is exactly one '=', it's an assignment
147
+ if prefix.count("=") == 1:
148
+ is_assigned = True
149
+
150
+ # 3. If NOT assigned, trigger the "Auto-Save" to the global namespace
151
+ if not is_assigned:
152
+ from IPython import get_ipython
153
+ shell = get_ipython()
154
+ if shell:
155
+ shell.user_ns['final_results'] = final_df
156
+ shell.user_ns['success_report'] = success_report
157
+ shell.user_ns['exclusion_report'] = exclusion_report
63
158
 
159
+ print("\n" + "*"*60)
160
+ print("🚀 AUTO-SAVE: Variables were not assigned.")
161
+ print("The outputs have been saved globally for you as:")
162
+ print(" - final_results, success_report, exclusion_report")
163
+ print("*"*60 + "\n")
164
+
165
+ # 4. Final return logic
166
+ if is_assigned:
167
+ # Determine if the user assigned to a single variable or multiple
168
+ prefix = call_line.split("=")[0].strip()
169
+
170
+ # If there's no comma in the assignment prefix, they used a single variable
171
+ if "," not in prefix:
172
+ print(f"\n💡 INFO: You assigned the output to a single variable: '{prefix}'")
173
+ print(f" This variable is a tuple containing 3 DataFrames. Access them via:")
174
+ print(f" 1. Results Data: {prefix}[0]")
175
+ print(f" 2. Success Report: {prefix}[1]")
176
+ print(f" 3. Exclusion List: {prefix}[2]")
177
+ print(f" Or unpack them: final_df, success, exclusion = {prefix}\n")
178
+
179
+ return final_df, success_report, exclusion_report
180
+ else:
181
+ # Return None so Jupyter doesn't print the "wall of text"
182
+ return None
@@ -11,10 +11,21 @@ from .helpers.ewma import ewma_with_anomalies_rolling_group
11
11
  from .helpers.fb_prophet import detect_time_series_anomalies_fb_walkforward
12
12
  from .helpers.iso_forest_timeseries import detect_time_series_anomalies_isoforest
13
13
  from .helpers.DB_scan import detect_time_series_anomalies_dbscan
14
- from .helpers.Preprocessing import create_full_calendar_and_interpolate, print_anomaly_stats
14
+ from .helpers.Preprocessing import (create_full_calendar_and_interpolate,
15
+ print_anomaly_stats,
16
+ calculate_ensemble_scores,
17
+ min_records_extraction)
18
+
19
+ from .helpers.evaluation_plots import (summary_pie_plot,
20
+ anomaly_stacked_bar_plot,
21
+ avg_anomaly_score_plot,
22
+ anomaly_overview_plot)
23
+
24
+ from IPython.display import display, Markdown
25
+
15
26
 
16
27
  def process_group(model, name, group, group_columns, variable,
17
- date_column, alpha, sigma, eval_period, interval_width, contamination, random_state):
28
+ date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state):
18
29
 
19
30
  if model == "ISF_general":
20
31
  return detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
@@ -26,7 +37,7 @@ def process_group(model, name, group, group_columns, variable,
26
37
 
27
38
  if model == "FB":
28
39
  return detect_time_series_anomalies_fb_walkforward(
29
- group, variable, date_column, eval_period, interval_width
40
+ group, variable, date_column, eval_period, prophet_CI
30
41
  )
31
42
 
32
43
  if model == 'ISF_timeseries':
@@ -41,19 +52,28 @@ def process_group(model, name, group, group_columns, variable,
41
52
 
42
53
 
43
54
  def run_pipeline(master_data, group_columns, variable,
44
- date_column, freq,
45
- max_records, min_records,
55
+ date_column, freq, min_records,max_records,
46
56
  contamination, random_state,
47
57
  alpha, sigma, eval_period,
48
- interval_width, mad_threshold, mad_scale_factor):
58
+ prophet_CI, mad_threshold, mad_scale_factor):
59
+
60
+ if min_records is None:
61
+ min_records = min_records_extraction(freq,eval_period)
62
+ print(f"Min records needed to run an anomaly pipeline for a group is {min_records}")
63
+
64
+ if max_records is not None:
65
+ max_records = max_records + eval_period
66
+ print(f"Max records used to run an anomaly pipeline for a group is {max_records}")
49
67
 
50
68
  # preprocess calendar
51
- final_data = create_full_calendar_and_interpolate(
69
+ final_data, success_report, exclusion_report = create_full_calendar_and_interpolate(
52
70
  master_data,
53
71
  group_columns,
54
72
  variable,
55
73
  date_column,
56
- freq
74
+ freq,
75
+ min_records,
76
+ max_records
57
77
  )
58
78
 
59
79
  groups = list(final_data.groupby(group_columns))
@@ -113,7 +133,7 @@ def run_pipeline(master_data, group_columns, variable,
113
133
 
114
134
 
115
135
  ## ISF_general
116
- results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period, interval_width, contamination, random_state) for name, group in groups)
136
+ results_ISF_general = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('ISF_general', name, group, group_columns, variable,date_column, alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
117
137
 
118
138
 
119
139
  # Combine results back
@@ -127,7 +147,7 @@ def run_pipeline(master_data, group_columns, variable,
127
147
  ## EWMA
128
148
  results_EWMA = Parallel(n_jobs=-1, verbose=0)(
129
149
  delayed(process_group)('EWMA', name, group,group_columns, variable, date_column,
130
- alpha, sigma, eval_period, interval_width, contamination, random_state) for name, group in groups)
150
+ alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
131
151
 
132
152
 
133
153
  # Combine results back
@@ -138,15 +158,14 @@ def run_pipeline(master_data, group_columns, variable,
138
158
  #print("anomaly_key_channel_EWMA data frame created")
139
159
  #print(anomaly_key_channel_EWMA.head())
140
160
  EWMA_cols = group_columns+[date_column]+['alpha', 'sigma', 'EWMA_forecast',
141
- 'STD', 'EWMA_high', 'EWMA_low','is_EWMA_anomaly']
161
+ 'STD', 'EWMA_high', 'EWMA_low',"EWMA_residual", "EWMA_anomaly",'is_EWMA_anomaly']
142
162
 
143
163
  anomaly_key_channel_EWMA_final = anomaly_key_channel_EWMA[EWMA_cols]
144
164
 
145
165
 
146
- ## FB
147
-
166
+ ## FB
148
167
  results_fb = Parallel(n_jobs=-1, verbose=0)(delayed(process_group)('FB', name, group,group_columns, variable,date_column,
149
- alpha, sigma, eval_period, interval_width, contamination, random_state) for name, group in groups)
168
+ alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
150
169
 
151
170
 
152
171
  # Combine results back
@@ -166,7 +185,7 @@ def run_pipeline(master_data, group_columns, variable,
166
185
  ## Isolation Forest timeseries
167
186
  results_ISF_timeseries = Parallel(n_jobs=-1, verbose=0)(
168
187
  delayed(process_group)('ISF_timeseries', name, group,group_columns, variable, date_column,
169
- alpha, sigma, eval_period, interval_width, contamination, random_state) for name, group in groups)
188
+ alpha, sigma, eval_period, prophet_CI, contamination, random_state) for name, group in groups)
170
189
 
171
190
 
172
191
  # Combine results back
@@ -175,7 +194,7 @@ def run_pipeline(master_data, group_columns, variable,
175
194
  .sort_values(by=group_columns+[date_column])
176
195
  )
177
196
  #print(anomaly_key_channel_ISF_timeseries.head())
178
- ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "is_IsolationForest_anomaly_timeseries"]
197
+ ISF_cols = group_columns+[date_column]+["IsolationForest_score_timeseries", "IsolationForest_score_low_timeseries", "is_IsolationForest_anomaly_timeseries"]
179
198
  anomaly_key_channel_ISF_timeseries_final = anomaly_key_channel_ISF_timeseries[ISF_cols]
180
199
 
181
200
  #print("anomaly_key_channel_ISF_timeseries data frame created")
@@ -184,7 +203,7 @@ def run_pipeline(master_data, group_columns, variable,
184
203
  ## DB Scan
185
204
  results_DB = Parallel(n_jobs=-1, verbose=0)(
186
205
  delayed(process_group)('DBSCAN', name, group,group_columns, variable, date_column,
187
- alpha, sigma, eval_period, interval_width, contamination, random_state) for name, group in groups)
206
+ alpha, sigma, eval_period,prophet_CI, contamination, random_state) for name, group in groups)
188
207
 
189
208
  # Combine results back
190
209
  anomaly_key_channel_DB= (
@@ -196,7 +215,7 @@ def run_pipeline(master_data, group_columns, variable,
196
215
  #print("anomaly_key_channel_DB data frame created")
197
216
  #print(anomaly_key_channel_DB.head())
198
217
 
199
- DB_cols = group_columns+[date_column]+["dbscan_score", "is_DBSCAN_anomaly"]
218
+ DB_cols = group_columns+[date_column]+["dbscan_score", "dbscan_score_high", "is_DBSCAN_anomaly"]
200
219
  anomaly_key_channel_DB_final = anomaly_key_channel_DB[DB_cols]
201
220
 
202
221
  # combine ISF general and timeseries data frames
@@ -209,12 +228,16 @@ def run_pipeline(master_data, group_columns, variable,
209
228
  anomaly_key_channel_ISF['IsolationForest_score_general'],
210
229
  anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
211
230
 
231
+ anomaly_key_channel_ISF['IsolationForest_score_low'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
232
+ anomaly_key_channel_ISF['IsolationForest_score_low_general'],
233
+ anomaly_key_channel_ISF['IsolationForest_score_low_timeseries'])
234
+
212
235
  # Column 2 Logic: If 'set' is 'TRAIN', take from 'is_IsolationForest_anomaly_general', else take from 'is_IsolationForest_anomaly_timeseries'
213
236
  anomaly_key_channel_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
214
237
  anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
215
238
  anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
216
239
 
217
- ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'is_IsolationForest_anomaly']
240
+ ISF_cols = group_columns+[date_column]+['IsolationForest_score', 'IsolationForest_score_low', 'is_IsolationForest_anomaly']
218
241
  anomaly_key_channel_ISF_final = anomaly_key_channel_ISF[ISF_cols]
219
242
 
220
243
 
@@ -231,23 +254,72 @@ def run_pipeline(master_data, group_columns, variable,
231
254
  anomaly = anomaly.merge(anomaly_key_channel_fb_final, on= group_columns+[date_column], how= 'inner')
232
255
  anomaly = anomaly.merge(anomaly_key_channel_ISF_final, on= group_columns+[date_column], how= 'inner')
233
256
  anomaly = anomaly.merge(anomaly_key_channel_DB_final, on= group_columns+[date_column], how= 'inner')
257
+ anomaly_final = calculate_ensemble_scores(anomaly, variable)
258
+ globals()['anomaly_df'] = anomaly_final
259
+ #print(anomaly_final.head())
260
+ #print(f"Successfully processed {len(success_report)} groups.")
261
+ #print(f"Excluded {len(exclusion_report)} groups due to low quality.")
262
+
263
+ print_anomaly_stats(anomaly_final, success_report, exclusion_report,group_columns,interpolation_method="linear")
234
264
 
235
- # ---- Unified anomaly flag (majority voting) ----
236
- anomaly_flags = [
237
- 'is_Percentile_anomaly',
238
- 'is_SD_anomaly', 'is_MAD_anomaly',
239
- 'is_IQR_anomaly',
240
- 'is_EWMA_anomaly', 'is_FB_anomaly','is_IsolationForest_anomaly','is_DBSCAN_anomaly']
265
+ # Plot summary charts
266
+ # ------------------------------
267
+
268
+ # Get data for pie chart
269
+ pie_chart_df = anomaly_final['is_Anomaly'].value_counts().reset_index()
270
+ pie_chart_df['is_Anomaly'] = np.where(pie_chart_df['is_Anomaly'] == True, 'Anomalous Records', 'Evaluated Records')
271
+ pie_chart_df = pie_chart_df.rename(columns={'is_Anomaly': 'Records'})
272
+ if len(exclusion_report) > 0:
273
+ pie_chart_df = pd.concat([pie_chart_df,
274
+ pd.DataFrame({'Records': ['Dropped Records'], 'count': [exclusion_report['dropped_records'].sum()]})])
275
+ exclusion_report = exclusion_report.drop(columns='dropped_records')
276
+ print("")
277
+ summary_pie_plot(pie_chart_df, title=f"Anomaly Detection Summary for {len(master_data[group_columns].drop_duplicates())} Groups")
278
+ anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line=variable)
279
+ anomaly_stacked_bar_plot(anomaly_final, group_columns, variable, date_column, secondary_line='Anomaly_Score')
280
+ avg_anomaly_score_plot(anomaly_final, group_columns, date_column)
281
+
282
+ top_5_anomaly_groups = anomaly_final.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum', 'count']).reset_index()\
283
+ .sort_values('mean', ascending=False).reset_index(drop=True).head(5)
241
284
 
242
- anomaly['Anomaly_Votes'] = anomaly[anomaly_flags].sum(axis=1)
243
- # Majority rule: anomaly if flagged by at least half the methods
244
- anomaly['is_Anomaly'] = anomaly['Anomaly_Votes'] >= 4
245
-
246
- # Add refresh_date as the first column
247
- anomaly.insert(0, 'refresh_date', pd.to_datetime(date.today()))
285
+ eval_plots_msg = f"""
286
+ ---
287
+ ### Overall Evaluation Plots of the {len(top_5_anomaly_groups)} Groups with the Highest Anomaly Rates
288
+
289
+ Here is how to view detailed plots of individual anomaly detection models per group.\n
290
+ Start with the main (first) DataFrame returned from the timeseries_anomaly_detection function.\n
291
+ Suppose you called that DataFrame anomaly_df, that the group_columns are 'taxonomy' and 'channel', and that you want to see all the plots for the group where 'taxonomy' = 'tools' and 'channel' = 'mobile'.
292
+ Then you could run this code block:\n
293
+
294
+ ```python
295
+ from anomaly_pipeline import evaluation_info
296
+ from anomaly_pipeline import help_anomaly
297
+
298
+ group_values = ['tools', 'mobile']
299
+ mask = anomaly_df[group_columns].eq(group_values).all(axis=1)
300
+ group_df = anomaly_df[mask]
301
+
302
+ evaluation_info(
303
+ group_df,
304
+ group_columns,
305
+ variable,
306
+ date_column,
307
+ eval_period)
308
+ ```
309
+ ---
310
+ """
311
+
312
+ display(Markdown(eval_plots_msg))
248
313
 
249
- print(anomaly.head())
314
+ group_nbr = 1
315
+ for group_key, group in top_5_anomaly_groups.groupby(group_columns, sort=False):
316
+ anomaly_rate = group['mean'].values[0]
317
+ group_df = anomaly_final.merge(group[group_columns], on=group_columns, how='inner')
318
+ group_id = group_df[group_columns].drop_duplicates().astype(str).apply(lambda x: ' -- '.join(x), axis=1).values[0]
319
+ group_msg = f"""#### #{group_nbr}, Anomaly Rate: {anomaly_rate:.1%}, Group: {group_id}"""
320
+ display(Markdown(group_msg))
321
+ anomaly_overview_plot(group_df, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False)
322
+ group_nbr += 1
250
323
 
251
- print_anomaly_stats(anomaly, group_columns)
324
+ return anomaly_final, success_report, exclusion_report
252
325
 
253
- return anomaly