anomaly-pipeline 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from sklearn.ensemble import IsolationForest
+
+ def detect_outliers_isf_general(group, variable, contamination=0.03, random_state=42, eval_period=12):
+     n = len(group)
+     if n < 10:
+         return pd.DataFrame(columns=group.columns)
+
+     group = group.copy()
+     train_size = n - eval_period
+
+     # Initialize columns
+     group['set'] = ""
+     group['IsolationForest_score_general'] = 0.0
+     group['is_IsolationForest_anomaly_general'] = False
+
+     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
+     # Baseline ISF using all data available before eval_period
+     initial_train = group[[variable]].iloc[:train_size]
+
+     iso = IsolationForest(contamination=contamination, random_state=random_state)
+
+     # Fit once on the initial block
+     iso.fit(initial_train)
+     # decision_function gives the raw anomaly score; predict returns -1 (anomaly) / 1 (normal)
+     group.loc[group.index[:train_size], 'IsolationForest_score_general'] = iso.decision_function(initial_train)
+     group.loc[group.index[:train_size], 'is_IsolationForest_anomaly_general'] = iso.predict(initial_train) == -1
+     group.loc[group.index[:train_size], 'set'] = "TRAIN"
+
+     # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
+     # Iterate through the eval period, growing the training set one point at a time
+     for i in range(train_size, n):
+         # Data available up to this point (expanding window)
+         current_train = group[[variable]].iloc[:i]
+
+         # Re-fit the model on all data known up to point i
+         iso_expanding = IsolationForest(contamination=contamination, random_state=random_state)
+         iso_expanding.fit(current_train)
+
+         # Score the current point i
+         current_point = group[[variable]].iloc[[i]]
+
+         group.iloc[i, group.columns.get_loc('IsolationForest_score_general')] = iso_expanding.decision_function(current_point)[0]
+         group.iloc[i, group.columns.get_loc('is_IsolationForest_anomaly_general')] = iso_expanding.predict(current_point)[0] == -1
+         group.iloc[i, group.columns.get_loc('set')] = "TEST"
+
+     # Cast boolean column properly
+     group['is_IsolationForest_anomaly_general'] = group['is_IsolationForest_anomaly_general'].astype(bool)
+
+     return group
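A minimal usage sketch of the function above (not part of the package; the DataFrame, the column name "y", and the injected spike are hypothetical):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "week_start": pd.date_range("2024-01-01", periods=40, freq="W-MON"),
        "y": rng.normal(100, 5, 40),
    })
    demo.loc[35, "y"] = 180  # obvious spike inside the 12-point evaluation window

    flagged = detect_outliers_isf_general(demo, "y", contamination=0.03, random_state=42, eval_period=12)
    print(flagged.loc[flagged["set"] == "TEST",
                      ["week_start", "y", "IsolationForest_score_general", "is_IsolationForest_anomaly_general"]])

The last eval_period rows come back labelled "TEST" with per-point scores from the expanding-window refits; everything earlier is scored once by the baseline fit and labelled "TRAIN".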
@@ -0,0 +1,123 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.ensemble import IsolationForest
+ from statsmodels.tsa.stattools import acf
+
+ def get_dynamic_lags(series: pd.Series) -> list:
+
+     n = len(series)
+
+     # Determine max lags (min of 50% of the data and a hard cap of 60)
+     nlags = min(int(n * 0.5), 60)
+
+     if nlags < 5:
+         return [1, 2, 3]
+
+     # Calculate ACF and confidence intervals, then keep the 10 most significant lags
+     autocorrelations, confint = acf(series.dropna(), nlags=nlags, alpha=0.25, fft=True)
+     autocorr_values = autocorrelations[1:]
+     conf_limit = confint[1:, 1] - autocorr_values
+     is_significant = np.abs(autocorr_values) > conf_limit
+     significant_autocorr = autocorr_values[is_significant]
+     significant_lags_indices = np.where(is_significant)[0] + 1
+     ranked_indices = np.argsort(np.abs(significant_autocorr))[::-1]
+     top_lags_indices = ranked_indices[:10]
+     top_lags = significant_lags_indices[top_lags_indices].tolist()
+     base_lags = [1, 2, 3]
+     dynamic_lags = sorted(set(base_lags + top_lags))[:10]
+
+     return dynamic_lags
+
+ def detect_time_series_anomalies_isoforest(
+     group,
+     variable,
+     date_column,
+     eval_period,
+ ):
+
+     group[date_column] = pd.to_datetime(group[date_column])
+     group = group.copy().sort_values(date_column).reset_index(drop=True)
+
+     # Iterate over each evaluation period, fitting the model to all the data before the
+     # evaluation period and then getting the predicted anomaly score for that period.
+     try:
+         test_anom = []
+
+         for t in range(eval_period - 1, -1, -1):
+
+             try:
+
+                 # Boundary between rolling train and rolling forecast region
+                 cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)
+
+                 # Get train set to determine lags
+                 model_group = group.copy()
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 lags = get_dynamic_lags(train[variable])
+
+                 # Create lag features on the entire model_group DF
+                 for lag in lags:
+                     model_group[f'lag{lag}'] = model_group[variable].shift(lag)
+
+                 # Get rolling stats features for the entire model_group DF
+                 rolling_stats_features = []
+                 for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
+                     if w >= 3:
+                         rolling_stats_features.append('roll_mean' + str(w))
+                         rolling_stats_features.append('roll_std' + str(w))
+                         model_group['roll_mean' + str(w)] = model_group[variable].shift(1).rolling(w).mean()
+                         model_group['roll_std' + str(w)] = model_group[variable].shift(1).rolling(w).std()
+
+                 # Get trend feature
+                 model_group['trend'] = group.index
+
+                 # Drop records with NAs
+                 model_group = model_group.copy().dropna()
+
+                 # Split into train and test (both now carry all the features)
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 test = model_group[model_group[date_column] == cutoff_date].copy()
+
+                 # Identify all model features (lags, rolling stats, trend, and the variable itself)
+                 features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+                 # Create and fit the model
+                 iso_forest_model = IsolationForest(
+                     n_estimators=200,
+                     contamination=0.01,
+                     random_state=42
+                 )
+                 iso_forest_model.fit(train[features])
+
+                 train['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(train[features])
+                 anomaly_threshold = min(
+                     0,
+                     train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].mean()
+                     - 3 * train[train['IsolationForest_score_timeseries'] > 0]['IsolationForest_score_timeseries'].std()
+                 )
+                 test['IsolationForest_score_timeseries'] = iso_forest_model.decision_function(test[features])
+                 test['contamination_anomaly'] = iso_forest_model.predict(test[features])  # -1 = anomaly, 1 = normal
+                 test['anomaly_threshold'] = anomaly_threshold
+                 test['threshold_anomaly'] = np.where(test['IsolationForest_score_timeseries'] < anomaly_threshold, -1, 1)
+
+                 test['is_IsolationForest_anomaly_timeseries'] = (test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1)
+                 test = test[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']]
+                 test_anom.append(test)
+             except Exception:
+                 pass
+         try:
+             test_anom = pd.concat(test_anom)
+             group = group.merge(test_anom[[variable, date_column, 'IsolationForest_score_timeseries', 'is_IsolationForest_anomaly_timeseries']], on=[variable, date_column], how='left')
+         except Exception:
+             print("Error in Isolation Forest process")
+             group["IsolationForest_score_timeseries"] = np.nan
+             group["is_IsolationForest_anomaly_timeseries"] = np.nan
+
+     except Exception:
+         group["IsolationForest_score_timeseries"] = np.nan
+         group["is_IsolationForest_anomaly_timeseries"] = np.nan
+         # Identify the group from its string/object columns for the failure message
+         group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+         group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].to_list())
+         print(f'Isolation Forest Anomaly Detection failed for {group_id}')
+
+     return group
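A small sketch of how get_dynamic_lags behaves on a synthetic series with a period of 13 (illustrative only; the data and the period are assumptions, not taken from the package):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    t = np.arange(156)
    seasonal = pd.Series(np.sin(2 * np.pi * t / 13) * 10 + rng.normal(0, 1, len(t)))

    print(get_dynamic_lags(seasonal))

The base lags [1, 2, 3] are always kept; the remaining entries are the lags with the largest significant autocorrelations, and the list is capped at 10 entries.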
@@ -0,0 +1,65 @@
+ import pandas as pd
+ import numpy as np
+
+ from .Preprocessing import classify
+
+
+ # Anomaly category columns (optional, keep if you still want string labels)
+
+
+ def detect_outliers_percentile(group, variable, date_column, eval_period):
+     n = len(group)
+     if n < 10:
+         # Optional: log specific keys if they exist in your scope
+         return pd.DataFrame(columns=group.columns)
+
+     group = group.copy()
+     # Explicitly ensure date_column is datetime right at the start
+     group[date_column] = pd.to_datetime(group[date_column])
+     train_size = n - eval_period
+
+     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
+     # Calculate baseline 5th/95th percentile bounds using all data available before eval_period
+     initial_train = group[variable].iloc[:train_size]
+
+     low = initial_train.quantile(0.05)
+     high = initial_train.quantile(0.95)
+
+     # Assign initial bounds to the training rows
+     group.loc[group.index[:train_size], 'set'] = "TRAIN"
+     group.loc[group.index[:train_size], 'Percentile_low'] = low
+     group.loc[group.index[:train_size], 'Percentile_high'] = high
+     group.loc[group.index[:train_size], 'Percentile_anomaly'] = group[variable].iloc[:train_size].apply(
+         lambda x: classify(x, low, high)
+     )
+     group.loc[group.index[:train_size], 'is_Percentile_anomaly'] = (
+         (group[variable].iloc[:train_size] < low) |
+         (group[variable].iloc[:train_size] > high)
+     )
+
+     # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
+     # Iterate through the eval period, growing the training set one point at a time
+     for i in range(train_size, n):
+         # Data available up to this point (expanding)
+         current_train = group[variable].iloc[:i]
+
+         low = current_train.quantile(0.05)
+         high = current_train.quantile(0.95)
+
+         # Test the current point i
+         current_val = group[variable].iloc[i]
+         group.iloc[i, group.columns.get_loc('set')] = "TEST"
+         group.iloc[i, group.columns.get_loc('Percentile_low')] = low
+         group.iloc[i, group.columns.get_loc('Percentile_high')] = high
+         group.iloc[i, group.columns.get_loc('Percentile_anomaly')] = classify(current_val, low, high)
+         group.iloc[i, group.columns.get_loc('is_Percentile_anomaly')] = (current_val < low) or (current_val > high)
+
+     # Cast boolean column properly
+     group['is_Percentile_anomaly'] = group['is_Percentile_anomaly'].astype(bool)
+     # FINAL SAFETY CHECK
+     group[date_column] = pd.to_datetime(group[date_column])
+
+     return group
+
+
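For reference, the expanding-window bound computation in the loop above reduces to plain pandas quantiles; this sketch reproduces just that step on toy numbers (the classify() labeller from .Preprocessing is not reproduced here):

    import pandas as pd

    s = pd.Series([10, 12, 11, 13, 12, 14, 11, 12, 50, 13])
    eval_period = 3
    train_size = len(s) - eval_period

    for i in range(train_size, len(s)):
        history = s.iloc[:i]                      # everything known before point i
        low, high = history.quantile(0.05), history.quantile(0.95)
        is_anom = (s.iloc[i] < low) or (s.iloc[i] > high)
        print(i, round(low, 2), round(high, 2), s.iloc[i], is_anom)

Only the value 50 at position 8 falls outside its expanding 5th–95th percentile band.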
@@ -0,0 +1,63 @@
+ from .pipeline import run_pipeline
+
+ def timeseries_anomaly_detection(master_data, group_columns, variable,
+                                  date_column="week_start", freq="W-MON",
+                                  max_records=104, min_records=15,
+                                  contamination=0.03, random_state=42,
+                                  alpha=0.3, sigma=1.5, eval_period=12,
+                                  interval_width=0.90, mad_threshold=2, mad_scale_factor=0.6745):
+
+     """
+     Performs anomaly detection on grouped time-series data.
+
+     This function identifies outliers within specific groups of data by analyzing
+     historical trends, applying statistical thresholds, and calculating
+     prediction intervals.
+
+     Args:
+         master_data (pd.DataFrame): The input dataset containing the time series.
+         group_columns (list[str]): Columns used to partition the data (e.g., ['store_id', 'item_id']).
+         variable (str): The target numerical column to analyze for anomalies.
+         date_column (str): The column containing datetime information. Defaults to 'week_start'.
+         freq (str): Frequency of the time series (Pandas offset alias). Defaults to 'W-MON'.
+         max_records (int): Maximum historical records to consider for the model. Defaults to 104.
+         min_records (int): Minimum records required to perform detection. Defaults to 15.
+         contamination (float): Expected proportion of outliers in the data (0 to 0.5). Defaults to 0.03.
+         random_state (int): Seed for reproducibility in stochastic models. Defaults to 42.
+         alpha (float): Smoothing factor for trend calculations. Defaults to 0.3.
+         sigma (float): Standard deviation multiplier for thresholding. Defaults to 1.5.
+         eval_period (int): Number of recent periods to evaluate for anomalies. Defaults to 12.
+         interval_width (float): The confidence level for the prediction interval (0 to 1). Defaults to 0.9.
+         mad_threshold (float): Threshold used by the MAD-based detector. Defaults to 2.
+         mad_scale_factor (float): Scale factor applied in the MAD calculation. Defaults to 0.6745.
+
+     Returns:
+         pd.DataFrame: The original dataframe appended with anomaly flags and scores.
+     """
+
+     return run_pipeline(
+         master_data=master_data,
+         group_columns=group_columns,
+         variable=variable,
+         date_column=date_column,
+         freq=freq,
+         max_records=max_records,
+         min_records=min_records,
+         contamination=contamination,
+         random_state=random_state,
+         alpha=alpha,
+         sigma=sigma,
+         eval_period=eval_period,
+         interval_width=interval_width,
+         mad_threshold=mad_threshold,
+         mad_scale_factor=mad_scale_factor
+     )
+
+ # change test_weeks to eval_periods: automate min_records based on eval_periods,
+ # max_records = max_records + eval_records
+ # freq_daily: max_records based on frequency (for version 2), 104 for weekly
+ # split all the 5 functions and parametrize all the variables
+ # change interval_width name to prophet_CI
+ # change FB_anomaly column to high, low, and none instead of -1, 1, 0
+
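A hedged example of calling this public entry point; the file name and the column names ("store_id", "channel", "week_start", "units") are placeholders, not part of the package:

    import pandas as pd
    from anomaly_pipeline.main import timeseries_anomaly_detection

    df = pd.read_csv("weekly_sales.csv")  # hypothetical input: one row per group per week

    result = timeseries_anomaly_detection(
        master_data=df,
        group_columns=["store_id", "channel"],
        variable="units",
        date_column="week_start",
        freq="W-MON",
        eval_period=12,
    )
    print(result[["store_id", "channel", "week_start", "units", "Anomaly_Votes", "is_Anomaly"]].tail())

The returned frame carries each detector's columns plus the combined Anomaly_Votes count and is_Anomaly flag produced by run_pipeline.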
@@ -0,0 +1,253 @@
+ import pandas as pd
+ import numpy as np
+ from datetime import date
+ from joblib import Parallel, delayed
+ from .helpers.percentile import detect_outliers_percentile
+ from .helpers.STD import detect_outliers_sd
+ from .helpers.MAD import detect_outliers_mad
+ from .helpers.IQR import detect_outliers_iqr
+ from .helpers.iso_forest_general import detect_outliers_isf_general
+ from .helpers.ewma import ewma_with_anomalies_rolling_group
+ from .helpers.fb_prophet import detect_time_series_anomalies_fb_walkforward
+ from .helpers.iso_forest_timeseries import detect_time_series_anomalies_isoforest
+ from .helpers.DB_scan import detect_time_series_anomalies_dbscan
+ from .helpers.Preprocessing import create_full_calendar_and_interpolate, print_anomaly_stats
+
+ def process_group(model, name, group, group_columns, variable,
+                   date_column, alpha, sigma, eval_period, interval_width, contamination, random_state):
+
+     if model == "ISF_general":
+         return detect_outliers_isf_general(group, variable, contamination, random_state, eval_period)
+
+     if model == "EWMA":
+         return ewma_with_anomalies_rolling_group(
+             group, group_columns, variable, date_column, alpha, sigma, eval_period
+         )
+
+     if model == "FB":
+         return detect_time_series_anomalies_fb_walkforward(
+             group, variable, date_column, eval_period, interval_width
+         )
+
+     if model == 'ISF_timeseries':
+         return detect_time_series_anomalies_isoforest(
+             group, variable, date_column, eval_period
+         )
+
+     if model == 'DBSCAN':
+         return detect_time_series_anomalies_dbscan(
+             group, variable, date_column, eval_period
+         )
+
+
+ def run_pipeline(master_data, group_columns, variable,
+                  date_column, freq,
+                  max_records, min_records,
+                  contamination, random_state,
+                  alpha, sigma, eval_period,
+                  interval_width, mad_threshold, mad_scale_factor):
+
+     # Preprocess calendar
+     final_data = create_full_calendar_and_interpolate(
+         master_data,
+         group_columns,
+         variable,
+         date_column,
+         freq
+     )
+
+     groups = list(final_data.groupby(group_columns))
+
+     # The simple statistical detectors below run sequentially per group;
+     # the model-based detectors further down run in parallel (all cores: n_jobs=-1)
+
+     ## Percentile / SD / MAD / IQR
+     results_percentile = []
+     results_SD = []
+     results_IQR = []
+     results_MAD = []
+     for name, group in groups:
+         # Percentile
+         res_percentile = detect_outliers_percentile(group, variable, date_column, eval_period)
+         results_percentile.append(res_percentile)
+
+         # SD
+         res_SD = detect_outliers_sd(group, variable, date_column, eval_period)
+         results_SD.append(res_SD)
+
+         # MAD
+         res_MAD = detect_outliers_mad(group, variable, date_column, mad_threshold, mad_scale_factor, eval_period)
+         results_MAD.append(res_MAD)
+
+         # IQR
+         res_IQR = detect_outliers_iqr(group, variable, date_column, eval_period)
+         results_IQR.append(res_IQR)
+
+     anomaly_key_channel_percentile = pd.concat(results_percentile, ignore_index=True)
+
+     #print("anomaly_key_channel_percentile data frame created")
+     #print(anomaly_key_channel_percentile.head())
+
+     anomaly_key_channel_SD = pd.concat(results_SD, ignore_index=True)
+     SD_cols = group_columns + [date_column] + ['Mean', 'SD', 'SD2_low', 'SD2_high', 'SD_anomaly',
+                                                'is_SD_anomaly']
+     anomaly_key_channel_SD_final = anomaly_key_channel_SD[SD_cols]
+
+     #print("anomaly_key_channel_SD data frame created")
+     #print(anomaly_key_channel_SD.head())
+
+     anomaly_key_channel_MAD = pd.concat(results_MAD, ignore_index=True)
+     MAD_cols = group_columns + [date_column] + ['Median', 'MAD', 'MAD_low', 'MAD_high', 'is_MAD_anomaly',
+                                                 'MAD_anomaly']
+     anomaly_key_channel_MAD_final = anomaly_key_channel_MAD[MAD_cols]
+
+     #print("anomaly_key_channel_MAD data frame created")
+     #print(anomaly_key_channel_MAD.head())
+
+     anomaly_key_channel_IQR = pd.concat(results_IQR, ignore_index=True)
+     IQR_cols = group_columns + [date_column] + ['Q1', 'Q3', 'IQR', 'IQR_low', 'IQR_high', 'IQR_anomaly',
+                                                 'is_IQR_anomaly']
+     anomaly_key_channel_IQR_final = anomaly_key_channel_IQR[IQR_cols]
+
+     #print("anomaly_key_channel_IQR data frame created")
+     #print(anomaly_key_channel_IQR.head())
+
+
+     ## ISF_general
+     results_ISF_general = Parallel(n_jobs=-1, verbose=0)(
+         delayed(process_group)('ISF_general', name, group, group_columns, variable, date_column,
+                                alpha, sigma, eval_period, interval_width, contamination, random_state)
+         for name, group in groups)
+
+     # Combine results back
+     anomaly_key_channel_ISF_general = (
+         pd.concat(results_ISF_general)
+         .sort_values(by=group_columns + [date_column])
+     )
+     #print("anomaly_key_channel_ISF_general data frame created")
+     #print(anomaly_key_channel_ISF_general.head())
+
+     ## EWMA
+     results_EWMA = Parallel(n_jobs=-1, verbose=0)(
+         delayed(process_group)('EWMA', name, group, group_columns, variable, date_column,
+                                alpha, sigma, eval_period, interval_width, contamination, random_state)
+         for name, group in groups)
+
+     # Combine results back
+     anomaly_key_channel_EWMA = (
+         pd.concat(results_EWMA)
+         .sort_values(by=group_columns + [date_column])
+     )
+     #print("anomaly_key_channel_EWMA data frame created")
+     #print(anomaly_key_channel_EWMA.head())
+     EWMA_cols = group_columns + [date_column] + ['alpha', 'sigma', 'EWMA_forecast',
+                                                  'STD', 'EWMA_high', 'EWMA_low', 'is_EWMA_anomaly']
+
+     anomaly_key_channel_EWMA_final = anomaly_key_channel_EWMA[EWMA_cols]
+
+
+     ## FB
+     results_fb = Parallel(n_jobs=-1, verbose=0)(
+         delayed(process_group)('FB', name, group, group_columns, variable, date_column,
+                                alpha, sigma, eval_period, interval_width, contamination, random_state)
+         for name, group in groups)
+
+     # Combine results back
+     anomaly_key_channel_fb = (
+         pd.concat(results_fb)
+         .sort_values(by=group_columns + [date_column])
+     )
+
+     #print("anomaly_key_channel_fb data frame created")
+     #print(anomaly_key_channel_fb.head())
+     FB_cols = group_columns + [date_column] + ["FB_forecast", "FB_low", "FB_high",
+                                                "FB_residual", "FB_anomaly", "is_FB_anomaly"]
+
+     anomaly_key_channel_fb_final = anomaly_key_channel_fb[FB_cols]
+
+
+     ## Isolation Forest timeseries
+     results_ISF_timeseries = Parallel(n_jobs=-1, verbose=0)(
+         delayed(process_group)('ISF_timeseries', name, group, group_columns, variable, date_column,
+                                alpha, sigma, eval_period, interval_width, contamination, random_state)
+         for name, group in groups)
+
+     # Combine results back
+     anomaly_key_channel_ISF_timeseries = (
+         pd.concat(results_ISF_timeseries)
+         .sort_values(by=group_columns + [date_column])
+     )
+     #print(anomaly_key_channel_ISF_timeseries.head())
+     ISF_timeseries_cols = group_columns + [date_column] + ["IsolationForest_score_timeseries", "is_IsolationForest_anomaly_timeseries"]
+     anomaly_key_channel_ISF_timeseries_final = anomaly_key_channel_ISF_timeseries[ISF_timeseries_cols]
+
+     #print("anomaly_key_channel_ISF_timeseries data frame created")
+     #print(anomaly_key_channel_ISF_timeseries.head())
+
+     ## DBSCAN
+     results_DB = Parallel(n_jobs=-1, verbose=0)(
+         delayed(process_group)('DBSCAN', name, group, group_columns, variable, date_column,
+                                alpha, sigma, eval_period, interval_width, contamination, random_state)
+         for name, group in groups)
+
+     # Combine results back
+     anomaly_key_channel_DB = (
+         pd.concat(results_DB)
+         .sort_values(by=group_columns + [date_column])
+     )
+
+     #print("anomaly_key_channel_DB data frame created")
+     #print(anomaly_key_channel_DB.head())
+
+     DB_cols = group_columns + [date_column] + ["dbscan_score", "is_DBSCAN_anomaly"]
+     anomaly_key_channel_DB_final = anomaly_key_channel_DB[DB_cols]
+
+     # Combine the ISF general and timeseries data frames
+     anomaly_key_channel_ISF = anomaly_key_channel_ISF_general.merge(anomaly_key_channel_ISF_timeseries_final,
+                                                                     on=group_columns + [date_column], how='inner')
+
+     # Score: TRAIN rows keep the general ISF score, TEST rows take the walk-forward (timeseries) score
+     anomaly_key_channel_ISF['IsolationForest_score'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                                 anomaly_key_channel_ISF['IsolationForest_score_general'],
+                                                                 anomaly_key_channel_ISF['IsolationForest_score_timeseries'])
+
+     # Flag: TRAIN rows keep the general ISF flag, TEST rows take the walk-forward (timeseries) flag
+     anomaly_key_channel_ISF['is_IsolationForest_anomaly'] = np.where(anomaly_key_channel_ISF['set'] == 'TRAIN',
+                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_general'],
+                                                                      anomaly_key_channel_ISF['is_IsolationForest_anomaly_timeseries'])
+
+     ISF_cols = group_columns + [date_column] + ['IsolationForest_score', 'is_IsolationForest_anomaly']
+     anomaly_key_channel_ISF_final = anomaly_key_channel_ISF[ISF_cols]
+
+     #print("anomaly_key_channel_ISF data frame created")
+     #print(anomaly_key_channel_ISF.head())
+
+     # Combine all the data frames
+     anomaly = anomaly_key_channel_percentile.merge(anomaly_key_channel_SD_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_MAD_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_IQR_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_EWMA_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_fb_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_ISF_final, on=group_columns + [date_column], how='inner')
+     anomaly = anomaly.merge(anomaly_key_channel_DB_final, on=group_columns + [date_column], how='inner')
+
+     # ---- Unified anomaly flag (majority voting) ----
+     anomaly_flags = [
+         'is_Percentile_anomaly',
+         'is_SD_anomaly', 'is_MAD_anomaly',
+         'is_IQR_anomaly',
+         'is_EWMA_anomaly', 'is_FB_anomaly', 'is_IsolationForest_anomaly', 'is_DBSCAN_anomaly']
+
+     anomaly['Anomaly_Votes'] = anomaly[anomaly_flags].sum(axis=1)
+     # Majority rule: anomaly if flagged by at least half (4 of 8) of the methods
+     anomaly['is_Anomaly'] = anomaly['Anomaly_Votes'] >= 4
+
+     # Add refresh_date as the first column
+     anomaly.insert(0, 'refresh_date', pd.to_datetime(date.today()))
+
+     print(anomaly.head())
+
+     print_anomaly_stats(anomaly, group_columns)
+
+     return anomaly
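The ensemble logic at the end of run_pipeline is a simple vote count over eight boolean flags; a self-contained sketch of just that step (toy values, not package output):

    import pandas as pd

    flags = pd.DataFrame({
        "is_Percentile_anomaly":      [True,  False],
        "is_SD_anomaly":              [True,  False],
        "is_MAD_anomaly":             [True,  True],
        "is_IQR_anomaly":             [False, False],
        "is_EWMA_anomaly":            [True,  False],
        "is_FB_anomaly":              [False, True],
        "is_IsolationForest_anomaly": [True,  False],
        "is_DBSCAN_anomaly":          [False, False],
    })
    flags["Anomaly_Votes"] = flags.sum(axis=1)
    flags["is_Anomaly"] = flags["Anomaly_Votes"] >= 4
    print(flags[["Anomaly_Votes", "is_Anomaly"]])  # row 0: 5 votes -> True; row 1: 2 votes -> False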
@@ -0,0 +1,15 @@
+ Metadata-Version: 2.4
+ Name: anomaly_pipeline
+ Version: 0.1.27
+ Requires-Dist: pandas
+ Requires-Dist: numpy<2
+ Requires-Dist: joblib
+ Requires-Dist: prophet
+ Requires-Dist: scikit-learn
+ Requires-Dist: google-cloud-bigquery
+ Requires-Dist: google-cloud-storage
+ Requires-Dist: statsmodels
+ Requires-Dist: plotly
+ Requires-Dist: pandas-gbq
+ Requires-Dist: gcsfs
+ Dynamic: requires-dist
@@ -0,0 +1,24 @@
+ anomaly_pipeline/__init__.py,sha256=ED-UPADjbdS8xjK41KmWVYcFIn6q_cN-SwBx-dRI-nM,77
+ anomaly_pipeline/main.py,sha256=khiatXxr01XYHB8SrIfyTnlaCu008MA6ORGiI_2Tjr4,2925
+ anomaly_pipeline/pipeline.py,sha256=3Lf9b0Vok-mqWDLhhZeN9emgx5i30stPrU8XOmKpmEw,11204
+ anomaly_pipeline/helpers/DB_scan.py,sha256=80PLlubpcwY6dOUx5rm569hvFlGNa1rtvjs74US9oIk,8134
+ anomaly_pipeline/helpers/IQR.py,sha256=VlYU6Yf-4KQmVroLvzwd220jn5BUNJEchsVE4_KxKm4,2824
+ anomaly_pipeline/helpers/MAD.py,sha256=XDG8r9o1JNi7YZ2NKwNzqmu_Oyz2OPP2rThCuw8WZhs,3377
+ anomaly_pipeline/helpers/Preprocessing.py,sha256=VsAohcAW1wTKDdNAF1xNF4j4I2gyZ8rOC1HjyK0NpGk,3933
+ anomaly_pipeline/helpers/STD.py,sha256=SZ1UaS_Aa5ay6qWNzKpBXpQIloUuPlliOrfr7yHba4k,2769
+ anomaly_pipeline/helpers/__init__.py,sha256=aDAAxiNAusL4rwcn9XbkUIApp3i02UXolB_CWvbbY_0,32
+ anomaly_pipeline/helpers/baseline.py,sha256=h9t_LWcAw17P9qmoRQMceukGzOOr-gFLuHfVbipQB7M,3824
+ anomaly_pipeline/helpers/cluster_functions.py,sha256=Nhk2YdKVynrKywEILg_5B2xD4zrCZ_ICWw3oOdTDHuA,13040
+ anomaly_pipeline/helpers/evaluation_info.py,sha256=SXa1LkznNQXTOcFCbryRmRJMSNC_Fa2CU-HhFnyTIKY,6219
+ anomaly_pipeline/helpers/evaluation_plots.py,sha256=xfyVlE7B4E376EL4AF8A4T5kUfqzPShGOSy548psT6M,21230
+ anomaly_pipeline/helpers/ewma.py,sha256=YprdcvR17EQ4X9pJo5OusaD3jNaaoHvQLHRHHt25CGk,3562
+ anomaly_pipeline/helpers/fb_prophet.py,sha256=-ivBIgMBPT4DG-hbGXPMB1-aiEBfLw2LQvy6eXKzELQ,3182
+ anomaly_pipeline/helpers/help_info.py,sha256=QuRd206KQ8etRnlODH9Ek_zmXUvHSBwVQtukqf0iKSc,37012
+ anomaly_pipeline/helpers/iso_forest_general.py,sha256=nonZl2wcLyHe0E50mqQUw_IB3tuMochmZKQNd0xMFVk,2350
+ anomaly_pipeline/helpers/iso_forest_timeseries.py,sha256=SWf6g0mwLohIRdMvGfMCAcfWi5FPPokiV7dM8Un5qpE,5900
+ anomaly_pipeline/helpers/percentile.py,sha256=eLk0PgY7m7z7VKTLfXg8ykKii0ciAJvlGOYXpv84mOE,2523
+ anomaly_pipeline-0.1.27.dist-info/METADATA,sha256=YIIJMpsDchA8L2Jp0T4wBXpxwcL5r-UiJ35gLP6BRCs,371
+ anomaly_pipeline-0.1.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ anomaly_pipeline-0.1.27.dist-info/entry_points.txt,sha256=c7aMFN_VdyQk_gKp9S2-bz4AF3eBActUectAElnEdMo,92
+ anomaly_pipeline-0.1.27.dist-info/top_level.txt,sha256=3QhrLt05iNbxIQhnAA0vmIkRQje4Hc_STGY_Tukx3Vg,17
+ anomaly_pipeline-0.1.27.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1,2 @@
+ [console_scripts]
+ run_anomaly_pipeline = anomaly_pipeline.main:timeseries_anomaly_detection
@@ -0,0 +1 @@
+ anomaly_pipeline