anomaly-pipeline 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.cluster import DBSCAN
+ from sklearn.neighbors import NearestNeighbors
+ from sklearn.ensemble import IsolationForest
+ from statsmodels.tsa.stattools import acf
+
+
+ def get_dynamic_lags(series: pd.Series) -> list:
+
+     n = len(series)
+
+     # Determine the maximum lag: at most 50% of the series length, hard-capped at 60
+     nlags = min(int(n * 0.5), 60)
+
+     if nlags < 5:
+         return [1, 2, 3]
+
+     # Calculate the ACF and its confidence intervals, then keep the 10 most significant lags
+     autocorrelations, confint = acf(series.dropna(), nlags=nlags, alpha=0.25, fft=True)
+     autocorr_values = autocorrelations[1:]
+     conf_limit = confint[1:, 1] - autocorr_values
+     is_significant = np.abs(autocorr_values) > conf_limit
+     significant_autocorr = autocorr_values[is_significant]
+     significant_lags_indices = np.where(is_significant)[0] + 1
+     ranked_indices = np.argsort(np.abs(significant_autocorr))[::-1]
+     top_lags_indices = ranked_indices[:10]
+     top_lags = significant_lags_indices[top_lags_indices].tolist()
+     base_lags = [1, 2, 3]
+     dynamic_lags = sorted(set(base_lags + top_lags))[:10]
+
+     return dynamic_lags
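+
+ # Illustrative usage (editor's sketch, not part of the package): for a series
+ # with a strong weekly cycle, the significant ACF lags should surface next to
+ # the base lags. Assuming a hypothetical synthetic series of period 7:
+ #
+ #   rng = np.random.default_rng(0)
+ #   s = pd.Series(np.sin(np.arange(200) * 2 * np.pi / 7) + rng.normal(0, 0.1, 200))
+ #   get_dynamic_lags(s)  # expected to include 1, 2, 3 plus multiples of 7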
+
+
+ def detect_time_series_anomalies_isoforest(
+     group,
+     variable,
+     date_column,
+     eval_period,
+ ):
+
+     group[date_column] = pd.to_datetime(group[date_column])
+     group = group.copy().sort_values(date_column).reset_index(drop=True)
+
+     # Iterate over each evaluation period, fitting the model to all the data
+     # before the evaluation period and then getting the predicted anomaly score
+     # for that period
+     try:
+         test_anom = []
+
+         for t in range(eval_period - 1, -1, -1):
+
+             try:
+                 # Boundary between rolling train and rolling forecast region
+                 cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)
+
+                 # Get train set to determine lags
+                 model_group = group[[date_column, variable]].copy()
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 lags = get_dynamic_lags(train[variable])
+
+                 # Create lag features on the entire model_group DF
+                 for lag in lags:
+                     model_group[f'lag{lag}'] = model_group[variable].shift(lag)
+
+                 # Get rolling stats features for the entire model_group DF
+                 rolling_stats_features = []
+                 for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
+                     if w >= 3:
+                         rolling_stats_features.append('roll_mean' + str(w))
+                         rolling_stats_features.append('roll_std' + str(w))
+                         model_group['roll_mean' + str(w)] = model_group[variable].shift(1).rolling(w).mean()
+                         model_group['roll_std' + str(w)] = model_group[variable].shift(1).rolling(w).std()
+
+                 # Get trend feature
+                 model_group['trend'] = group.index
+
+                 # Drop records with NAs
+                 model_group = model_group.copy().dropna()
+
+                 # Split into train and test (both now carry all the features)
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 test = model_group[model_group[date_column] == cutoff_date].copy()
+
+                 # Identify all model features (lags, rolling stats, trend, and the variable itself)
+                 features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+                 # Create and fit the model
+                 iso_forest_model = IsolationForest(
+                     n_estimators=200,
+                     contamination=0.01,
+                     random_state=42
+                 )
+                 iso_forest_model.fit(train[features])
+
+                 # Threshold: three standard deviations below the mean of the
+                 # positive (normal-looking) training scores, capped at 0
+                 train['isolation_forest_score'] = iso_forest_model.decision_function(train[features])
+                 positive_scores = train.loc[train['isolation_forest_score'] > 0, 'isolation_forest_score']
+                 anomaly_threshold = min(0, positive_scores.mean() - 3 * positive_scores.std())
+
+                 test['isolation_forest_score'] = iso_forest_model.decision_function(test[features])
+                 test['contamination_anomaly'] = iso_forest_model.predict(test[features])  # -1 = anomaly, 1 = normal
+                 test['isolation_forest_anomaly_threshold'] = anomaly_threshold
+                 test['threshold_anomaly'] = np.where(test['isolation_forest_score'] < anomaly_threshold, -1, 1)
+
+                 # Flag only points that both the contamination rule and the threshold rule call anomalous
+                 test['is_IsolationForest_anomaly'] = (test['contamination_anomaly'] == -1) & (test['threshold_anomaly'] == -1)
+                 test = test[[variable, date_column, 'isolation_forest_anomaly_threshold', 'isolation_forest_score', 'is_IsolationForest_anomaly']]
+                 test_anom.append(test)
+             except Exception:
+                 # Skip evaluation periods that fail (e.g., not enough history to build features)
+                 pass
+
+         try:
+             test_anom = pd.concat(test_anom)
+             group = group.merge(test_anom[[variable, date_column, 'isolation_forest_anomaly_threshold', 'isolation_forest_score', 'is_IsolationForest_anomaly']],
+                                 on=[variable, date_column], how='left')
+         except Exception:
+             print("Error in Isolation Forest process")
+             group['isolation_forest_anomaly_threshold'] = np.nan
+             group['isolation_forest_score'] = np.nan
+             group['is_IsolationForest_anomaly'] = np.nan
+
+     except Exception:
+         group['isolation_forest_anomaly_threshold'] = np.nan
+         group['isolation_forest_score'] = np.nan
+         group['is_IsolationForest_anomaly'] = np.nan
+         # Identify the group from its string/object dtype columns for the error message
+         group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+         group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+         print(f'Isolation Forest Anomaly Detection failed for {group_id}')
+
+     return group
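+
+ # Illustrative usage (editor's sketch, not part of the package): the detector
+ # is written to run once per group, e.g. via groupby-apply. The frame and
+ # column names below are hypothetical:
+ #
+ #   result = (
+ #       df.groupby('taxonomy', group_keys=False)
+ #         .apply(detect_time_series_anomalies_isoforest,
+ #                variable='sales', date_column='date', eval_period=12)
+ #   )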
+
+
+ def find_optimal_epsilon(X_scaled: np.ndarray, k: int) -> float:
+     """
+     Estimate a DBSCAN epsilon by computing the distance from every point to its
+     k-th nearest neighbor and taking a high percentile (the 95th) of those
+     distances as the cutoff. This serves as a programmatic proxy for the
+     'elbow' method in a rolling window.
+     """
+     if len(X_scaled) <= k:
+         return 1.0  # Fallback: too few points to compute k-distances
+
+     # Find the distance to the k-th (min_samples) neighbor for every point.
+     # n_neighbors is k+1 because the first returned distance is 0 (to itself).
+     neigh = NearestNeighbors(n_neighbors=k + 1)
+     neigh.fit(X_scaled)
+
+     # distances has shape [n_samples, k+1]; the neighbor indices are unused
+     distances, _ = neigh.kneighbors(X_scaled)
+
+     # The distance to the k-th neighbor (column index k) is the radius a point
+     # needs to collect k neighbors, i.e. the radius required around a core point.
+     k_distances = distances[:, k]
+
+     # The elbow is hard to find programmatically. A robust proxy for the density
+     # threshold is a high percentile (here the 95th) of the k-distances: epsilon
+     # is set so that 95% of the *training* points fall inside some cluster's
+     # neighborhood.
+     optimal_eps = np.percentile(k_distances, 95)
+
+     # Ensure a minimum value if the data is extremely sparse
+     return max(optimal_eps, 0.1)
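+
+ # Illustrative check (editor's sketch, not part of the package): on scaled
+ # Gaussian noise, the returned epsilon is the radius inside which ~95% of the
+ # training points already find their k nearest neighbors:
+ #
+ #   rng = np.random.default_rng(0)
+ #   X = StandardScaler().fit_transform(rng.normal(size=(200, 2)))
+ #   eps = find_optimal_epsilon(X, k=5)  # ~95% of points have >= 5 neighbors within eps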
+
+
+ def detect_time_series_anomalies_dbscan(
+     group,
+     variable,
+     date_column,
+     eval_period,
+ ):
+
+     group[date_column] = pd.to_datetime(group[date_column])
+     group = group.copy().sort_values(date_column).reset_index(drop=True)
+
+     # Iterate over each evaluation period, fitting DBSCAN to all the data up to
+     # the cutoff and then scoring the cutoff observation against that fit
+     try:
+         test_anom = []
+
+         for t in range(eval_period - 1, -1, -1):
+
+             try:
+                 # Boundary between rolling train and rolling forecast region
+                 cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)
+
+                 # Get train set to determine lags
+                 model_group = group[[date_column, variable]].copy()
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 lags = get_dynamic_lags(train[variable])
+
+                 # Create lag features and rolling stats for the entire DF
+                 rolling_stats_features = []
+                 for lag in lags:
+                     model_group[f'lag{lag}'] = model_group[variable].shift(lag)
+
+                 for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
+                     if w >= 3:
+                         rolling_stats_features.extend([f'roll_mean_{w}', f'roll_std_{w}'])
+                         model_group[f'roll_mean_{w}'] = model_group[variable].shift(1).rolling(w).mean()
+                         model_group[f'roll_std_{w}'] = model_group[variable].shift(1).rolling(w).std()
+
+                 model_group['trend'] = group.index
+                 model_group = model_group.copy().dropna()
+
+                 # Split into train and test
+                 train = model_group[model_group[date_column] <= cutoff_date].copy()
+                 test = model_group[model_group[date_column] == cutoff_date].copy()
+
+                 # Identify all model features (lags, rolling stats, trend, and the variable itself)
+                 features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+                 # Fit the scaler ONLY on the training data to avoid data leakage,
+                 # then transform both train and test sets
+                 scaler = StandardScaler()
+                 scaler.fit(train[features])
+                 train_scaled = scaler.transform(train[features])
+                 test_scaled = scaler.transform(test[features])
+
+                 # Common heuristic: min_samples of roughly twice the feature space dimension
+                 min_samples = max(2 * len(features), 3)
+
+                 # Find a data-driven epsilon from the training k-distances
+                 calculated_eps = find_optimal_epsilon(train_scaled, k=min_samples)
+
+                 # --- DBSCAN MODEL ---
+                 dbscan_model = DBSCAN(
+                     eps=calculated_eps,
+                     min_samples=min_samples,
+                     n_jobs=-1
+                 )
+
+                 # Fit DBSCAN on the scaled training data
+                 dbscan_model.fit(train_scaled)
+
+                 # DBSCAN has no predict() for unseen points, so score the test
+                 # point with a k-NN proxy: if its distance to the min_samples-th
+                 # nearest training point exceeds eps, DBSCAN would have labeled it noise
+                 neigh = NearestNeighbors(n_neighbors=min_samples)
+                 neigh.fit(train_scaled)
+
+                 # Distances from the test point to its nearest neighbors in the train set
+                 distances, _ = neigh.kneighbors(test_scaled)
+
+                 # Distance to the min_samples-th neighbor (index min_samples - 1)
+                 k_distance = distances[:, min_samples - 1]
+
+                 # The score is centered so that 0 is the anomaly threshold:
+                 # flag as anomaly when the k-distance exceeds the trained eps
+                 test['dbscan_anomaly_threshold'] = 0
+                 test['DBSCAN_score'] = k_distance - calculated_eps
+                 test['is_DBSCAN_anomaly'] = k_distance > calculated_eps
+
+                 test = test[[variable, date_column, 'dbscan_anomaly_threshold', 'DBSCAN_score', 'is_DBSCAN_anomaly']]
+                 test_anom.append(test)
+
+             except Exception as e:
+                 print(f"Error in iteration {t}: {e}")
+
+         try:
+             test_anom = pd.concat(test_anom)
+             group = group.merge(test_anom[[variable, date_column, 'dbscan_anomaly_threshold', 'DBSCAN_score', 'is_DBSCAN_anomaly']],
+                                 on=[variable, date_column], how='left')
+         except Exception:
+             print("Error in DBSCAN process")
+             group['dbscan_anomaly_threshold'] = np.nan
+             group['DBSCAN_score'] = np.nan
+             group['is_DBSCAN_anomaly'] = np.nan
+
+     except Exception as e:
+         # Identify the group from its string/object dtype columns for the error message
+         try:
+             group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+             group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+         except Exception:
+             group_id = "Unknown Group ID"
+         print(f'DBSCAN Anomaly Detection failed for {group_id}. Error: {e}')
+         group['dbscan_anomaly_threshold'] = np.nan
+         group['DBSCAN_score'] = np.nan
+         group['is_DBSCAN_anomaly'] = np.nan
+
+     return group
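+
+ # Illustrative usage (editor's sketch, not part of the package): invoked the
+ # same way as the Isolation Forest detector, with hypothetical column names:
+ #
+ #   result = (
+ #       df.groupby('taxonomy', group_keys=False)
+ #         .apply(detect_time_series_anomalies_dbscan,
+ #                variable='sales', date_column='date', eval_period=12)
+ #   )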
@@ -0,0 +1,121 @@
+ import pandas as pd
+ import numpy as np
+ from IPython.display import display, Markdown
+ from anomaly_pipeline.helpers.evaluation_plots import (
+     anomaly_eval_plot, anomaly_percentile_plot, anomaly_sd_plot, anomaly_mad_plot,
+     anomaly_iqr_plot, anomaly_ewma_plot, anomaly_fb_plot, anomaly_dbscan_plot,
+     anomaly_isolation_forest_timeseries_plot
+ )
+
+
+ def evaluation_info(
+     eval_df,
+     group_columns,
+     variable,
+     date_column,
+     eval_period=12,
+     models_to_plot=None
+ ):
+
+     models_to_plot = models_to_plot or []
+
+     group_ids = eval_df[group_columns].drop_duplicates().reset_index(drop=True)
+     group_cnt = len(group_ids)
+
+     if group_cnt == 1 and len(models_to_plot) == 0:
+         models_to_plot = ['overall', 'percentile', 'iqr', 'mad', 'std', 'ewma', 'prophet', 'dbscan', 'isolation_forest']
+     elif group_cnt >= 2 and len(models_to_plot) == 0:
+         models_to_plot = ['overall']
+
+     record_cnt = len(eval_df)
+     date_cnt = len(eval_df[date_column].drop_duplicates())
+     anomaly_cnt = len(eval_df[eval_df['is_Anomaly'] == True])
+     interpolated_cnt = len(eval_df[eval_df['is_missing_record'] == True])
+     interpolation_method = 'linear'
+     if interpolated_cnt >= 6:
+         interpolated_records_msg = " Here is a view of 5 of the interpolated records:"
+     elif 2 <= interpolated_cnt <= 5:
+         interpolated_records_msg = f" Here is a view of the {interpolated_cnt} interpolated records:"
+     elif interpolated_cnt == 1:
+         interpolated_records_msg = " Here is a view of the 1 interpolated record:"
+     else:
+         interpolated_records_msg = ""
+
+     if interpolated_cnt >= 1:
+         interpolation_msg = f", and values were interpolated using the {interpolation_method} method, adding {interpolated_cnt} record{'s' if interpolated_cnt >= 2 else ''} to the data.{interpolated_records_msg}"
+     else:
+         interpolation_msg = "."
+
+     no_eval_groups = (
+         eval_df.groupby(group_columns)['is_Anomaly']
+         .agg(is_all_na=lambda x: x.isna().all(), historical_data_points='size')
+         .reset_index()
+     )
+     no_eval_groups = no_eval_groups[no_eval_groups['is_all_na'] == True].drop(columns='is_all_na').reset_index(drop=True)
+
+     if len(no_eval_groups) >= 6:
+         no_evals_sub_msg = f"Here are 5 of the {len(no_eval_groups)} groups that do not have enough historical data points:"
+     elif 2 <= len(no_eval_groups) <= 5:
+         no_evals_sub_msg = f"Here are the {len(no_eval_groups)} groups that do not have enough historical data points:"
+     elif len(no_eval_groups) == 1:
+         no_evals_sub_msg = "Here is the 1 group that does not have enough historical data points:"
+     else:
+         no_evals_sub_msg = ""
+
+     no_evals_msg = f"""{len(no_eval_groups)} distinct group_columns values did not have the minimum number of historical data points to cover the evaluation period you specified.
+
+ To increase the chance of evaluating these records, lower the `eval_period` parameter, which controls the number of periods to evaluate.
+
+ {no_evals_sub_msg}
+ """
+
+     eval_msg1 = f"""## Anomaly Detection successfully ran on the `{variable}` column.
+
+ {group_cnt} distinct group_columns ID{'s were' if group_cnt >= 2 else ' was'} evaluated on {'their' if group_cnt >= 2 else 'its'} last {eval_period} dates, which is {(eval_period / date_cnt):.0%} of the dates in the data."""
+
+     eval_msg2 = f"""
+ {interpolated_cnt} record{'s were' if interpolated_cnt != 1 else ' was'} missing{interpolation_msg}"""
+
+     eval_msg3 = f"""
+ {anomaly_cnt} records were identified as anomalous. This is {(anomaly_cnt / record_cnt):.0%} of the data.
+
+ ### Preview of final table:"""
+
+     plot_msg = """---
+ ## Evaluation Plots"""
+
+     display(Markdown(eval_msg1))
+     display(Markdown(eval_msg2))
+
+     # Show the sample of interpolated records right after the message that announces it
+     if interpolated_cnt >= 1:
+         display(eval_df[eval_df['is_missing_record'] == True].sample(min(interpolated_cnt, 5)))
+
+     display(Markdown(eval_msg3))
+
+     display(eval_df.head(10))
+
+     if len(no_eval_groups) >= 1:
+         display(Markdown(no_evals_msg))
+         display(no_eval_groups.head(5))
+
+     display(Markdown(plot_msg))
+
+     # Map model names to their plot helpers, then plot all specified models for every group
+     plot_funcs = {
+         'percentile': anomaly_percentile_plot,
+         'iqr': anomaly_iqr_plot,
+         'mad': anomaly_mad_plot,
+         'std': anomaly_sd_plot,
+         'ewma': anomaly_ewma_plot,
+         'prophet': anomaly_fb_plot,
+         'dbscan': anomaly_dbscan_plot,
+         'isolation_forest': anomaly_isolation_forest_timeseries_plot,
+     }
+
+     for row in group_ids.itertuples(index=False):
+         group_df = pd.DataFrame([row._asdict()])[group_columns]
+         group_df = eval_df.copy().merge(group_df, on=group_columns, how='inner')
+
+         if len(group_df) > eval_period:
+
+             for model in models_to_plot:
+                 if model == 'overall':
+                     anomaly_eval_plot(group_df, group_columns, variable, date_column, eval_period=eval_period, show_anomaly_scores_on_main_plot=False)
+                 elif model in plot_funcs:
+                     plot_funcs[model](group=group_df, group_columns=group_columns, variable=variable, date_column=date_column, final_anomalies=False, eval_period=eval_period)
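+
+ # Illustrative usage (editor's sketch, not part of the package): summarize a
+ # finished evaluation frame and render the default plots. The frame must carry
+ # the pipeline's 'is_Anomaly' and 'is_missing_record' columns; names below are
+ # hypothetical:
+ #
+ #   evaluation_info(
+ #       eval_df=result,
+ #       group_columns=['taxonomy', 'channel'],
+ #       variable='sales',
+ #       date_column='date',
+ #       eval_period=12,
+ #   )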