anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anomaly_pipeline/__init__.py +73 -1
- anomaly_pipeline/helpers/DB_scan.py +144 -10
- anomaly_pipeline/helpers/MAD.py +45 -0
- anomaly_pipeline/helpers/Preprocessing.py +274 -73
- anomaly_pipeline/helpers/STD.py +64 -0
- anomaly_pipeline/helpers/__init__.py +13 -1
- anomaly_pipeline/helpers/evaluation_info.py +25 -17
- anomaly_pipeline/helpers/evaluation_plots.py +636 -30
- anomaly_pipeline/helpers/ewma.py +105 -7
- anomaly_pipeline/helpers/fb_prophet.py +150 -2
- anomaly_pipeline/helpers/{help_info.py → help_anomaly.py} +194 -89
- anomaly_pipeline/helpers/iso_forest_general.py +5 -3
- anomaly_pipeline/helpers/iso_forest_timeseries.py +195 -23
- anomaly_pipeline/helpers/percentile.py +46 -3
- anomaly_pipeline/main.py +158 -39
- anomaly_pipeline/pipeline.py +106 -34
- anomaly_pipeline-0.1.61.dist-info/METADATA +275 -0
- anomaly_pipeline-0.1.61.dist-info/RECORD +24 -0
- anomaly_pipeline-0.1.27.dist-info/METADATA +0 -15
- anomaly_pipeline-0.1.27.dist-info/RECORD +0 -24
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/WHEEL +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/entry_points.txt +0 -0
- {anomaly_pipeline-0.1.27.dist-info → anomaly_pipeline-0.1.61.dist-info}/top_level.txt +0 -0
anomaly_pipeline/helpers/ewma.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
3
|
import statistics
|
|
4
|
+
from .Preprocessing import classify
|
|
4
5
|
|
|
5
6
|
# # EWMA functions
|
|
6
7
|
|
|
7
|
-
def ewma_forecast(train, alpha):
|
|
8
|
-
|
|
8
|
+
"""def ewma_forecast(train, alpha):
|
|
9
|
+
Return last EWMA forecast value based on training data.
|
|
9
10
|
ewma = train.ewm(alpha=alpha, adjust=False).mean()
|
|
10
11
|
return ewma.iloc[-1]
|
|
11
|
-
|
|
12
|
+
"""
|
|
12
13
|
|
|
13
14
|
"""
|
|
14
15
|
def ew_std(series, alpha):
|
|
@@ -42,13 +43,13 @@ def ew_std(series, alpha):
|
|
|
42
43
|
# Std = sqrt(var)
|
|
43
44
|
return np.sqrt(ewma_var.iloc[-1]) """
|
|
44
45
|
|
|
45
|
-
|
|
46
|
+
"""
|
|
46
47
|
def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_column, alpha, sigma, eval_period):
|
|
47
48
|
|
|
48
|
-
|
|
49
|
+
|
|
49
50
|
Rolling (expanding window) EWMA anomaly detection for a SINGLE GROUP ONLY.
|
|
50
51
|
Expects `group` to already be filtered to one group.
|
|
51
|
-
|
|
52
|
+
|
|
52
53
|
|
|
53
54
|
group = group.sort_values(date_column).reset_index(drop=True)
|
|
54
55
|
n = len(group)
|
|
@@ -115,5 +116,102 @@ def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_colum
|
|
|
115
116
|
final_output = pd.concat(results, ignore_index=True)
|
|
116
117
|
# Type Safety Check: Ensure the date column is always datetime before returning
|
|
117
118
|
final_output[date_column] = pd.to_datetime(final_output[date_column])
|
|
118
|
-
return final_output
|
|
119
|
+
return final_output"""
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_column, alpha, sigma, eval_period):
    """
    One-step-ahead EWMA anomaly detection for a SINGLE GROUP ONLY.
    Expects `group` to already be filtered to one group.

    # 📉 EWMA Rolling Anomaly Detection
    ---

    The `ewma_with_anomalies_rolling_group` function compares every observation with an
    **Exponentially Weighted Moving Average (EWMA)** forecast built only from earlier observations,
    and flags values that fall outside a `sigma`-wide envelope around that forecast.

    ## 📋 Functional Overview
    The whole series is processed in one vectorized pass. For each row the forecast is the EWMA of all
    *previous* rows and the spread is the (unweighted) expanding sample standard deviation of all
    *previous* rows, so each forecast uses history only.

    ## 🧠 Core Logic Components

    ### 1. Forecast
    * **Weighting Mechanism:** `alpha` (between 0 and 1) controls the decay of information. A **higher alpha** makes the forecast react faster to recent changes.
    * **Calculation:** Uses the recursive form (`adjust=False`), shifted by one row:
        $$EWMA_t = \\alpha \\cdot Y_t + (1 - \\alpha) \\cdot EWMA_{t-1}$$

    ### 2. Thresholds
    * **Upper Limit:** `Forecast + (Sigma * Standard Deviation)`
    * **Lower Limit:** `max(Forecast - (Sigma * Standard Deviation), 0)`
    * **Warm-up rows:** The first forecast and the first two standard deviations do not exist, so they are
      back-filled from the first available value. The very first row is always labelled `'none'` because its
      forecast is not a real prediction. NOTE: the back-filled standard deviation of the second row is
      computed from the first two observations, i.e. it includes that row's own value.

    ### 3. Classification
    * Each row is labelled by the project's `classify(value, low, high)` helper; the label `'none'` means "not an anomaly".

    ## 📤 Key Output Columns
    The function returns a new DataFrame (sorted by `date_column`, index reset) containing the group columns,
    `date_column`, `variable` (as float) and:
    * **`alpha`** / **`sigma`**: The parameters used, repeated on every row.
    * **`EWMA_forecast`**: The predicted value for that timestamp.
    * **`STD`**: The standard deviation used to calculate the threshold.
    * **`EWMA_high` / `EWMA_low`**: The dynamic boundaries (the "envelope"), computed for every row.
    * **`EWMA_anomaly`**: The label returned by `classify`.
    * **`is_EWMA_anomaly`**: A boolean flag, `True` when `EWMA_anomaly` is not `'none'`.
    * **`EWMA_residual`**: Actual value minus `EWMA_forecast`.
    * **`set`**: `"TEST"` for the last `eval_period` rows, `"TRAIN"` otherwise. If `eval_period` is not
      positive or exceeds the number of rows, every row stays `"TRAIN"`.

    ## 💡 Usage Context
    EWMA is suited to metrics that exhibit **level shifts**: because it weights recent data more heavily
    than a simple moving average, it adapts faster to a new "normal".

    ---
    ### ⚙️ Parameter Tuning
    * **`alpha`**: Adjust this to control how quickly the model "forgets" old data (Typical range: `0.1 - 0.3`).
    * **`sigma`**: Adjust this to control sensitivity. A **lower sigma** results in more anomalies, while a **higher sigma** (e.g., `3.0`) only flags extreme outliers.
    """

    # 1. Prepare Data
    group = group.sort_values(date_column).reset_index(drop=True)
    vals = group[variable].astype(float)

    # Accept a single column name or any iterable of names (list, tuple, Index).
    if isinstance(group_columns, str):
        key_columns = [group_columns]
    else:
        key_columns = list(group_columns)

    # 2. Calculate Statistics (Vectorized)
    # shift(1) ensures each row only sees history (no data leakage).
    ewma_forecast = vals.ewm(alpha=alpha, adjust=False).mean().shift(1)
    std_expanding = vals.expanding().std().shift(1)

    # 3. Construct Output DataFrame
    results = group[key_columns + [date_column]].copy()
    results[variable] = vals
    results["alpha"] = alpha
    results["sigma"] = sigma

    # 4. Handle nulls in the warm-up rows by back-filling from the first real value.
    results["EWMA_forecast"] = ewma_forecast.bfill()
    # fillna(0) covers series too short to ever produce a standard deviation.
    results["STD"] = std_expanding.bfill().fillna(0)

    # 5. Define Bounds (now that nulls are handled)
    band = sigma * results["STD"]
    results["EWMA_high"] = results["EWMA_forecast"] + band
    results["EWMA_low"] = (results["EWMA_forecast"] - band).clip(lower=0)

    # 6. Classify every row against its envelope.
    # A plain comprehension (rather than DataFrame.apply) also behaves for an empty frame,
    # where apply(axis=1) would hand back a DataFrame instead of a Series.
    results["EWMA_anomaly"] = [
        classify(value, low, high)
        for value, low, high in zip(results[variable], results["EWMA_low"], results["EWMA_high"])
    ]

    # The first row's forecast was back-filled, so it is not a real statistical forecast.
    # Guarded so that an empty group does not get a spurious row appended by .loc enlargement.
    if not results.empty:
        results.loc[0, "EWMA_anomaly"] = 'none'

    # 7. Final Flags and Labels
    results["is_EWMA_anomaly"] = results["EWMA_anomaly"] != 'none'
    results["EWMA_residual"] = vals - results["EWMA_forecast"]

    results["set"] = "TRAIN"
    if eval_period > 0 and len(results) >= eval_period:
        results.iloc[-eval_period:, results.columns.get_loc("set")] = "TEST"

    # Type safety: the date column is always datetime on return.
    results[date_column] = pd.to_datetime(results[date_column])

    return results
|
|
@@ -5,6 +5,7 @@ from prophet import Prophet
|
|
|
5
5
|
import warnings
|
|
6
6
|
import os
|
|
7
7
|
import sys
|
|
8
|
+
from .Preprocessing import classify
|
|
8
9
|
from contextlib import contextmanager
|
|
9
10
|
|
|
10
11
|
warnings.filterwarnings("ignore")
|
|
@@ -21,13 +22,15 @@ def suppress_stdout_stderr():
|
|
|
21
22
|
finally:
|
|
22
23
|
sys.stdout, sys.stderr = old_stdout, old_stderr
|
|
23
24
|
|
|
25
|
+
"""
|
|
24
26
|
def detect_time_series_anomalies_fb_walkforward(
|
|
25
27
|
group,
|
|
26
28
|
variable,
|
|
27
29
|
date_column,
|
|
28
30
|
eval_period,
|
|
29
|
-
|
|
31
|
+
prophet_CI
|
|
30
32
|
):
|
|
33
|
+
|
|
31
34
|
# 1. Silence the cmdstanpy logger completely
|
|
32
35
|
logger = logging.getLogger('cmdstanpy')
|
|
33
36
|
logger.addHandler(logging.NullHandler())
|
|
@@ -54,7 +57,8 @@ def detect_time_series_anomalies_fb_walkforward(
|
|
|
54
57
|
weekly_seasonality=True,
|
|
55
58
|
yearly_seasonality=True,
|
|
56
59
|
daily_seasonality=False,
|
|
57
|
-
interval_width=
|
|
60
|
+
interval_width=prophet_CI,
|
|
61
|
+
# prophet_CI=prophet_CI
|
|
58
62
|
)
|
|
59
63
|
|
|
60
64
|
# --- WRAP THE FIT IN THE MUTER ---
|
|
@@ -91,4 +95,148 @@ def detect_time_series_anomalies_fb_walkforward(
|
|
|
91
95
|
group.loc[train_mask, "FB_residual"] = np.nan
|
|
92
96
|
group.loc[train_mask, "is_FB_anomaly"] = np.nan
|
|
93
97
|
|
|
98
|
+
return group
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
def detect_time_series_anomalies_fb_walkforward(
    group,
    variable,
    date_column,
    eval_period,
    prophet_CI
):
    """
    # 🚀 Facebook Prophet Walk-Forward Model
    ---

    The `detect_time_series_anomalies_fb_walkforward` function performs **iterative anomaly detection** with the
    Prophet library. It fits Prophet once on the training history to obtain in-sample bounds, then runs a
    **walk-forward validation** over the evaluation window, forecasting one data point at a time and expanding
    the training set as it progresses.

    ## 📋 Functional Overview
    A **cutoff date** is computed as the latest date in `group` minus `eval_period` **weeks**. Rows on or before
    the cutoff form the training set; rows after it form the evaluation set.

    ## 🧠 Core Logic Stages

    ### 1. Data Preparation and Cutoff
    * **Standardization:** The input is copied, `date_column` is converted to **datetime objects**, and the rows are sorted by date. The caller's DataFrame is not modified and the original index labels are kept.
    * **Partitioning:** The dataset is split into an **Initial Training Set** (dates `<=` cutoff) and an **Evaluation Set** (dates `>` cutoff).

    ### 2. Initial Fit (Training Rows)
    * A Prophet model is fitted on the training set and used to predict those same dates, giving in-sample `FB_forecast`, `FB_low` and `FB_high` for the TRAIN rows.
    * If this fit fails, a message is printed and the TRAIN rows keep `NaN` forecasts and bounds.

    ### 3. Walk-Forward Loop (Sequential Testing)
    * **Model Fitting:** For every point in the evaluation set, a new **Prophet model** is initialized with weekly and yearly seasonality enabled (daily disabled) and `interval_width=prophet_CI`.
    * **One-Step Forecast:** The model generates a prediction (`yhat`) and an uncertainty interval (`yhat_lower`, `yhat_upper`) for that **single point**.
    * **Dynamic Training Expansion:** After each prediction, the observed row is appended to the training data before the next point is forecast.
    * **Error Handling:** If the Prophet fit or prediction fails, a message is printed and the forecast, lower and upper bound are all set to the **last observed training value** (or `NaN` when there is no training data yet).

    ### 4. Anomaly Classification
    * **Uncertainty Bounds:** Every row (TRAIN and TEST) is labelled by the project's `classify(value, low, high)` helper using `FB_low` / `FB_high`. The lower bound is clipped at 0.
    * **Residual Calculation:** **FB_residual** is Actual - Forecast.

    ## 📤 Key Output Columns
    The function appends the following columns to the returned DataFrame:
    * **`FB_forecast`**: The point estimate predicted by Prophet for that date.
    * **`FB_low` / `FB_high`**: The boundaries based on the specified uncertainty interval.
    * **`FB_residual`**: The difference between the actual observed metric and the forecast.
    * **`FB_anomaly`**: The label returned by `classify`; `'none'` means "not an anomaly".
    * **`is_FB_anomaly`**: A boolean flag, `True` when `FB_anomaly` is not `'none'`.
    * **`set`**: `"TRAIN"` for rows on or before the cutoff, `"TEST"` for rows after it.

    ## 💡 Usage Context
    This approach targets metrics with **seasonality and trend**. Note that one Prophet model is fitted per
    evaluation point, so run time grows with the length of the evaluation window.

    ---
    ### 📊 Evaluation Strategy
    TEST-row results are "out-of-sample": the model that scored a point was fitted without it. TRAIN-row
    results come from the in-sample initial fit. Filter on the `set` column to report only out-of-sample anomalies.

    """

    # 1. Silence the cmdstanpy logger.
    # Attach the NullHandler only once; this function is called per group and would
    # otherwise accumulate one extra handler on every call.
    logger = logging.getLogger('cmdstanpy')
    if not any(isinstance(handler, logging.NullHandler) for handler in logger.handlers):
        logger.addHandler(logging.NullHandler())
    logger.propagate = False
    logger.setLevel(logging.CRITICAL)

    # Parse dates BEFORE sorting so string dates are ordered chronologically, not lexically.
    group = group.copy()
    group[date_column] = pd.to_datetime(group[date_column])
    group = group.sort_values(date_column)

    # Calculate cutoff for the walk-forward
    cutoff_date = group[date_column].max() - pd.Timedelta(weeks=eval_period)

    group["FB_forecast"] = np.nan
    group["FB_low"] = np.nan
    group["FB_high"] = np.nan

    train = group[group[date_column] <= cutoff_date].copy()
    test = group[group[date_column] > cutoff_date].copy()

    # Label used only in failure messages. Positional lookup: the index is not reset,
    # so a label-based [0] can raise KeyError inside the except handler below.
    if "key" in group.columns and len(group) > 0:
        key_label = group["key"].iloc[0]
    else:
        key_label = 'NA'

    def make_model():
        # Same configuration for the initial fit and every walk-forward step.
        return Prophet(
            weekly_seasonality=True,
            yearly_seasonality=True,
            daily_seasonality=False,
            interval_width=prophet_CI
        )

    # --- INITIAL FIT FOR TRAIN DATA ---
    prophet_train_initial = train.rename(columns={date_column: "ds", variable: "y"})
    try:
        model_initial = make_model()
        with suppress_stdout_stderr():
            model_initial.fit(prophet_train_initial)

        # Predict on the training dates to get historical bounds
        train_forecast = model_initial.predict(prophet_train_initial)

        # Map back to group (train rows keep their original index labels)
        train_indices = train.index
        group.loc[train_indices, "FB_forecast"] = train_forecast["yhat"].values
        group.loc[train_indices, "FB_low"] = train_forecast["yhat_lower"].clip(lower=0).values
        group.loc[train_indices, "FB_high"] = train_forecast["yhat_upper"].values

    except Exception as e:
        # Deliberate best-effort: TRAIN rows simply keep NaN forecasts/bounds.
        print(f"Initial Prophet fit failed: {e}")

    # --- WALK-FORWARD FOR TEST DATA ---
    for position, (i, row) in enumerate(test.iterrows()):
        prophet_train = train.rename(columns={date_column: "ds", variable: "y"})
        try:
            model = make_model()

            with suppress_stdout_stderr():
                model.fit(prophet_train)

            future = pd.DataFrame({"ds": [row[date_column]]})
            fc = model.predict(future).iloc[0]

            group.loc[i, "FB_forecast"] = fc["yhat"]
            group.loc[i, "FB_low"] = max(fc["yhat_lower"], 0)
            group.loc[i, "FB_high"] = fc["yhat_upper"]

        except Exception as e:
            print(f"Prophet failed for KEY={key_label} on date={row[date_column]}: {e}")
            # Fallback to naive persistence; NaN when no history exists yet.
            last_val = train[variable].iloc[-1] if len(train) > 0 else np.nan
            group.loc[i, "FB_forecast"] = last_val
            group.loc[i, "FB_low"] = last_val
            group.loc[i, "FB_high"] = last_val

        # Update train for next iteration. Slicing the frame (instead of row.to_frame().T)
        # keeps the column dtypes rather than degrading them to object.
        train = pd.concat([train, test.iloc[[position]]], ignore_index=True)

    # --- UNIFIED ANOMALY DETECTION (Train + Test) ---
    group["FB_residual"] = group[variable] - group["FB_forecast"]

    # Classify row by row; a comprehension also behaves for an empty frame,
    # where DataFrame.apply(axis=1) would return a DataFrame instead of a Series.
    group["FB_anomaly"] = [
        classify(value, low, high)
        for value, low, high in zip(group[variable], group["FB_low"], group["FB_high"])
    ]

    group["is_FB_anomaly"] = group["FB_anomaly"] != 'none'

    # Label set
    group["set"] = "TRAIN"
    group.loc[group[date_column] > cutoff_date, "set"] = "TEST"

    return group
|