anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,15 @@
  import pandas as pd
  import numpy as np
  import statistics
+ from .Preprocessing import classify

  # # EWMA functions

- def ewma_forecast(train, alpha):
-     """Return last EWMA forecast value based on training data."""
+ """def ewma_forecast(train, alpha):
+     Return last EWMA forecast value based on training data.
      ewma = train.ewm(alpha=alpha, adjust=False).mean()
      return ewma.iloc[-1]
-
+ """

  """
  def ew_std(series, alpha):
@@ -42,13 +43,13 @@ def ew_std(series, alpha):
      # Std = sqrt(var)
      return np.sqrt(ewma_var.iloc[-1]) """

-
+ """
  def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_column, alpha, sigma, eval_period):

- """
+
      Rolling (expanding window) EWMA anomaly detection for a SINGLE GROUP ONLY.
      Expects `group` to already be filtered to one group.
- """
+

      group = group.sort_values(date_column).reset_index(drop=True)
      n = len(group)
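
The `ew_std` helper being commented out in this hunk computed an exponentially weighted standard deviation by hand (sqrt of an EWMA variance). As a hedged aside, not the package's code: pandas can produce the same family of statistic directly, which may be why the hand-rolled version is being retired.

```python
import pandas as pd

s = pd.Series([10.0, 12.0, 11.0, 13.0, 40.0])
alpha = 0.3
# Last exponentially weighted std of the history; comparable in spirit to
# sqrt(ewma_var.iloc[-1]) above, though bias handling may differ slightly.
ew_std_last = s.ewm(alpha=alpha, adjust=False).std().iloc[-1]
print(round(ew_std_last, 3))
```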
@@ -115,5 +116,102 @@ def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_colum
  final_output = pd.concat(results, ignore_index=True)
  # Type Safety Check: Ensure the date column is always datetime before returning
  final_output[date_column] = pd.to_datetime(final_output[date_column])
- return final_output
+ return final_output"""
+
+
+
+ def ewma_with_anomalies_rolling_group(group, group_columns, variable, date_column, alpha, sigma, eval_period):
+     """
+     Rolling (expanding window) EWMA anomaly detection for a SINGLE GROUP ONLY.
+     Expects `group` to already be filtered to one group.
+
+     # 📉 EWMA Rolling Anomaly Detection
+     ---
+
+     The `ewma_with_anomalies_rolling_group` function implements a **statistically weighted** approach to identifying outliers.
+     It uses an **expanding window** (walk-forward) strategy to adapt to recent trends while retaining a memory of historical data.
+
+     ## 📋 Functional Overview
+     This function calculates the **Exponentially Weighted Moving Average (EWMA)**, which assigns higher importance to recent observations.
+     By combining this forecast with a dynamic standard deviation "envelope," the function identifies points that deviate significantly from the expected trend.
+

+
+     ## 🧠 Core Logic Components
+
+     ### 1. Forecast Engine (`ewma_forecast`)
+     * **Weighting Mechanism:** Uses an `alpha` parameter (between 0 and 1) to determine the "decay" of information. A **higher alpha** makes the model more sensitive to recent changes.
+     * **Calculation:** Employs the formula:
+       $$EWMA_t = \\alpha \\cdot Y_t + (1 - \\alpha) \\cdot EWMA_{t-1}$$
+
+     ### 2. The Rolling Anomaly Loop
+     The function partitions data into **TRAIN** and **TEST** sets and iterates through the evaluation period:
+     * **Expanding Training Set:** For every evaluation point, the function uses all preceding data to recalculate the baseline.
+     * **Dynamic Thresholding:** **Upper Limit:** `Forecast + (Sigma * Standard Deviation)`;
+       **Lower Limit:** `max(Forecast - (Sigma * Standard Deviation), 0)`
+     * **Iterative Evaluation:** It forecasts exactly **one point ahead**, checks for an anomaly, and then moves that point into the training set for the next iteration.
+
+     ## 📤 Key Output Columns
+     The function returns a concatenated DataFrame containing:
+     * **`EWMA_forecast`**: The predicted value for that timestamp.
+     * **`STD`**: The standard deviation used to calculate the threshold.
+     * **`EWMA_high` / `EWMA_low`**: The dynamic boundaries (the "envelope") for the test period.
+     * **`set`**: Labels data as either **"TRAIN"** (historical baseline) or **"TEST"** (anomaly detection window).
+     * **`is_EWMA_anomaly`**: A boolean flag indicating whether the actual value fell outside the limits.
+
+     ## 💡 Usage Context
+     EWMA is ideal for **streaming-style data** or metrics that exhibit **level shifts**.
+     Because it weights recent data more heavily than a simple moving average, it adapts to new "normals" faster while still filtering out minor noise.
+
+     ---
+     ### ⚙️ Parameter Tuning
+     * **`alpha`**: Adjust this to control how quickly the model "forgets" old data (typical range: `0.1 - 0.3`).
+     * **`sigma`**: Adjust this to control sensitivity. A **lower sigma** flags more anomalies, while a **higher sigma** (e.g., `3.0`) flags only extreme outliers.
+     """
+
+     # 1. Prepare Data
+     group = group.sort_values(date_column).reset_index(drop=True)
+     vals = group[variable].astype(float)
+
+     # 2. Calculate Statistics (Vectorized)
+     # shift(1) ensures we use history only (no data leakage)
+     ewma_forecast = vals.ewm(alpha=alpha, adjust=False).mean().shift(1)
+     std_expanding = vals.expanding().std().shift(1)
+
+     # 3. Construct Output DataFrame
+     results = group[group_columns + [date_column]].copy()
+     results[variable] = vals
+     results["alpha"] = alpha
+     results["sigma"] = sigma
+
+     # 4. Handle nulls in the first two rows (the "backfill" logic)
+     # Backfilling gives us a baseline even for the very first point
+     results["EWMA_forecast"] = ewma_forecast.bfill()
+     results["STD"] = std_expanding.bfill().fillna(0)  # fillna(0) in case there's only 1 row total
+
+     # 5. Define Bounds (now that nulls are handled)
+     results["EWMA_high"] = results["EWMA_forecast"] + (sigma * results["STD"])
+     results["EWMA_low"] = (results["EWMA_forecast"] - (sigma * results["STD"])).clip(lower=0)
+
+     # 6. Use the classify function
+     # Note: `classify` is imported from .Preprocessing at the top of this module
+     results["EWMA_anomaly"] = results.apply(
+         lambda row: classify(row[variable], row["EWMA_low"], row["EWMA_high"]),
+         axis=1
+     )
+
+     # The first row's forecast was backfilled, so force it to 'none':
+     # it is not a "real" statistical forecast.
+     results.loc[0, "EWMA_anomaly"] = 'none'
+
+     # 7. Final Flags and Labels
+     results["is_EWMA_anomaly"] = results["EWMA_anomaly"] != 'none'
+     results["EWMA_residual"] = vals - results["EWMA_forecast"]
+
+     results["set"] = "TRAIN"
+     if eval_period > 0 and len(results) >= eval_period:
+         results.iloc[-eval_period:, results.columns.get_loc("set")] = "TEST"
+
+     results[date_column] = pd.to_datetime(results[date_column])
+
+     return results
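
The envelope logic added above is compact enough to sanity-check in isolation. Below is a minimal, self-contained sketch of the same forecast/threshold math on synthetic data; it is illustrative only (it re-derives the bounds with plain pandas rather than calling the package, and the seed, spike index, `alpha`, and `sigma` values are arbitrary choices).

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
vals = pd.Series(rng.normal(100.0, 5.0, 60))
vals.iloc[45] = 160.0  # inject an obvious spike

alpha, sigma = 0.2, 3.0
forecast = vals.ewm(alpha=alpha, adjust=False).mean().shift(1)  # shift(1): forecast uses history only
std = vals.expanding().std().shift(1)

high = forecast + sigma * std
low = (forecast - sigma * std).clip(lower=0)
is_anomaly = (vals > high) | (vals < low)

# The injected spike at index 45 should be flagged; the earliest points can also
# fire while the expanding std is still warming up.
print(is_anomaly[is_anomaly].index.tolist())
```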
@@ -5,6 +5,7 @@ from prophet import Prophet
  import warnings
  import os
  import sys
+ from .Preprocessing import classify
  from contextlib import contextmanager

  warnings.filterwarnings("ignore")
@@ -21,13 +22,15 @@ def suppress_stdout_stderr():
      finally:
          sys.stdout, sys.stderr = old_stdout, old_stderr

+ """
  def detect_time_series_anomalies_fb_walkforward(
      group,
      variable,
      date_column,
      eval_period,
-     interval_width
+     prophet_CI
  ):
+
      # 1. Silence the cmdstanpy logger completely
      logger = logging.getLogger('cmdstanpy')
      logger.addHandler(logging.NullHandler())
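
Only the tail of `suppress_stdout_stderr` is visible in this hunk. For orientation, here is a hedged reconstruction of what such a muter typically looks like; this is an assumption based on the visible `finally` block and the `contextmanager` import, not the package's verbatim source.

```python
import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr():
    # Redirect both streams to os.devnull, restoring them on exit
    old_stdout, old_stderr = sys.stdout, sys.stderr
    devnull = open(os.devnull, "w")
    try:
        sys.stdout, sys.stderr = devnull, devnull
        yield
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr
        devnull.close()
```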
@@ -54,7 +57,8 @@ def detect_time_series_anomalies_fb_walkforward(
      weekly_seasonality=True,
      yearly_seasonality=True,
      daily_seasonality=False,
-     interval_width=interval_width
+     interval_width=prophet_CI,
+     # prophet_CI=prophet_CI
  )

  # --- WRAP THE FIT IN THE MUTER ---
@@ -91,4 +95,148 @@ def detect_time_series_anomalies_fb_walkforward(
      group.loc[train_mask, "FB_residual"] = np.nan
      group.loc[train_mask, "is_FB_anomaly"] = np.nan

+     return group
+ """
+
+ def detect_time_series_anomalies_fb_walkforward(
+     group,
+     variable,
+     date_column,
+     eval_period,
+     prophet_CI
+ ):
+     """
+     # 🚀 Facebook Prophet Walk-Forward Model
+     ---
+
+     The `detect_time_series_anomalies_fb_walkforward` function is a forecasting tool designed for **iterative anomaly detection**. It uses the Facebook Prophet library to perform **walk-forward validation**, forecasting one data point at a time and expanding the training set as it progresses.
+
+     ## 📋 Functional Overview
+     Unlike standard batch forecasting, this function simulates a real-world scenario in which the model is updated as soon as new data arrives. It establishes a **cutoff date** based on the specified `eval_period`, then iteratively predicts the next point, compares it to the observed value, and incorporates that value back into the training history.
+
+     ## 🧠 Core Logic Stages
+
+     ### 1. Data Preparation and Cutoff
+     * **Standardization:** The input data is sorted by date and converted to **datetime objects** to ensure proper time-series alignment.
+     * **Partitioning:** The dataset is split into an **Initial Training Set** (all data before the cutoff) and an **Evaluation Set** (the rolling forecast window).
+
+     ### 2. Walk-Forward Loop (Sequential Testing)
+     * **Model Fitting:** For every point in the evaluation set, a new **Prophet model** is initialized with weekly and yearly seasonality enabled.
+     * **One-Step Forecast:** The model generates a prediction (`yhat`) and an uncertainty interval (`yhat_lower`, `yhat_upper`) for the **next single point** only.
+     * **Dynamic Training Expansion:** After each prediction, the actual observed value is appended to the training data, so the model learns from the most recent information before making the next prediction.
+     * **Robust Error Handling:** If the Prophet fit fails, the function falls back to a **baseline persistence model** (last observed value) to prevent pipeline failure.
+
+     ### 3. Anomaly Classification
+     * **Uncertainty Bounds:** Anomalies are defined by the `prophet_CI` parameter. Any observation falling outside the predicted upper or lower bounds is flagged.
+     * **Residual Calculation:** The function computes the **FB_residual** (Actual - Forecast) to quantify the magnitude of deviations.
+
+     ## 📤 Key Output Columns
+     The function appends the following columns to the returned DataFrame:
+     * **`FB_forecast`**: The point estimate predicted by Prophet for that date.
+     * **`FB_low` / `FB_high`**: The dynamic boundaries based on the specified uncertainty interval.
+     * **`FB_residual`**: The difference between the actual observed metric and the forecast.
+     * **`FB_anomaly`**: A categorical label designating the deviation as **"high"** or **"low"**.
+     * **`is_FB_anomaly`**: A boolean flag identifying outliers in the evaluation region.
+
+
+     ## 💡 Usage Context
+     This approach is highly effective for metrics with **strong seasonality and complex trends**. Because it uses a walk-forward loop, it is significantly more accurate than a static forecast over long evaluation periods, as it corrects itself based on the most recent trends. It is well suited to detecting "sudden" shifts that standard statistical models (like Z-score) might miss.
+
+     ---
+     ### 📊 Evaluation Strategy
+     This function strictly ignores the training region for anomaly reporting, ensuring that all reported anomalies are based on "out-of-sample" performance where the model had no prior knowledge of the specific data point being tested.
+
+     """
+
+     # 1. Silence the cmdstanpy logger
+     logger = logging.getLogger('cmdstanpy')
+     logger.addHandler(logging.NullHandler())
+     logger.propagate = False
+     logger.setLevel(logging.CRITICAL)
+
+     group = group.sort_values(date_column).copy()
+     group[date_column] = pd.to_datetime(group[date_column])
+
+     # Calculate cutoff for the walk-forward
+     cutoff_date = group[date_column].max() - pd.Timedelta(weeks=eval_period)
+
+     group["FB_forecast"] = np.nan
+     group["FB_low"] = np.nan
+     group["FB_high"] = np.nan
+
+     train = group[group[date_column] <= cutoff_date].copy()
+     test = group[group[date_column] > cutoff_date].copy()
+
+     # --- INITIAL FIT FOR TRAIN DATA ---
+     prophet_train_initial = train.rename(columns={date_column: "ds", variable: "y"})
+     try:
+         model_initial = Prophet(
+             weekly_seasonality=True,
+             yearly_seasonality=True,
+             daily_seasonality=False,
+             interval_width=prophet_CI  # Fixed: Prophet uses interval_width
+         )
+         with suppress_stdout_stderr():
+             model_initial.fit(prophet_train_initial)
+
+         # Predict on the training dates to get historical bounds
+         train_forecast = model_initial.predict(prophet_train_initial)
+
+         # Map back to group (train indices)
+         train_indices = group[group[date_column] <= cutoff_date].index
+         group.loc[train_indices, "FB_forecast"] = train_forecast["yhat"].values
+         group.loc[train_indices, "FB_low"] = train_forecast["yhat_lower"].clip(lower=0).values
+         group.loc[train_indices, "FB_high"] = train_forecast["yhat_upper"].values
+
+     except Exception as e:
+         print(f"Initial Prophet fit failed: {e}")
+
+     # --- WALK-FORWARD FOR TEST DATA ---
+     for i, row in test.iterrows():
+         prophet_train = train.rename(columns={date_column: "ds", variable: "y"})
+         try:
+             model = Prophet(
+                 weekly_seasonality=True,
+                 yearly_seasonality=True,
+                 daily_seasonality=False,
+                 interval_width=prophet_CI
+             )
+
+             with suppress_stdout_stderr():
+                 model.fit(prophet_train)
+
+             future = pd.DataFrame({"ds": [row[date_column]]})
+             fc = model.predict(future).iloc[0]
+
+             group.loc[i, "FB_forecast"] = fc["yhat"]
+             group.loc[i, "FB_low"] = max(fc["yhat_lower"], 0)
+             group.loc[i, "FB_high"] = fc["yhat_upper"]
+
+         except Exception as e:
+             print(f"Prophet failed for KEY={group['key'].iloc[0] if 'key' in group.columns else 'NA'} on date={row[date_column]}: {e}")
+             # Fallback to naive persistence logic
+             last_val = train[variable].iloc[-1]
+             group.loc[i, "FB_forecast"] = last_val
+             group.loc[i, "FB_low"] = last_val
+             group.loc[i, "FB_high"] = last_val
+
+         # Update train for next iteration
+         new_train_row = row.to_frame().T
+         train = pd.concat([train, new_train_row], ignore_index=True)
+
+     # --- UNIFIED ANOMALY DETECTION (Train + Test) ---
+     group["FB_residual"] = group[variable] - group["FB_forecast"]
+
+     # Apply the custom classify function row by row
+     group["FB_anomaly"] = group.apply(
+         lambda row: classify(row[variable], row["FB_low"], row["FB_high"]),
+         axis=1
+     )
+
+     group["is_FB_anomaly"] = group["FB_anomaly"] != 'none'
+
+     # Label set
+     group["set"] = "TRAIN"
+     group.loc[group[date_column] > cutoff_date, "set"] = "TEST"
+
      return group
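
To make the walk-forward mechanics concrete, here is a compact, self-contained sketch of the same expanding-window loop. It is illustrative only: `walk_forward_bounds` is a hypothetical helper, not part of this package, and it assumes `prophet` and `pandas` are installed and that the input frame already uses Prophet's `ds`/`y` column names.

```python
import pandas as pd
from prophet import Prophet

def walk_forward_bounds(df: pd.DataFrame, eval_points: int, ci: float = 0.95) -> pd.DataFrame:
    """Refit Prophet once per held-out point, expanding the training window each step."""
    train = df.iloc[:-eval_points].copy()  # df must have Prophet's 'ds'/'y' columns
    rows = []
    for _, actual in df.iloc[-eval_points:].iterrows():
        m = Prophet(weekly_seasonality=True, yearly_seasonality=True,
                    daily_seasonality=False, interval_width=ci)
        m.fit(train)
        fc = m.predict(pd.DataFrame({"ds": [actual["ds"]]})).iloc[0]
        low, high = max(fc["yhat_lower"], 0), fc["yhat_upper"]
        rows.append({"ds": actual["ds"], "y": actual["y"], "yhat": fc["yhat"],
                     "low": low, "high": high,
                     "is_anomaly": not (low <= actual["y"] <= high)})
        # One-step expansion: the observed point joins the history before the next fit
        train = pd.concat([train, actual.to_frame().T], ignore_index=True)
    return pd.DataFrame(rows)
```

In the package itself, the equivalent loop additionally classifies each point via the imported `classify` helper and labels the TRAIN/TEST regions, as shown in the diff above.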