anomaly-pipeline 0.1.27__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import plotly.express as px
6
6
 
7
7
  def initialize_fig(group, group_columns, variable, date_column, anomaly_detection_model):
8
8
 
9
- plot_title = " -- ".join(list(group[group_columns].values[0])).upper() + " -- " + anomaly_detection_model
9
+ plot_title = " - ".join(list(group[group_columns].values[0])).upper() + " -- " + anomaly_detection_model
10
10
 
11
11
  fig = go.Figure()
12
12
 
@@ -19,6 +19,21 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
19
19
  name=variable if variable == variable.upper() else variable.title(),
20
20
  ))
21
21
 
22
+ # --- Calculate X-Axis Padding (One Period) ---
23
+ dates = group[date_column].sort_values()
24
+ min_date = dates.min()
25
+ max_date = dates.max()
26
+
27
+ if len(dates) > 1:
28
+ # Calculate the most common time difference to determine the period
29
+ period = dates.diff().mode().iloc[0]
30
+ else:
31
+ period = pd.Timedelta(days=1)
32
+
33
+ # Apply padding
34
+ range_min = min_date - period
35
+ range_max = max_date + period
36
+
22
37
  fig.update_layout(
23
38
  title=dict(
24
39
  text=plot_title,
@@ -34,7 +49,7 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
34
49
  plot_bgcolor='snow',
35
50
  paper_bgcolor='whitesmoke',
36
51
  xaxis=dict(
37
- range=[group[date_column].min(), group[date_column].max()],
52
+ range=[range_min, range_max],
38
53
  showline=True,
39
54
  linewidth=0.5,
40
55
  linecolor='orange',
@@ -52,7 +67,7 @@ def initialize_fig(group, group_columns, variable, date_column, anomaly_detectio
52
67
  mirror=True
53
68
  ),
54
69
  yaxis_title=dict(
55
- text=variable if variable == variable.upper() else variable.title(),
70
+ text=variable.replace('_', ' ') if variable == variable.upper() else variable.title().replace('_', ' '),
56
71
  font=dict(size=16, weight='bold', color='black')
57
72
  ),
58
73
  legend=dict(
@@ -139,16 +154,45 @@ def add_eval_period_highlight(fig, group, date_column, variable, eval_period):
139
154
  color='rgba(0, 255, 0, 0.25)', # 'lime' with 0.25 alpha
140
155
  width=10
141
156
  ),
142
- name='Evalution Period',
157
+ name='Evaluation Period',
143
158
  hoverinfo='skip',
144
159
  ))
145
160
  return fig
146
161
 
147
162
 
148
- def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=12, show_anomaly_scores_on_main_plot=False):
163
+ def anomaly_overview_plot(group, group_columns, variable, date_column, eval_period, show_anomaly_scores_on_main_plot=False):
149
164
 
150
165
  # IS ANOMALY Plot
151
166
  # This is the main plot
167
+ """
168
+ Generates an ensemble anomaly evaluation plot using Plotly.
169
+
170
+ This function aggregates multiple anomaly detection models (columns starting with 'is_'
171
+ and ending with '_anomaly') to create a consensus 'Anomaly Score'. It visualizes
172
+ actual values, mean, median, and highlights points where the ensemble of models
173
+ agrees there is an anomaly.
174
+
175
+ Args:
176
+ group (pd.DataFrame): The processed dataframe containing original data and
177
+ boolean anomaly flags from various models (e.g., 'is_FB_anomaly').
178
+ group_columns (list): List of column names used to identify the group
179
+ (e.g., ['Region', 'Product']).
180
+ variable (str): The name of the numeric column being analyzed.
181
+ date_column (str): The name of the datetime column.
182
+ eval_period (int, optional): The number of recent periods evaluated. Defaults to 12.
183
+ show_anomaly_scores_on_main_plot (bool, optional): If True, adds a secondary
184
+ Y-axis bar chart showing the normalized ensemble score (-100 to 100).
185
+ Defaults to False.
186
+
187
+ Logic:
188
+ - Voting: Counts all columns matching 'is_*_anomaly'.
189
+ - is_Anomaly: True if >= 50% of the active models flag the point.
190
+ - Anomaly Score: A normalized metric where 100 represents total consensus
191
+ among all models and negative values represent low-risk points.
192
+
193
+ Returns:
194
+ None: Displays an interactive Plotly figure.
195
+ """
152
196
  try:
153
197
  group = group.copy()
154
198
 
@@ -157,24 +201,20 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
157
201
  if col.startswith('is_') and col.endswith('_anomaly') and col != 'is_anomaly':
158
202
  anomaly_cols.append(col)
159
203
  group['Anomaly Vote Models'] = group.apply(
160
- lambda row: sorted([col.removeprefix('is_').removesuffix('_anomaly')
204
+ lambda row: ([
205
+ 'IF' if 'IsolationForest' in col else
206
+ 'PCNTL' if 'Percentile' in col else
207
+ col.removeprefix('is_').removesuffix('_anomaly')
161
208
  for col in anomaly_cols
162
- if pd.notna(row[col]) and row[col] == True]),
209
+ if pd.notna(row[col]) and row[col] == True
210
+ ]),
163
211
  axis=1)
164
212
  group['Anomaly Vote Models'] = group['Anomaly Vote Models'].apply(lambda x: ', '.join(x))
165
- group['Anomaly_Votes'] = group[anomaly_cols].sum(axis=1).astype(int)
166
- group['Vote_Cnt'] = group[anomaly_cols].replace(False, True).sum(axis=1).astype(int)
167
- group['Anomaly_Votes_Display'] = group['Anomaly_Votes'].astype(str) + ' out of ' + group['Vote_Cnt'].astype(str)
168
- group['is_Anomaly'] = np.where(group['Anomaly_Votes']/group['Vote_Cnt'] >= 0.5, True, False)
169
- group['Anomaly_Score'] = 2 * (group['Anomaly_Votes']/group['Vote_Cnt'] - 0.5).astype(float)
170
- group['Anomaly_Score_Display'] = np.where(group['Anomaly_Score'] < 0, np.floor(100*group['Anomaly_Score']),
171
- np.where(group['Anomaly_Score'] > 0, np.ceil(100*group['Anomaly_Score']),
172
- 1)).astype(float)
173
213
  group['Mean'] = group[variable].mean()
174
214
  group['Median'] = group[variable].median()
175
215
 
176
- fig = initialize_fig(group, group_columns, variable, date_column, "Anomalies")
177
-
216
+ fig = initialize_fig(group, group_columns, variable, date_column, "Anomalies Overview Plot")
217
+
178
218
  # Mean
179
219
  fig.add_trace(go.Scatter(
180
220
  x=group[date_column],
@@ -202,7 +242,11 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
202
242
  x=group[group['is_Anomaly'] == True][date_column],
203
243
  y=group[group['is_Anomaly'] == True][variable],
204
244
  mode='markers',
205
- marker=dict(color='red', symbol='circle', line=dict(width=1), size=5*(group[group['is_Anomaly'] == True]['Anomaly_Score'] + 2)),
245
+ marker=dict(color='crimson',
246
+ symbol='circle',
247
+ line=dict(width=1),
248
+ size=10*(group[group['is_Anomaly'] == True]['Anomaly_Votes']) ** (1/4)
249
+ ),
206
250
  name='Anomalies',
207
251
  customdata=group[group['is_Anomaly'] == True][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
208
252
  hovertemplate=(
@@ -222,7 +266,8 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
222
266
  marker=dict(color='orange',
223
267
  symbol='circle',
224
268
  line=dict(width=1),
225
- size=5*(group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)]['Anomaly_Score'] + 2)),
269
+ size=8*(group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)]['Anomaly_Votes']) ** (1/4)
270
+ ),
226
271
  name='Not Quite Anomalies',
227
272
  customdata=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] >= 1)][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
228
273
  hovertemplate=(
@@ -234,6 +279,22 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
234
279
  )
235
280
  ))
236
281
 
282
+ # Not Anomalies
283
+ fig.add_trace(go.Scatter(
284
+ x=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][date_column],
285
+ y=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][variable],
286
+ mode='markers',
287
+ marker=dict(color='lightgray',
288
+ symbol='circle',
289
+ line=dict(width=0),
290
+ size=6),
291
+ name='Normal',
292
+ customdata=group[(group['is_Anomaly'] == False) & (group['Anomaly_Votes'] == 0)][['Anomaly_Votes_Display', 'Anomaly Vote Models', 'Anomaly_Score_Display']],
293
+ hovertemplate=(
294
+ f'Date: %{{x|%Y-%m-%d}}<br>' +
295
+ f'{variable if variable == variable.upper() else variable.title()}: %{{y:,d}}<br>'
296
+ )))
297
+
237
298
  # Add Anomaly Scores to Secondary Axis
238
299
  if show_anomaly_scores_on_main_plot:
239
300
  fig.add_trace(go.Bar(
@@ -274,8 +335,38 @@ def anomaly_eval_plot(group, group_columns, variable, date_column, eval_period=1
274
335
  print(f"Anomaly Plot Failed: {e}")
275
336
 
276
337
 
277
- def anomaly_percentile_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
338
+ def anomaly_percentile_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
278
339
  # Percentile Model Plot
340
+ """
341
+ Visualizes anomaly detection based on Percentile-derived thresholds.
342
+
343
+ This function plots the time-series data alongside shaded regions representing
344
+ the upper and lower percentile boundaries. It highlights specific 'Percentile'
345
+ model anomalies and can optionally overlay the final consensus anomalies.
346
+
347
+ Args:
348
+ group (pd.DataFrame): Dataframe containing the time-series data and
349
+ calculated percentile columns ('Percentile_low', 'Percentile_high',
350
+ and 'is_Percentile_anomaly').
351
+ group_columns (list): Column names used for grouping/title identification.
352
+ variable (str): The numeric column name being plotted on the Y-axis.
353
+ date_column (str): The datetime column name for the X-axis.
354
+ final_anomalies (bool, optional): If True, overlays the final ensemble
355
+ consensus markers (red circles) on top of the model-specific markers.
356
+ Defaults to True.
357
+ eval_period (int, optional): The look-back period used for the evaluation
358
+ context. Defaults to 12.
359
+
360
+ Logic:
361
+ - Shading: Uses `add_anomaly_region` to fill the area beyond 'Percentile_low'
362
+ and 'Percentile_high'.
363
+ - Model Markers: Highlights points where 'is_Percentile_anomaly' is True.
364
+ - Integration: Uses helper functions `initialize_fig`, `add_anomaly_region`,
365
+ and `add_model_anomalies` to maintain a consistent UI/UX.
366
+
367
+ Returns:
368
+ None: Displays an interactive Plotly figure.
369
+ """
279
370
  try:
280
371
  group = group.copy()
281
372
  fig = initialize_fig(group, group_columns, variable, date_column, "Percentile Anomaly Detection")
@@ -294,8 +385,41 @@ def anomaly_percentile_plot(group, group_columns, variable, date_column, final_a
294
385
  print(f"Percentile Anomaly Plot Failed: {e}")
295
386
 
296
387
 
297
- def anomaly_sd_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
388
+ def anomaly_sd_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
298
389
  # SD Model Plot
390
+ """
391
+ Visualizes anomaly detection based on Standard Deviation (SD) thresholds.
392
+
393
+ This function plots the time-series data and overlays shaded regions representing
394
+ statistical boundaries (typically 2 or 3 standard deviations from the mean).
395
+ It identifies 'SD' model-specific anomalies and can optionally display the
396
+ final ensemble consensus markers.
397
+
398
+ Args:
399
+ group (pd.DataFrame): Dataframe containing the time-series data and
400
+ calculated SD boundary columns ('SD2_low', 'SD2_high', and
401
+ 'is_SD_anomaly').
402
+ group_columns (list): Column names used for grouping/title identification.
403
+ variable (str): The numeric column name being plotted on the Y-axis.
404
+ date_column (str): The datetime column name for the X-axis.
405
+ final_anomalies (bool, optional): If True, overlays the final ensemble
406
+ consensus markers (red circles) on top of the SD model markers.
407
+ Defaults to True.
408
+ eval_period (int, optional): The look-back period used for the evaluation
409
+ context. Defaults to 12.
410
+
411
+ Logic:
412
+ - Shading: Utilizes `add_anomaly_region` to fill the areas outside the
413
+ 'SD2_low' and 'SD2_high' thresholds, visually representing the
414
+ statistical "outlier zones."
415
+ - Model Markers: Highlights points where the SD model specifically
416
+ triggered an anomaly flag.
417
+ - Visualization Helpers: Relies on `initialize_fig`, `add_anomaly_region`,
418
+ and `add_model_anomalies` for UI consistency across the pipeline.
419
+
420
+ Returns:
421
+ None: Displays an interactive Plotly figure and prints a newline.
422
+ """
299
423
  try:
300
424
  group = group.copy()
301
425
  fig = initialize_fig(group, group_columns, variable, date_column, "SD Anomaly Detection")
@@ -314,8 +438,41 @@ def anomaly_sd_plot(group, group_columns, variable, date_column, final_anomalies
314
438
  print(f"SD Anomaly Plot Failed: {e}")
315
439
 
316
440
 
317
- def anomaly_mad_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
441
+ def anomaly_mad_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
318
442
  # MAD Model Plot
443
+ """
444
+ Visualizes anomaly detection based on Median Absolute Deviation (MAD).
445
+
446
+ MAD is a robust measure of statistical dispersion. This plot displays the
447
+ time-series data with shaded thresholds derived from the median and
448
+ the MAD scale factor. It is particularly effective for datasets where
449
+ mean and standard deviation are heavily skewed by extreme outliers.
450
+
451
+ Args:
452
+ group (pd.DataFrame): Dataframe containing the time-series data and
453
+ calculated MAD boundary columns ('MAD_low', 'MAD_high', and
454
+ 'is_MAD_anomaly').
455
+ group_columns (list): Column names used for grouping/title identification.
456
+ variable (str): The numeric column name being plotted on the Y-axis.
457
+ date_column (str): The datetime column name for the X-axis.
458
+ final_anomalies (bool, optional): If True, overlays the final ensemble
459
+ consensus markers (red circles) on top of the MAD model markers.
460
+ Defaults to True.
461
+ eval_period (int, optional): The look-back period used for the evaluation
462
+ context. Defaults to 12.
463
+
464
+ Logic:
465
+ - Shading: Highlights the areas outside the 'MAD_low' and 'MAD_high'
466
+ thresholds. Because MAD uses the median as a baseline, these bands
467
+ are often tighter and more resistant to outlier-driven "threshold bloat."
468
+ - Model Markers: Specifically plots points flagged by the 'is_MAD_anomaly'
469
+ logic.
470
+ - Helper Integration: Uses `initialize_fig` for layout and `add_anomalies`
471
+ for consensus overlay.
472
+
473
+ Returns:
474
+ None: Displays an interactive Plotly figure.
475
+ """
319
476
  try:
320
477
  group = group.copy()
321
478
  fig = initialize_fig(group, group_columns, variable, date_column, "MAD Anomaly Detection")
@@ -334,7 +491,38 @@ def anomaly_mad_plot(group, group_columns, variable, date_column, final_anomalie
334
491
  print(f"MAD Anomaly Plot Failed: {e}")
335
492
 
336
493
 
337
- def anomaly_iqr_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
494
+ def anomaly_iqr_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
495
+ """
496
+ Visualizes anomaly detection based on the Interquartile Range (IQR).
497
+
498
+ This function utilizes the Tukey's Fences method to identify outliers. It
499
+ calculates the spread between the 25th (Q1) and 75th (Q3) percentiles to
500
+ establish 'Normal' bounds. It is highly effective for skewed data as it
501
+ does not assume a normal distribution.
502
+
503
+ Args:
504
+ group (pd.DataFrame): Dataframe containing the time-series data and
505
+ calculated IQR boundary columns ('IQR_low', 'IQR_high', and
506
+ 'is_IQR_anomaly').
507
+ group_columns (list): Column names used for grouping/title identification.
508
+ variable (str): The numeric column name being plotted on the Y-axis.
509
+ date_column (str): The datetime column name for the X-axis.
510
+ final_anomalies (bool, optional): If True, overlays the final ensemble
511
+ consensus markers (red circles) on top of the IQR-specific markers.
512
+ Defaults to True.
513
+ eval_period (int, optional): The look-back period used for the evaluation
514
+ context. Defaults to 12.
515
+
516
+ Logic:
517
+ - Shading: Fills the region below Q1 - 1.5*IQR and above Q3 + 1.5*IQR.
518
+ - Robustness: Because it uses quartiles rather than mean/SD, it is
519
+ resistant to being "fooled" by the outliers it is trying to detect.
520
+ - Consistency: Uses the standard suite of helpers (`initialize_fig`,
521
+ `add_anomaly_region`) to match the rest of the pipeline's visual style.
522
+
523
+ Returns:
524
+ None: Displays an interactive Plotly figure.
525
+ """
338
526
  # IQR Model Plot
339
527
  try:
340
528
  group = group.copy()
@@ -354,7 +542,38 @@ def anomaly_iqr_plot(group, group_columns, variable, date_column, final_anomalie
354
542
  print(f"IQR Anomaly Plot Failed: {e}")
355
543
 
356
544
 
357
- def anomaly_ewma_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
545
+ def anomaly_ewma_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
546
+ """
547
+ Visualizes anomaly detection based on Exponentially Weighted Moving Average (EWMA).
548
+
549
+ This plot highlights anomalies using a moving baseline that gives more weight to
550
+ recent observations. It visualizes the EWMA forecast line, the calculated upper
551
+ and lower control limits (bands), and model-specific outliers. It is ideal for
552
+ detecting shifts in mean or variance in non-stationary time series.
553
+
554
+ Args:
555
+ group (pd.DataFrame): Dataframe containing the time-series data and
556
+ EWMA-specific columns ('EWMA_forecast', 'EWMA_low', 'EWMA_high',
557
+ and 'is_EWMA_anomaly').
558
+ group_columns (list): Column names used for grouping and plot titles.
559
+ variable (str): The name of the target numeric column.
560
+ date_column (str): The name of the datetime column.
561
+ final_anomalies (bool, optional): If True, overlays the final ensemble
562
+ consensus markers (red circles) on top of the EWMA markers.
563
+ Defaults to True.
564
+ eval_period (int, optional): The number of recent periods evaluated.
565
+ Used for context in title or scaling. Defaults to 12.
566
+
567
+ Logic:
568
+ - Forecast Line: Displays the weighted moving average ('slateblue').
569
+ - Dynamic Thresholds: Visualizes 'EWMA_low' and 'EWMA_high' as 'orangered'
570
+ dashdot lines with light red shading in the outlier zones.
571
+ - Model Markers: Highlights points where the EWMA logic specifically
572
+ triggered an anomaly flag.
573
+
574
+ Returns:
575
+ None: Displays an interactive Plotly figure.
576
+ """
358
577
  # EWMA Model Plot
359
578
  try:
360
579
  group = group.copy()
@@ -425,7 +644,39 @@ def anomaly_ewma_plot(group, group_columns, variable, date_column, final_anomali
425
644
  print(f"EWMA Anomaly Plot Failed: {e}")
426
645
 
427
646
 
428
- def anomaly_fb_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
647
+ def anomaly_fb_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
648
+ """
649
+ Visualizes anomaly detection using the Facebook Prophet (FB) model.
650
+
651
+ This function displays the Prophet model's additive trend and seasonality
652
+ forecasts along with its uncertainty intervals (yhat_upper and yhat_lower).
653
+ It is particularly useful for identifying anomalies in data with strong
654
+ seasonality (weekly/yearly) that simpler statistical models might miss.
655
+
656
+ Args:
657
+ group (pd.DataFrame): Dataframe containing Prophet output columns
658
+ ('FB_forecast', 'FB_low', 'FB_high', and 'is_FB_anomaly').
659
+ group_columns (list): Column names used to identify and title the group.
660
+ variable (str): The name of the target numeric column analyzed.
661
+ date_column (str): The name of the datetime column.
662
+ final_anomalies (bool, optional): If True, overlays the final ensemble
663
+ consensus markers (red circles) over the Prophet markers.
664
+ Defaults to True.
665
+ eval_period (int, optional): The number of recent periods analyzed.
666
+ Defaults to 12.
667
+
668
+ Logic:
669
+ - Recursive Visibility: Since FB Prophet is run in a walk-forward manner,
670
+ the shaded regions represent the prediction interval at the time
671
+ of forecast.
672
+ - Outlier Zones: Shaded red areas represent values that fall outside
673
+ the model's expected confidence interval (based on `prophet_CI`).
674
+ - Model Markers: Highlights points where Prophet specifically flagged
675
+ an anomaly based on its trend and seasonal expectations.
676
+
677
+ Returns:
678
+ None: Displays an interactive Plotly figure.
679
+ """
429
680
  # FB Prophet Model Plot
430
681
  try:
431
682
  group = group.copy()
@@ -493,10 +744,42 @@ def anomaly_fb_plot(group, group_columns, variable, date_column, final_anomalies
493
744
  fig.show()
494
745
  print("\n")
495
746
  except Exception as e:
496
- print(f"EWMA Anomaly Plot Failed: {e}")
747
+ print(f"FB Anomaly Plot Failed: {e}")
748
+
749
+
750
+ def anomaly_dbscan_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
751
+ """
752
+ Visualizes anomaly detection using the DBSCAN clustering algorithm.
497
753
 
754
+ DBSCAN identifies anomalies as 'noise' points that reside in low-density
755
+ regions of the feature space. Unlike threshold-based methods, DBSCAN
756
+ looks for multi-dimensional patterns. This plot highlights points
757
+ flagged as noise by the algorithm, contextually placed within the
758
+ time-series trend.
498
759
 
499
- def anomaly_dbscan_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
760
+ Args:
761
+ group (pd.DataFrame): Dataframe containing the time-series data and
762
+ DBSCAN results (specifically the 'is_DBSCAN_anomaly' column).
763
+ group_columns (list): Column names used to identify and title the group.
764
+ variable (str): The name of the target numeric column analyzed.
765
+ date_column (str): The name of the datetime column.
766
+ final_anomalies (bool, optional): If True, overlays the final ensemble
767
+ consensus markers (red circles) over the DBSCAN markers.
768
+ Defaults to True.
769
+ eval_period (int, optional): The number of recent periods to highlight
770
+ as the evaluation window. Defaults to 12.
771
+
772
+ Logic:
773
+ - Density Clustering: Points are flagged as anomalies if they are
774
+ isolated from the main "clusters" of data points in the feature space.
775
+ - Eval Period Highlight: Uses `add_eval_period_highlight` to visually
776
+ distinguish the recent testing window from the historical training data.
777
+ - Model Markers: Highlights specific DBSCAN outliers using 'mediumorchid'
778
+ circles.
779
+
780
+ Returns:
781
+ None: Displays an interactive Plotly figure.
782
+ """
500
783
  # DBSCAN Model Plot
501
784
  try:
502
785
  group = group.copy()
@@ -521,7 +804,38 @@ def anomaly_dbscan_plot(group, group_columns, variable, date_column, final_anoma
521
804
  print(f"DBSCAN Anomaly Plot Failed: {e}")
522
805
 
523
806
 
524
- def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, date_column, final_anomalies=True, eval_period=12):
807
+ def anomaly_isolation_forest_plot(group, group_columns, variable, date_column, eval_period, final_anomalies=True):
808
+ """
809
+ Visualizes anomaly detection using the Isolation Forest algorithm.
810
+
811
+ Isolation Forest is an unsupervised learning algorithm that isolates anomalies
812
+ by randomly selecting a feature and a split value. Since anomalies are few
813
+ and different, they are easier to isolate (shorter path length in the tree).
814
+ This plot shows points identified as anomalies based on this branching logic.
815
+
816
+ Args:
817
+ group (pd.DataFrame): Dataframe containing time-series data and
818
+ Isolation Forest results (specifically 'is_IsolationForest_anomaly_timeseries').
819
+ group_columns (list): Column names used to identify and title the group.
820
+ variable (str): The name of the target numeric column analyzed.
821
+ date_column (str): The name of the datetime column.
822
+ final_anomalies (bool, optional): If True, overlays the final ensemble
823
+ consensus markers (red circles) over the Isolation Forest markers.
824
+ Defaults to True.
825
+ eval_period (int, optional): The number of recent periods to highlight
826
+ as the evaluation window. Defaults to 12.
827
+
828
+ Logic:
829
+ - Tree-Based Isolation: Anomalies are identified by having shorter average
830
+ path lengths across a forest of random trees.
831
+ - Temporal Context: Uses `add_eval_period_highlight` to shade the recursive
832
+ testing window, helping users see if anomalies are recent.
833
+ - Model Markers: Highlights specific Isolation Forest outliers using
834
+ 'mediumorchid' markers.
835
+
836
+ Returns:
837
+ None: Displays an interactive Plotly figure.
838
+ """
525
839
  # Isolation Forest Model Plot
526
840
  try:
527
841
  group = group.copy()
@@ -531,8 +845,8 @@ def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, dat
531
845
  fig = add_eval_period_highlight(fig, group, date_column, variable, eval_period)
532
846
  # Isolation Forest Anomalies
533
847
  fig.add_trace(go.Scatter(
534
- x=group[group['is_IsolationForest_anomaly_timeseries'] == True][date_column],
535
- y=group[group['is_IsolationForest_anomaly_timeseries'] == True][variable],
848
+ x=group[group['is_IsolationForest_anomaly'] == True][date_column],
849
+ y=group[group['is_IsolationForest_anomaly'] == True][variable],
536
850
  mode='markers',
537
851
  marker=dict(color='mediumorchid', symbol='circle', line=dict(width=1), size=7),
538
852
  name='Isolation Forest Anomalies',
@@ -544,3 +858,295 @@ def anomaly_isolation_forest_timeseries_plot(group, group_columns, variable, dat
544
858
  print("\n")
545
859
  except Exception as e:
546
860
  print(f"Isolation Forest Time Series Anomaly Plot Failed: {e}")
861
+
862
+
863
def anomaly_stacked_bar_plot(df, group_columns, variable, date_column, anomaly_col='is_Anomaly', secondary_line=None):
    """
    Plot per-date counts of normal vs. anomalous records as stacked bars.

    Rows of ``df`` are grouped by ``date_column``. For each date, the number of
    rows flagged in ``anomaly_col`` (crimson) is stacked on top of the number
    of rows that are not flagged (light gray). A line on a secondary Y-axis
    shows either the per-date mean of ``variable`` or the per-date mean of the
    'Anomaly_Score' column.

    Args:
        df (pd.DataFrame): Data containing ``date_column``, ``variable``,
            ``anomaly_col`` and an 'Anomaly_Score' column. It is not modified.
        group_columns (list): Columns identifying a group; used only to report
            the number of distinct groups in the chart title.
        variable (str): Numeric column averaged per date for the secondary line.
        date_column (str): Column the records are aggregated by (X-axis).
        anomaly_col (str, optional): Column whose values equal to True mark a
            row as anomalous. Defaults to 'is_Anomaly'.
        secondary_line (str, optional): If None or equal to ``variable``, the
            line shows the mean of ``variable``; any other value selects the
            mean 'Anomaly_Score'. Defaults to None.

    Returns:
        None: Displays the interactive Plotly figure. Any exception is caught
        and reported with a printed message instead of being raised.
    """
    try:
        # 1. Aggregation
        # Elementwise comparison on purpose: anything that is not exactly True
        # (False, NaN, ...) counts as a normal record.
        flagged = df[anomaly_col] == True  # noqa: E712

        # `assign` returns a new frame, so the helper columns never leak into
        # the caller's dataframe.
        counted = df.assign(
            normal_val=np.where(flagged, 0, 1),
            anomaly_val=np.where(flagged, 1, 0),
        )

        # Group by date to get counts across all groups for each timestamp.
        agg_df = counted.groupby(date_column).agg(
            normal_sum=('normal_val', 'sum'),
            anomaly_sum=('anomaly_val', 'sum'),
            variable_mean=(variable, 'mean'),
            score_mean=('Anomaly_Score', 'mean'),
        ).reset_index()

        agg_df['total_sum'] = agg_df['normal_sum'] + agg_df['anomaly_sum']

        # Percentage of anomalous records (guarding against division by zero).
        agg_df['anomaly_pct'] = np.where(
            agg_df['total_sum'] > 0,
            (agg_df['anomaly_sum'] / agg_df['total_sum']) * 100,
            0,
        )

        # X-axis padding: one period on each side so the first and last bars
        # are not cut in half by the axis limits.
        dates = agg_df[date_column].sort_values()
        if len(dates) > 1:
            # The most common gap between consecutive dates is the period.
            period = dates.diff().mode().iloc[0]
        else:
            period = pd.Timedelta(days=1)
        range_min = dates.min() - period
        range_max = dates.max() + period

        # 2. Choose what the secondary line shows
        if secondary_line is None or secondary_line == variable:
            line_var = 'variable_mean'
            var_title = f"Avg {variable if variable == variable.upper() else variable.replace('_', ' ').title()}"
        else:
            line_var = 'score_mean'
            var_title = 'Avg Anomaly Score'

        # Secondary-axis limits with 10% headroom. Including 0 on both sides
        # keeps the range valid when the plotted mean is negative; for
        # non-negative data this is exactly [0, max * 1.1].
        y2_range = [
            min(0, agg_df[line_var].min() * 1.1),
            max(0, agg_df[line_var].max() * 1.1),
        ]

        group_count = len(df[group_columns].drop_duplicates())

        # 3. Build the figure and add traces
        fig = go.Figure()

        # Bottom bar: non-anomalous records (gray)
        fig.add_trace(go.Bar(
            x=agg_df[date_column],
            y=agg_df['normal_sum'],
            name='Normal',
            marker_color='lightgray',
            customdata=agg_df[['total_sum']],
            hovertemplate=(
                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
                '<b>Normal Records:</b> %{y:,}<br>'
                '<b>Total Volume:</b> %{customdata[0]:,}<extra></extra>'
            )
        ))

        # Top bar: anomalous records (red)
        fig.add_trace(go.Bar(
            x=agg_df[date_column],
            y=agg_df['anomaly_sum'],
            name='Anomaly',
            marker_color='crimson',
            customdata=agg_df[['total_sum', 'anomaly_pct']],
            hovertemplate=(
                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
                '<b>Anomalies:</b> %{y:,}<br>'
                '<b>Anomaly Rate:</b> %{customdata[1]:.0f}%<extra></extra>'
            )
        ))

        # Line on the secondary axis. The hover label reuses the trace title:
        # the plotted value is a mean (or the mean anomaly score), not a total.
        fig.add_trace(go.Scatter(
            x=agg_df[date_column],
            y=agg_df[line_var],
            name=var_title,
            yaxis='y2',
            mode='lines',
            line=dict(width=3, color='darkslategray'),
            hovertemplate=(
                '<b>Date:</b> %{x|%Y-%m-%d}<br>'
                f'<b>{var_title}:</b> %{{y:,.2f}}<extra></extra>'
            )
        ))

        # 4. Apply the visual design shared with the other plots
        fig.update_layout(
            title=dict(
                text=f'Anomalies and {var_title} per Group Over Time for {group_count} Groups',
                y=0.96,
                x=0.5,
                xanchor='center',
                yanchor='top',
                font=dict(size=18, color='black', weight='bold'),
            ),
            barmode='stack',
            height=350,
            width=1200,
            margin=dict(l=50, r=100, t=60, b=30),
            plot_bgcolor='snow',
            paper_bgcolor='whitesmoke',
            xaxis=dict(
                range=[range_min, range_max],
                showline=True,
                linewidth=0.5,
                linecolor='orange',
                zeroline=False,
                gridcolor='rgba(255, 165, 0, 0.1)',
                mirror=True,
            ),
            yaxis=dict(
                # From zero up to the tallest stacked bar (no extra headroom).
                range=[0, agg_df['total_sum'].max()],
                showline=True,
                linewidth=0.5,
                linecolor='orange',
                zeroline=False,
                gridcolor='rgba(255, 165, 0, 0.1)',
                mirror=True,
                title=dict(text="Group Count", font=dict(size=16, weight='bold', color='black')),
            ),
            yaxis2=dict(
                title=dict(text=var_title, font=dict(size=14, weight='bold', color='darkslategray')),
                tickfont=dict(color='darkslategray'),
                anchor="x",
                overlaying="y",
                side="right",
                showgrid=False,  # a second grid would clutter the bars
                zeroline=False,
                range=y2_range,
            ),
            legend=dict(
                orientation="v",
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.08,
            )
        )

        fig.show()
        print("\n")
    except Exception as e:
        print(f"Stacked Bar Plot Failed: {e}")
1020
+
1021
+
1022
def summary_pie_plot(summary_df, title="Anomaly Detection Summary"):
    """
    Draw a pie chart of record counts per category using the project styling.

    Args:
        summary_df (pd.DataFrame): One row per slice, with a 'Records' column
            holding the slice labels and a 'count' column holding the slice
            sizes. Slices are drawn in row order (sorting is disabled); the
            slice colours are taken, in that order, from silver, crimson, gold.
        title (str, optional): Chart title. Defaults to
            "Anomaly Detection Summary".

    Returns:
        None: Displays the interactive Plotly figure. Any exception is caught
        and reported with a printed message instead of being raised.
    """
    try:
        # 1. Slice colours, applied in row order
        colors = ['silver', 'crimson', 'gold']

        # 2. Initialize figure
        fig = go.Figure()

        # 3. Add the pie trace
        fig.add_trace(go.Pie(
            labels=summary_df['Records'],
            values=summary_df['count'],
            marker=dict(
                colors=colors,
                line=dict(color='white', width=2)
            ),
            textposition='auto',
            # Label plus whole-number percentage on each slice.
            texttemplate='%{label}<br>%{percent:.0%}',
            hoverinfo='label+value+percent',
            # Keep the caller's row order so colours stay with their rows.
            sort=False
        ))

        # 4. Apply the visual design shared with the other plots
        fig.update_layout(
            title=dict(
                text=title,
                y=0.96,
                x=0.5,
                xanchor='center',
                yanchor='top',
                font=dict(size=18, color='black', weight='bold'),
            ),
            height=400,
            width=600,
            margin=dict(l=50, r=50, t=80, b=30),
            plot_bgcolor='snow',
            paper_bgcolor='whitesmoke',
            legend=dict(
                orientation="v",
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02,
            )
        )

        fig.show()
        print("\n")

    except Exception as e:
        print(f"Summary Pie Plot Failed: {e}")
1084
+
1085
+
1086
def avg_anomaly_score_plot(df, group_columns, date_column):
    """
    Plot the mean 'Anomaly_Score' per date as a line chart.

    Args:
        df (pd.DataFrame): Data containing ``date_column``, the columns in
            ``group_columns`` and an 'Anomaly_Score' column.
        group_columns (list): Columns identifying a group; used only to report
            the number of distinct groups in the chart title.
        date_column (str): Column the scores are averaged by (X-axis).

    Returns:
        None: Displays the interactive Plotly figure. Any exception is caught
        and reported with a printed message instead of being raised.
    """
    try:
        group_count = len(df[group_columns].drop_duplicates())
        plot_title = f"Average Anomaly Scores Over Time for {group_count} Groups"

        # One row per date holding the mean score across all groups.
        agg_df = df.groupby(date_column)['Anomaly_Score'].mean().reset_index()

        # Y-axis limits: pad outward by a fraction of each bound's magnitude.
        # For positive bounds this equals min * 0.9 and max * 1.06; for
        # negative bounds it still widens the range instead of clipping the
        # line, which a plain multiplication would do.
        score_min = agg_df['Anomaly_Score'].min()
        score_max = agg_df['Anomaly_Score'].max()
        y_low = score_min - 0.10 * abs(score_min)
        y_high = score_max + 0.06 * abs(score_max)

        fig = go.Figure()

        # Average anomaly scores
        fig.add_trace(go.Scatter(
            x=agg_df[date_column],
            y=agg_df['Anomaly_Score'],
            mode='lines',
            line=dict(color='seagreen', width=1.5),
            name='Average Anomaly Score',
        ))

        fig.update_layout(
            title=dict(
                text=plot_title,
                y=0.96,
                x=0.5,
                xanchor='center',
                yanchor='top',
                font=dict(size=18, color='black', weight='bold'),
            ),
            height=350,
            width=1200,
            margin=dict(l=50, r=50, t=40, b=30),
            plot_bgcolor='snow',
            paper_bgcolor='whitesmoke',
            xaxis=dict(
                range=[agg_df[date_column].min(), agg_df[date_column].max()],
                showline=True,
                linewidth=0.5,
                linecolor='orange',
                zeroline=False,
                gridcolor='rgba(255, 165, 0, 0.1)',
                mirror=True
            ),
            yaxis=dict(
                range=[y_low, y_high],
                showline=True,
                linewidth=0.5,
                linecolor='orange',
                zeroline=False,
                gridcolor='rgba(255, 165, 0, 0.1)',
                mirror=True
            ),
            yaxis_title=dict(
                text='Average Anomaly Score',
                font=dict(size=16, weight='bold', color='black')
            ),
            legend=dict(
                orientation="v",
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02,
            )
        )

        fig.show()
        print("\n")
    except Exception as e:
        print(f"Anomaly Score Plot Failed: {e}")